Update spaCy for thinc 8.0.0 (#4920)

* Add load_from_config function

* Add train_from_config script

* Merge configs and expose via spacy.config

* Fix script

* Suggest create_evaluation_callback

* Hard-code for NER

* Fix errors

* Register command

* Add TODO

* Update train-from-config todos

* Fix imports

* Allow delayed setting of parser model nr_class

* Get train-from-config working

* Tidy up and fix scores and printing

* Hide traceback if cancelled

* Fix weighted score formatting

* Fix score formatting

* Make output_path optional

* Add Tok2Vec component

* Tidy up and add tok2vec_tensors

* Add option to copy docs in nlp.update

* Copy docs in nlp.update

* Adjust nlp.update() for set_annotations

* Don't shuffle pipes in nlp.update, decruft

* Support set_annotations arg in component update

* Support set_annotations in parser update

* Add get_gradients method

* Add get_gradients to parser

* Update errors.py

* Fix problems caused by merge

* Add _link_components method in nlp

* Add concept of 'listeners' and ControlledModel

* Support optional attributes arg in ControlledModel

* Try having tok2vec component in pipeline

* Fix tok2vec component

* Fix config

* Fix tok2vec

* Update for Example

* Update for Example

* Update config

* Add eg2doc util

* Update and add schemas/types

* Update schemas

* Fix nlp.update

* Fix tagger

* Remove hacks from train-from-config

* Remove hard-coded config str

* Calculate loss in tok2vec component

* Tidy up and use function signatures instead of models

* Support union types for registry models

* Minor cleaning in Language.update

* Make ControlledModel specifically Tok2VecListener

* Fix train_from_config

* Fix tok2vec

* Tidy up

* Add function for bilstm tok2vec

* Fix type

* Fix syntax

* Fix pytorch optimizer

* Add example configs

* Update for thinc describe changes

* Update for Thinc changes

* Update for dropout/sgd changes

* Update for dropout/sgd changes

* Unhack gradient update

* Work on refactoring _ml

* Remove _ml.py module

* WIP upgrade cli scripts for thinc

* Move some _ml stuff to util

* Import link_vectors from util

* Update train_from_config

* Import from util

* Import from util

* Temporarily add ml.component_models module

* Move ml methods

* Move typedefs

* Update load vectors

* Update gitignore

* Move imports

* Add PrecomputableAffine

* Fix imports

* Fix imports

* Fix imports

* Fix missing imports

* Update CLI scripts

* Update spacy.language

* Add stubs for building the models

* Update model definition

* Update create_default_optimizer

* Fix import

* Fix comment

* Update imports in tests

* Update imports in spacy.cli

* Fix import

* fix obsolete thinc imports

* update srsly pin

* from thinc to ml_datasets for example data such as imdb

* update ml_datasets pin

* using STATE.vectors

* small fix

* fix Sentencizer.pipe

* black formatting

* rename Affine to Linear as in thinc

* set validate explicitly to True

* rename with_square_sequences to with_list2padded

* rename with_flatten to with_list2array

* chaining layernorm

* small fixes

* revert Optimizer import

* build_nel_encoder with new thinc style

* fixes using model's get and set methods

* Tok2Vec in component models, various fixes

* fix up legacy tok2vec code

* add model initialize calls

* add in build_tagger_model

* small fixes

* setting model dims

* fixes for ParserModel

* various small fixes

* initialize thinc Models

* fixes

* consistent naming of window_size

* fixes, removing set_dropout

* work around Iterable issue

* remove legacy tok2vec

* util fix

* fix forward function of tok2vec listener

* more fixes

* trying to fix PrecomputableAffine (not successful yet)

* alloc instead of allocate

* add morphologizer

* rename residual

* rename fixes

* Fix predict function

* Update parser and parser model

* fixing few more tests

* Fix precomputable affine

* Update component model

* Update parser model

* Move backprop padding to own function, for test

* Update test

* Fix p. affine

* Update NEL

* build_bow_text_classifier and extract_ngrams

* Fix parser init

* Fix test add label

* add build_simple_cnn_text_classifier

* Fix parser init

* Set gpu off by default in example

* Fix tok2vec listener

* Fix parser model

* Small fixes

* small fix for PyTorchLSTM parameters

* revert my_compounding hack (iterable fixed now)

* fix biLSTM

* Fix uniqued

* PyTorchRNNWrapper fix

* small fixes

* use helper function to calculate cosine loss

* small fixes for build_simple_cnn_text_classifier

* putting dropout default at 0.0 to ensure the layer gets built

* using thinc util's set_dropout_rate

* moving layer normalization inside of maxout definition to optimize dropout

* temp debugging in NEL

* fixed NEL model by using init defaults!

* fixing after set_dropout_rate refactor

* proper fix

* fix test_update_doc after refactoring optimizers in thinc

* Add CharacterEmbed layer

* Construct tagger Model

* Add missing import

* Remove unused stuff

* Work on textcat

* fix test (again :)) after optimizer refactor

* fixes to allow reading Tagger from_disk without overwriting dimensions

* don't build the tok2vec prematurely

* fix CharacterEmbed init

* CharacterEmbed fixes

* Fix CharacterEmbed architecture

* fix imports

* renames from latest thinc update

* one more rename

* add initialize calls where appropriate

* fix parser initialization

* Update Thinc version

* Fix errors, auto-format and tidy up imports

* Fix validation

* fix if bias is cupy array

* revert for now

* ensure it's a numpy array before running bp in ParserStepModel

* no reason to call require_gpu twice

* use CupyOps.to_numpy instead of cupy directly

* fix initialize of ParserModel

* remove unnecessary import

* fixes for CosineDistance

* fix device renaming

* use refactored loss functions (Thinc PR 251)

* overfitting test for tagger

* experimental settings for the tagger: avoid zero-init and subword normalization

* clean up tagger overfitting test

* use previous default value for nP

* remove toy config

* bringing layernorm back (had a bug - fixed in thinc)

* revert setting nP explicitly

* remove setting default in constructor

* restore values as they used to be

* add overfitting test for NER

* add overfitting test for dep parser

* add overfitting test for textcat

* fixing init for linear (previously affine)

* larger eps window for textcat

* ensure doc is not None

* Require newer thinc

* Make float check vaguer

* Slop the textcat overfit test more

* Fix textcat test

* Fix exclusive classes for textcat

* fix after renaming of alloc methods

* fixing renames and mandatory arguments (staticvectors WIP)

* upgrade to thinc==8.0.0.dev3

* refer to vocab.vectors directly instead of its name

* rename alpha to learn_rate

* adding hashembed and staticvectors dropout

* upgrade to thinc 8.0.0.dev4

* add name back to avoid warning W020

* thinc dev4

* update srsly

* using thinc 8.0.0a0!

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>
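
For orientation, most of the renames in the list above map the old thinc 7.x API onto the thinc 8 names that appear in the diffs below. A minimal, non-authoritative sketch of the correspondence, using only imports and calls taken from the changed files (old names are shown in comments; the toy dimensions are arbitrary):

    # thinc 7.x -> thinc 8 renames touched by this PR (illustrative only)
    from thinc.model import Model                                # was thinc.v2v.Model
    from thinc.layers import Linear, Maxout, chain, list2array   # Affine -> Linear, with_flatten -> with_list2array/list2array
    from thinc.loss import CosineDistance, L2Distance            # replace spaCy's get_cossim_loss helper
    from thinc.util import prefer_gpu, require_gpu               # was thinc.neural.util
    from thinc.backends import use_ops                           # replaces Model.use_device / Model.ops tricks

    # Layers are combined and initialized explicitly in the new style:
    model = chain(Linear(nO=8, nI=4), Linear(nO=2))              # Affine(nO, nI) is now Linear(nO, nI)
    model.initialize(X=model.ops.alloc((5, 4), dtype="f"))       # ops.allocate() is now ops.alloc()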
Authored by Sofie Van Landeghem on 2020-01-29 17:06:46 +01:00, committed by GitHub
parent 06b251dd1e
commit 569cc98982
70 changed files with 2141 additions and 1675 deletions

.gitignore

@ -39,6 +39,7 @@ __pycache__/
.env*
.~env/
.venv
env3.6/
venv/
.dev
.denv
@ -111,3 +112,6 @@ Desktop.ini
# Pycharm project files
*.idea
# IPython
.ipynb_checkpoints/


@ -4,12 +4,12 @@ from random import shuffle
import logging
import numpy as np
from spacy._ml import zero_init, create_default_optimizer
from spacy.cli.pretrain import get_cossim_loss
from thinc.v2v import Model
from thinc.model import Model
from thinc.api import chain
from thinc.neural._classes.affine import Affine
from thinc.loss import CosineDistance
from thinc.layers import Linear
from spacy.util import create_default_optimizer
logger = logging.getLogger(__name__)
@ -34,6 +34,7 @@ class EntityEncoder:
self.input_dim = input_dim
self.desc_width = desc_width
self.epochs = epochs
self.distance = CosineDistance(ignore_zeros=True, normalize=False)
def apply_encoder(self, description_list):
if self.encoder is None:
@ -132,21 +133,17 @@ class EntityEncoder:
def _build_network(self, orig_width, hidden_with):
with Model.define_operators({">>": chain}):
# very simple encoder-decoder model
self.encoder = Affine(hidden_with, orig_width)
self.model = self.encoder >> zero_init(
Affine(orig_width, hidden_with, drop_factor=0.0)
)
self.sgd = create_default_optimizer(self.model.ops)
self.encoder = Linear(hidden_with, orig_width)
# TODO: removed the zero_init here - is oK?
self.model = self.encoder >> Linear(orig_width, hidden_with)
self.sgd = create_default_optimizer()
def _update(self, vectors):
truths = self.model.ops.asarray(vectors)
predictions, bp_model = self.model.begin_update(
np.asarray(vectors), drop=self.DROP
truths, drop=self.DROP
)
loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors))
d_scores, loss = self.distance(predictions, truths)
bp_model(d_scores, sgd=self.sgd)
return loss / len(vectors)
@staticmethod
def _get_loss(golds, scores):
loss, gradients = get_cossim_loss(scores, golds)
return loss, gradients
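
The pattern above, instantiating a thinc loss object once and then calling it to get both the gradient and the scalar loss, replaces the removed _get_loss/get_cossim_loss helpers. A self-contained, hedged sketch of the same call on toy arrays (the numbers are made up):

    import numpy
    from thinc.loss import CosineDistance

    distance = CosineDistance(ignore_zeros=True, normalize=False)
    predictions = numpy.asarray([[0.1, 0.9], [0.8, 0.2]], dtype="f")
    truths = numpy.asarray([[0.0, 1.0], [1.0, 0.0]], dtype="f")
    d_scores, loss = distance(predictions, truths)  # gradient first, scalar loss second
    # d_scores is what gets passed to the model's backprop callback; loss is only for reporting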


@ -103,7 +103,7 @@ def main(
logger.info("STEP 3: Creating and training an Entity Linking pipe")
el_pipe = nlp.create_pipe(
name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name,
name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors,
"labels_discard": labels_discard}
)
el_pipe.set_kb(kb)


@ -14,7 +14,7 @@ pip install keras==2.0.9
Compatible with: spaCy v2.0.0+
"""
import ml_datasets
import plac
import random
import pathlib
@ -24,7 +24,6 @@ from keras.models import Sequential, model_from_json
from keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.layers import TimeDistributed
from keras.optimizers import Adam
import thinc.extra.datasets
from spacy.compat import pickle
import spacy
@ -224,7 +223,7 @@ def main(
if model_dir is not None:
model_dir = pathlib.Path(model_dir)
if train_dir is None or dev_dir is None:
imdb_data = thinc.extra.datasets.imdb()
imdb_data = ml_datasets.imdb()
if is_runtime:
if dev_dir is None:
dev_texts, dev_labels = zip(*imdb_data[1])


@ -0,0 +1,63 @@
[training]
patience = 10000
eval_frequency = 200
dropout = 0.2
init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
use_gpu = 0
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = ${training:vectors}
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
[nlp.pipeline.tagger.model]
@architectures = "tagger_model.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "tok2vec_tensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
@architectures = "transition_based_parser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.parser.model.tok2vec]
@architectures = "tok2vec_tensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "hash_embed_bilstm.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
embed_size = 2000


@ -0,0 +1,65 @@
[training]
patience = 10000
eval_frequency = 200
dropout = 0.2
init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
use_gpu = -1
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = ${training:vectors}
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
[nlp.pipeline.tagger.model]
@architectures = "tagger_model.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "tok2vec_tensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
@architectures = "transition_based_parser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.parser.model.tok2vec]
@architectures = "tok2vec_tensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "hash_embed_cnn.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
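
Configs like the two above are what the new spacy.util.load_from_config helper (added in this PR) consumes; the train-from-config command calls it with create_objects=True so that registered functions such as "hash_embed_cnn.v1" are resolved into actual thinc models. A hedged usage sketch, with the config path as a placeholder:

    from spacy import util

    # Resolve the [training], [optimizer] and [nlp] sections into live objects,
    # mirroring the call in the new train_from_config CLI further down.
    # The path is a placeholder.
    config = util.load_from_config("path/to/config.cfg", create_objects=True)
    optimizer = config["optimizer"]
    use_gpu = config["training"]["use_gpu"]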


@ -13,9 +13,10 @@ Prerequisites: pip install joblib
from __future__ import print_function, unicode_literals
from pathlib import Path
import ml_datasets
from joblib import Parallel, delayed
from functools import partial
import thinc.extra.datasets
import plac
import spacy
from spacy.util import minibatch
@ -35,7 +36,7 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
output_dir.mkdir()
# load and pre-process the IMBD dataset
print("Loading IMDB data...")
data, _ = thinc.extra.datasets.imdb()
data, _ = ml_datasets.imdb()
texts, _ = zip(*data[-limit:])
print("Processing texts...")
partitions = minibatch(texts, size=batch_size)


@ -16,16 +16,18 @@ the development labels, after all --- only the unlabelled text.
import plac
import tqdm
import random
import ml_datasets
import spacy
import thinc.extra.datasets
from spacy.util import minibatch, use_gpu, compounding
from spacy._ml import Tok2Vec
from spacy.pipeline import TextCategorizer
from spacy.ml.tok2vec import Tok2Vec
import numpy
def load_texts(limit=0):
train, dev = thinc.extra.datasets.imdb()
train, dev = ml_datasets.imdb()
train_texts, train_labels = zip(*train)
dev_texts, dev_labels = zip(*train)
train_texts = list(train_texts)
@ -41,7 +43,7 @@ def load_texts(limit=0):
def load_textcat_data(limit=0):
"""Load data from the IMDB dataset."""
# Partition off part of the train data for evaluation
train_data, eval_data = thinc.extra.datasets.imdb()
train_data, eval_data = ml_datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
@ -63,17 +65,15 @@ def prefer_gpu():
def build_textcat_model(tok2vec, nr_class, width):
from thinc.v2v import Model, Softmax, Maxout
from thinc.api import flatten_add_lengths, chain
from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
from thinc.misc import Residual, LayerNorm
from spacy._ml import logistic, zero_init
from thinc.model import Model
from thinc.layers import Softmax, chain, reduce_mean
from thinc.layers import list2ragged
with Model.define_operators({">>": chain}):
model = (
tok2vec
>> flatten_add_lengths
>> Pooling(mean_pool)
>> list2ragged()
>> reduce_mean()
>> Softmax(nr_class, width)
)
model.tok2vec = tok2vec
@ -81,7 +81,7 @@ def build_textcat_model(tok2vec, nr_class, width):
def block_gradients(model):
from thinc.api import wrap
from thinc.api import wrap # TODO FIX
def forward(X, drop=0.0):
Y, _ = model.begin_update(X, drop=drop)


@ -58,7 +58,7 @@ def main(model_name, unlabelled_loc):
# yet, but I'm getting weird results from Adam. Try commenting out the
# nlp.update(), and using Adam -- you'll find the models drift apart.
# I guess Adam is losing precision, introducing gradient noise?
optimizer.alpha = 0.1
optimizer.learn_rate = 0.1
optimizer.b1 = 0.0
optimizer.b2 = 0.0
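
The rename is mechanical: the setting previously tweaked as optimizer.alpha is exposed as learn_rate on the thinc 8 optimizer, while b1 and b2 keep their names. A minimal sketch of the same adjustment, assuming the optimizer comes from nlp.begin_training() as in this example script:

    import spacy

    nlp = spacy.blank("en")
    optimizer = nlp.begin_training()
    optimizer.learn_rate = 0.1  # was optimizer.alpha in thinc 7
    optimizer.b1 = 0.0          # the momentum terms keep their old names
    optimizer.b2 = 0.0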


@ -17,7 +17,7 @@ import plac
import random
from pathlib import Path
from spacy.symbols import PERSON
import srsly
from spacy.vocab import Vocab
import spacy
@ -68,7 +68,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
vocab = Vocab().from_disk(vocab_path)
# create blank Language class with correct vocab
nlp = spacy.blank("en", vocab=vocab)
nlp.vocab.vectors.name = "spacy_pretrained_vectors"
nlp.vocab.vectors.name = "nel_vectors"
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
# Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
@ -93,7 +93,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
nlp.add_pipe(entity_linker, last=True)
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
# Also ensure that the annotated examples correspond to known identifiers in the knowlege base.
# Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
TRAIN_DOCS = []
for text, annotation in TRAIN_DATA:
@ -117,6 +117,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
with nlp.disable_pipes(*other_pipes): # only train entity linker
# reset and initialize the weights randomly
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DOCS)
losses = {}


@ -10,10 +10,11 @@ see the documentation:
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function
import ml_datasets
import plac
import random
from pathlib import Path
import thinc.extra.datasets
import spacy
from spacy.util import minibatch, compounding
@ -115,7 +116,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
def load_data(limit=0, split=0.8):
"""Load data from the IMDB dataset."""
# Partition off part of the train data for evaluation
train_data, _ = thinc.extra.datasets.imdb()
train_data, _ = ml_datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)


@ -1,17 +1,20 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc==7.4.0.dev0
thinc==8.0.0a0
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0
srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
pydantic>=1.0.0,<2.0.0
# Development dependencies
cython>=0.25


@ -35,16 +35,16 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc==7.4.0.dev0
thinc==8.0.0a0
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc==7.4.0.dev0
thinc==8.0.0a0
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0
srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
# Third-party dependencies
setuptools


@ -5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# These are imported as part of the API
from thinc.neural.util import prefer_gpu, require_gpu
from thinc.util import prefer_gpu, require_gpu
from . import pipeline
from .cli.info import info as cli_info
@ -21,6 +21,9 @@ if sys.maxunicode == 65535:
raise SystemError(Errors.E130)
config = registry
def load(name, **overrides):
depr_path = overrides.get("path")
if depr_path not in (True, False, None):


@ -4,12 +4,14 @@ if __name__ == "__main__":
from wasabi import msg
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
from spacy.cli import train_from_config_cli
commands = {
"download": download,
"link": link,
"info": info,
"train": train,
"train-from-config": train_from_config_cli,
"pretrain": pretrain,
"debug-data": debug_data,
"evaluate": evaluate,


@ -1,982 +0,0 @@
import numpy
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, sum_pool, mean_pool
from thinc.i2v import HashEmbed
from thinc.misc import Residual, FeatureExtracter
from thinc.misc import LayerNorm as LN
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module, copy_array
from thinc.neural.optimizers import Adam
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
import thinc.extra.load_nlp
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
from .errors import Errors, user_warning, Warnings
from . import util
from . import ml as new_ml
from .ml import _legacy_tok2vec
VECTORS_KEY = "spacy_pretrained_vectors"
# Backwards compatibility with <2.2.2
USE_MODEL_REGISTRY_TOK2VEC = False
def cosine(vec1, vec2):
xp = get_array_module(vec1)
norm1 = xp.linalg.norm(vec1)
norm2 = xp.linalg.norm(vec2)
if norm1 == 0.0 or norm2 == 0.0:
return 0
else:
return vec1.dot(vec2) / (norm1 * norm2)
def create_default_optimizer(ops, **cfg):
learn_rate = util.env_opt("learn_rate", 0.001)
beta1 = util.env_opt("optimizer_B1", 0.9)
beta2 = util.env_opt("optimizer_B2", 0.999)
eps = util.env_opt("optimizer_eps", 1e-8)
L2 = util.env_opt("L2_penalty", 1e-6)
max_grad_norm = util.env_opt("grad_norm_clip", 1.0)
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm
optimizer.device = ops.device
return optimizer
@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.0):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=pad)
X = ops.flatten(seqs, pad=pad)
return (X, lengths), finish_update
def _zero_init(model):
def _zero_init_impl(self, *args, **kwargs):
self.W.fill(0)
model.on_init_hooks.append(_zero_init_impl)
if model.W is not None:
model.W.fill(0.0)
return model
def with_cpu(ops, model):
"""Wrap a model that should run on CPU, transferring inputs and outputs
as necessary."""
model.to_cpu()
def with_cpu_forward(inputs, drop=0.0):
cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
gpu_outputs = _to_device(ops, cpu_outputs)
def with_cpu_backprop(d_outputs, sgd=None):
cpu_d_outputs = _to_cpu(d_outputs)
return backprop(cpu_d_outputs, sgd=sgd)
return gpu_outputs, with_cpu_backprop
return wrap(with_cpu_forward, model)
def _to_cpu(X):
if isinstance(X, numpy.ndarray):
return X
elif isinstance(X, tuple):
return tuple([_to_cpu(x) for x in X])
elif isinstance(X, list):
return [_to_cpu(x) for x in X]
elif hasattr(X, "get"):
return X.get()
else:
return X
def _to_device(ops, X):
if isinstance(X, tuple):
return tuple([_to_device(ops, x) for x in X])
elif isinstance(X, list):
return [_to_device(ops, x) for x in X]
else:
return ops.asarray(X)
class extract_ngrams(Model):
def __init__(self, ngram_size, attr=LOWER):
Model.__init__(self)
self.ngram_size = ngram_size
self.attr = attr
def begin_update(self, docs, drop=0.0):
batch_keys = []
batch_vals = []
for doc in docs:
unigrams = doc.to_array([self.attr])
ngrams = [unigrams]
for n in range(2, self.ngram_size + 1):
ngrams.append(self.ops.ngrams(n, unigrams))
keys = self.ops.xp.concatenate(ngrams)
keys, vals = self.ops.xp.unique(keys, return_counts=True)
batch_keys.append(keys)
batch_vals.append(vals)
# The dtype here matches what thinc is expecting -- which differs per
# platform (by int definition). This should be fixed once the problem
# is fixed on Thinc's side.
lengths = self.ops.asarray(
[arr.shape[0] for arr in batch_keys], dtype=numpy.int_
)
batch_keys = self.ops.xp.concatenate(batch_keys)
batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
return (batch_keys, batch_vals, lengths), None
@describe.on_data(
_set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
)
@describe.attributes(
nI=Dimension("Input size"),
nF=Dimension("Number of features"),
nO=Dimension("Output size"),
nP=Dimension("Maxout pieces"),
W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
pad=Synapses(
"Pad",
lambda obj: (1, obj.nF, obj.nO, obj.nP),
lambda M, ops: ops.normal_init(M, 1.0),
),
d_W=Gradient("W"),
d_pad=Gradient("pad"),
d_b=Gradient("b"),
)
class PrecomputableAffine(Model):
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
Model.__init__(self, **kwargs)
self.nO = nO
self.nP = nP
self.nI = nI
self.nF = nF
def begin_update(self, X, drop=0.0):
Yf = self.ops.gemm(
X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True
)
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
Yf = self._add_padding(Yf)
def backward(dY_ids, sgd=None):
dY, ids = dY_ids
dY, ids = self._backprop_padding(dY, ids)
Xf = X[ids]
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
self.d_b += dY.sum(axis=0)
dY = dY.reshape((dY.shape[0], self.nO * self.nP))
Wopfi = self.W.transpose((1, 2, 0, 3))
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI))
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi)
# Reuse the buffer
dWopfi = Wopfi
dWopfi.fill(0.0)
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
# (o, p, f, i) --> (f, o, p, i)
self.d_W += dWopfi.transpose((2, 0, 1, 3))
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
return Yf, backward
def _add_padding(self, Yf):
Yf_padded = self.ops.xp.vstack((self.pad, Yf))
return Yf_padded
def _backprop_padding(self, dY, ids):
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
mask = ids < 0.0
mask = mask.sum(axis=1)
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
self.d_pad += d_pad.sum(axis=0)
return dY, ids
@staticmethod
def init_weights(model):
"""This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
"""
if (model.W ** 2).sum() != 0.0:
return
ops = model.ops
xp = ops.xp
ops.normal_init(model.W, model.nF * model.nI, inplace=True)
ids = ops.allocate((5000, model.nF), dtype="f")
ids += xp.random.uniform(0, 1000, ids.shape)
ids = ops.asarray(ids, dtype="i")
tokvecs = ops.allocate((5000, model.nI), dtype="f")
tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
tokvecs.shape
)
def predict(ids, tokvecs):
# nS ids. nW tokvecs. Exclude the padding array.
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f")
# need nS vectors
hiddens = hiddens.reshape(
(hiddens.shape[0] * model.nF, model.nO * model.nP)
)
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
vectors += model.b
vectors = model.ops.asarray(vectors)
if model.nP >= 2:
return model.ops.maxout(vectors)[0]
else:
return vectors * (vectors >= 0)
tol_var = 0.01
tol_mean = 0.01
t_max = 10
t_i = 0
for t_i in range(t_max):
acts1 = predict(ids, tokvecs)
var = model.ops.xp.var(acts1)
mean = model.ops.xp.mean(acts1)
if abs(var - 1.0) >= tol_var:
model.W /= model.ops.xp.sqrt(var)
elif abs(mean) >= tol_mean:
model.b -= mean
else:
break
def link_vectors_to_models(vocab):
vectors = vocab.vectors
if vectors.name is None:
vectors.name = VECTORS_KEY
if vectors.data.size != 0:
user_warning(Warnings.W020.format(shape=vectors.data.shape))
ops = Model.ops
for word in vocab:
if word.orth in vectors.key2row:
word.rank = vectors.key2row[word.orth]
else:
word.rank = 0
data = ops.asarray(vectors.data)
# Set an entry here, so that vectors are accessed by StaticVectors
# (unideal, I know)
key = (ops.device, vectors.name)
if key in thinc.extra.load_nlp.VECTORS:
if thinc.extra.load_nlp.VECTORS[key].shape != data.shape:
# This is a hack to avoid the problem in #3853. Maybe we should
# print a warning as well?
old_name = vectors.name
new_name = f"{vectors.name}_{data.shape[0]}"
user_warning(Warnings.W019.format(old=old_name, new=new_name))
vectors.name = new_name
key = (ops.device, vectors.name)
thinc.extra.load_nlp.VECTORS[key] = data
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
import torch.nn
from thinc.api import with_square_sequences
from thinc.extra.wrappers import PyTorchWrapperRNN
if depth == 0:
return layerize(noop())
model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
return with_square_sequences(PyTorchWrapperRNN(model))
def Tok2Vec(width, embed_size, **kwargs):
if not USE_MODEL_REGISTRY_TOK2VEC:
# Preserve prior tok2vec for backwards compat, in v2.2.2
return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs)
pretrained_vectors = kwargs.get("pretrained_vectors", None)
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
subword_features = kwargs.get("subword_features", True)
char_embed = kwargs.get("char_embed", False)
conv_depth = kwargs.get("conv_depth", 4)
bilstm_depth = kwargs.get("bilstm_depth", 0)
conv_window = kwargs.get("conv_window", 1)
cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}
if char_embed:
embed_cfg = {
"arch": "spacy.CharacterEmbed.v1",
"config": {
"width": 64,
"chars": 6,
"@mix": {
"arch": "spacy.LayerNormalizedMaxout.v1",
"config": {"width": width, "pieces": 3},
},
"@embed_features": None,
},
}
else:
embed_cfg = {
"arch": "spacy.MultiHashEmbed.v1",
"config": {
"width": width,
"rows": embed_size,
"columns": cols,
"use_subwords": subword_features,
"@pretrained_vectors": None,
"@mix": {
"arch": "spacy.LayerNormalizedMaxout.v1",
"config": {"width": width, "pieces": 3},
},
},
}
if pretrained_vectors:
embed_cfg["config"]["@pretrained_vectors"] = {
"arch": "spacy.PretrainedVectors.v1",
"config": {
"vectors_name": pretrained_vectors,
"width": width,
"column": cols.index("ID"),
},
}
if cnn_maxout_pieces >= 2:
cnn_cfg = {
"arch": "spacy.MaxoutWindowEncoder.v1",
"config": {
"width": width,
"window_size": conv_window,
"pieces": cnn_maxout_pieces,
"depth": conv_depth,
},
}
else:
cnn_cfg = {
"arch": "spacy.MishWindowEncoder.v1",
"config": {"width": width, "window_size": conv_window, "depth": conv_depth},
}
bilstm_cfg = {
"arch": "spacy.TorchBiLSTMEncoder.v1",
"config": {"width": width, "depth": bilstm_depth},
}
if conv_depth == 0 and bilstm_depth == 0:
encode_cfg = {}
elif conv_depth >= 1 and bilstm_depth >= 1:
encode_cfg = {
"arch": "thinc.FeedForward.v1",
"config": {"children": [cnn_cfg, bilstm_cfg]},
}
elif conv_depth >= 1:
encode_cfg = cnn_cfg
else:
encode_cfg = bilstm_cfg
config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg}
return new_ml.Tok2Vec(config)
def reapply(layer, n_times):
def reapply_fwd(X, drop=0.0):
backprops = []
for i in range(n_times):
Y, backprop = layer.begin_update(X, drop=drop)
X = Y
backprops.append(backprop)
def reapply_bwd(dY, sgd=None):
dX = None
for backprop in reversed(backprops):
dY = backprop(dY, sgd=sgd)
if dX is None:
dX = dY
else:
dX += dY
return dX
return Y, reapply_bwd
return wrap(reapply_fwd, layer)
def asarray(ops, dtype):
def forward(X, drop=0.0):
return ops.asarray(X, dtype=dtype), None
return layerize(forward)
def _divide_array(X, size):
parts = []
index = 0
while index < len(X):
parts.append(X[index : index + size])
index += size
return parts
def get_col(idx):
if idx < 0:
raise IndexError(Errors.E066.format(value=idx))
def forward(X, drop=0.0):
if isinstance(X, numpy.ndarray):
ops = NumpyOps()
else:
ops = CupyOps()
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
def backward(y, sgd=None):
dX = ops.allocate(X.shape)
dX[:, idx] += y
return dX
return output, backward
return layerize(forward)
def doc2feats(cols=None):
if cols is None:
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
def forward(docs, drop=0.0):
feats = []
for doc in docs:
feats.append(doc.to_array(cols))
return feats, None
model = layerize(forward)
model.cols = cols
return model
def print_shape(prefix):
def forward(X, drop=0.0):
return X, lambda dX, **kwargs: dX
return layerize(forward)
@layerize
def get_token_vectors(tokens_attrs_vectors, drop=0.0):
tokens, attrs, vectors = tokens_attrs_vectors
def backward(d_output, sgd=None):
return (tokens, d_output)
return vectors, backward
@layerize
def logistic(X, drop=0.0):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10.0, X)
X = xp.maximum(X, -10.0, X)
Y = 1.0 / (1.0 + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1 - Y))
return dX
return Y, logistic_bwd
def zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
return model
def getitem(i):
def getitem_fwd(X, drop=0.0):
return X[i], None
return layerize(getitem_fwd)
@describe.attributes(
W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
)
class MultiSoftmax(Affine):
"""Neural network layer that predicts several multi-class attributes at once.
For instance, we might predict one class with 6 variables, and another with 5.
We predict the 11 neurons required for this, and then softmax them such
that columns 0-6 make a probability distribution and coumns 6-11 make another.
"""
name = "multisoftmax"
def __init__(self, out_sizes, nI=None, **kwargs):
Model.__init__(self, **kwargs)
self.out_sizes = out_sizes
self.nO = sum(out_sizes)
self.nI = nI
def predict(self, input__BI):
output__BO = self.ops.affine(self.W, self.b, input__BI)
i = 0
for out_size in self.out_sizes:
self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
i += out_size
return output__BO
def begin_update(self, input__BI, drop=0.0):
output__BO = self.predict(input__BI)
def finish_update(grad__BO, sgd=None):
self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
self.d_b += grad__BO.sum(axis=0)
grad__BI = self.ops.gemm(grad__BO, self.W)
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return grad__BI
return output__BO, finish_update
def build_tagger_model(nr_class, **cfg):
embed_size = util.env_opt("embed_size", 2000)
if "token_vector_width" in cfg:
token_vector_width = cfg["token_vector_width"]
else:
token_vector_width = util.env_opt("token_vector_width", 96)
pretrained_vectors = cfg.get("pretrained_vectors")
subword_features = cfg.get("subword_features", True)
with Model.define_operators({">>": chain, "+": add}):
if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"]
else:
tok2vec = Tok2Vec(
token_vector_width,
embed_size,
subword_features=subword_features,
pretrained_vectors=pretrained_vectors,
)
softmax = with_flatten(Softmax(nr_class, token_vector_width))
model = tok2vec >> softmax
model.nI = None
model.tok2vec = tok2vec
model.softmax = softmax
return model
def build_morphologizer_model(class_nums, **cfg):
embed_size = util.env_opt("embed_size", 7000)
if "token_vector_width" in cfg:
token_vector_width = cfg["token_vector_width"]
else:
token_vector_width = util.env_opt("token_vector_width", 128)
pretrained_vectors = cfg.get("pretrained_vectors")
char_embed = cfg.get("char_embed", True)
with Model.define_operators({">>": chain, "+": add, "**": clone}):
if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"]
else:
tok2vec = Tok2Vec(
token_vector_width,
embed_size,
char_embed=char_embed,
pretrained_vectors=pretrained_vectors,
)
softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
softmax.out_sizes = class_nums
model = tok2vec >> softmax
model.nI = None
model.tok2vec = tok2vec
model.softmax = softmax
return model
@layerize
def SpacyVectors(docs, drop=0.0):
batch = []
for doc in docs:
indices = numpy.zeros((len(doc),), dtype="i")
for i, word in enumerate(doc):
if word.orth in doc.vocab.vectors.key2row:
indices[i] = doc.vocab.vectors.key2row[word.orth]
else:
indices[i] = 0
vectors = doc.vocab.vectors.data[indices]
batch.append(vectors)
return batch, None
def build_text_classifier(nr_class, width=64, **cfg):
depth = cfg.get("depth", 2)
nr_vector = cfg.get("nr_vector", 5000)
pretrained_dims = cfg.get("pretrained_dims", 0)
with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
if cfg.get("low_data") and pretrained_dims:
model = (
SpacyVectors
>> flatten_add_lengths
>> with_getitem(0, Affine(width, pretrained_dims))
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(ReLu(width, width)) ** 2
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
>> logistic
)
return model
lower = HashEmbed(width, nr_vector, column=1)
prefix = HashEmbed(width // 2, nr_vector, column=2)
suffix = HashEmbed(width // 2, nr_vector, column=3)
shape = HashEmbed(width // 2, nr_vector, column=4)
trained_vectors = FeatureExtracter(
[ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
) >> with_flatten(
uniqued(
(lower | prefix | suffix | shape)
>> LN(Maxout(width, width + (width // 2) * 3)),
column=0,
)
)
if pretrained_dims:
static_vectors = SpacyVectors >> with_flatten(
Affine(width, pretrained_dims)
)
# TODO Make concatenate support lists
vectors = concatenate_lists(trained_vectors, static_vectors)
vectors_width = width * 2
else:
vectors = trained_vectors
vectors_width = width
static_vectors = None
tok2vec = vectors >> with_flatten(
LN(Maxout(width, vectors_width))
>> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
pad=depth,
)
cnn_model = (
tok2vec
>> flatten_add_lengths
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(zero_init(Maxout(width, width)))
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
)
linear_model = build_bow_text_classifier(
nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
)
if cfg.get("exclusive_classes"):
output_layer = Softmax(nr_class, nr_class * 2)
else:
output_layer = (
zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
)
model = (linear_model | cnn_model) >> output_layer
model.tok2vec = chain(tok2vec, flatten)
model.nO = nr_class
model.lsuv = False
return model
def build_bow_text_classifier(
nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg
):
with Model.define_operators({">>": chain}):
model = with_cpu(
Model.ops, extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class)
)
if not no_output_layer:
model = model >> (cpu_softmax if exclusive_classes else logistic)
model.nO = nr_class
return model
@layerize
def cpu_softmax(X, drop=0.0):
ops = NumpyOps()
def cpu_softmax_backward(dY, sgd=None):
return dY
return ops.softmax(X), cpu_softmax_backward
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
"""
Build a simple CNN text classifier, given a token-to-vector model as inputs.
If exclusive_classes=True, a softmax non-linearity is applied, so that the
outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
is applied instead, so that outputs are in the range [0, 1].
"""
with Model.define_operators({">>": chain}):
if exclusive_classes:
output_layer = Softmax(nr_class, tok2vec.nO)
else:
output_layer = (
zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
)
model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
model.tok2vec = chain(tok2vec, flatten)
model.nO = nr_class
return model
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
if "entity_width" not in cfg:
raise ValueError(Errors.E144.format(param="entity_width"))
conv_depth = cfg.get("conv_depth", 2)
cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
pretrained_vectors = cfg.get("pretrained_vectors", None)
context_width = cfg.get("entity_width")
with Model.define_operators({">>": chain, "**": clone}):
# context encoder
tok2vec = Tok2Vec(
width=hidden_width,
embed_size=embed_width,
pretrained_vectors=pretrained_vectors,
cnn_maxout_pieces=cnn_maxout_pieces,
subword_features=True,
conv_depth=conv_depth,
bilstm_depth=0,
)
model = (
tok2vec
>> flatten_add_lengths
>> Pooling(mean_pool)
>> Residual(zero_init(Maxout(hidden_width, hidden_width)))
>> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
)
model.tok2vec = tok2vec
model.nO = context_width
return model
@layerize
def flatten(seqs, drop=0.0):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=0)
X = ops.flatten(seqs, pad=0)
return X, finish_update
def concatenate_lists(*layers, **kwargs): # pragma: no cover
"""Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
"""
if not layers:
return noop()
drop_factor = kwargs.get("drop_factor", 1.0)
ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.0):
if drop is not None:
drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model
def masked_language_model(vocab, model, mask_prob=0.15):
"""Convert a model into a BERT-style masked language model"""
random_words = _RandomWords(vocab)
def mlm_forward(docs, drop=0.0):
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.begin_update(docs, drop=drop)
def mlm_backward(d_output, sgd=None):
d_output *= 1 - mask
return backprop(d_output, sgd=sgd)
return output, mlm_backward
return wrap(mlm_forward, model)
class _RandomWords(object):
def __init__(self, vocab):
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000]
self.probs = self.probs[:10000]
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
self.probs /= self.probs.sum()
self._cache = []
def next(self):
if not self._cache:
self._cache.extend(
numpy.random.choice(len(self.words), 10000, p=self.probs)
)
index = self._cache.pop()
return self.words[index]
def _apply_mask(docs, random_words, mask_prob=0.15):
# This needs to be here to avoid circular imports
from .tokens.doc import Doc
N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0.0, 1.0, (N,))
mask = mask >= mask_prob
i = 0
masked_docs = []
for doc in docs:
words = []
for token in doc:
if not mask[i]:
word = _replace_word(token.text, random_words)
else:
word = token.text
words.append(word)
i += 1
spaces = [bool(w.whitespace_) for w in doc]
# NB: If you change this implementation to instead modify
# the docs in place, take care that the IDs reflect the original
# words. Currently we use the original docs to make the vectors
# for the target, so we don't lose the original tokens. But if
# you modified the docs in place here, you would.
masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
return mask, masked_docs
def _replace_word(word, random_words, mask="[MASK]"):
roll = numpy.random.random()
if roll < 0.8:
return mask
elif roll < 0.9:
return random_words.next()
else:
return word
def _uniform_init(lo, hi):
def wrapped(W, ops):
copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))
return wrapped
@describe.attributes(
nM=Dimension("Vector dimensions"),
nC=Dimension("Number of characters per word"),
vectors=Synapses(
"Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
),
d_vectors=Gradient("vectors"),
)
class CharacterEmbed(Model):
def __init__(self, nM=None, nC=None, **kwargs):
Model.__init__(self, **kwargs)
self.nM = nM
self.nC = nC
@property
def nO(self):
return self.nM * self.nC
@property
def nV(self):
return 256
def begin_update(self, docs, drop=0.0):
if not docs:
return []
ids = []
output = []
weights = self.vectors
# This assists in indexing; it's like looping over this dimension.
# Still consider this weird witch craft...But thanks to Mark Neumann
# for the tip.
nCv = self.ops.xp.arange(self.nC)
for doc in docs:
doc_ids = doc.to_utf8_array(nr_char=self.nC)
doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
# Let's say I have a 2d array of indices, and a 3d table of data. What numpy
# incantation do I chant to get
# output[i, j, k] == data[j, ids[i, j], k]?
doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
output.append(doc_vectors.reshape((len(doc), self.nO)))
ids.append(doc_ids)
def backprop_character_embed(d_vectors, sgd=None):
gradient = self.d_vectors
for doc_ids, d_doc_vectors in zip(ids, d_vectors):
d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM))
gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return None
return output, backprop_character_embed
def get_cossim_loss(yh, y, ignore_zeros=False):
xp = get_array_module(yh)
# Find the zero vectors
if ignore_zeros:
zero_indices = xp.abs(y).sum(axis=1) == 0
# Add a small constant to avoid 0 vectors
yh = yh + 1e-8
y = y + 1e-8
# https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
mul_norms = norm_yh * norm_y
cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
losses = xp.abs(cosine - 1)
if ignore_zeros:
# If the target was a zero vector, don't count it in the loss.
d_yh[zero_indices] = 0
losses[zero_indices] = 0
loss = losses.sum()
return loss, -d_yh
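
With _ml.py removed, the pieces that survive are imported from their new homes; a hedged summary of the moves, based on the import changes in the surrounding diffs:

    # Where former spacy._ml functionality lives after this PR (illustrative)
    from spacy.util import create_default_optimizer       # was spacy._ml.create_default_optimizer
    from spacy.ml.component_models import Tok2Vec           # was spacy._ml.Tok2Vec (module added temporarily, per the commits above)
    from spacy.ml.component_models import masked_language_model
    from spacy.ml._layers import PrecomputableAffine        # was defined directly in spacy._ml
    from thinc.loss import CosineDistance                   # replaces spacy._ml.get_cossim_loss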


@ -4,6 +4,7 @@ from .link import link # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
from .train_from_config import train_from_config_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401

View File

@ -4,19 +4,21 @@ import time
import re
from collections import Counter
from pathlib import Path
from thinc.v2v import Affine, Maxout
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
from thinc.layers import Linear, Maxout
from thinc.util import prefer_gpu
from wasabi import msg
import srsly
from thinc.layers import chain, list2array
from thinc.loss import CosineDistance, L2Distance
from spacy.gold import Example
from ..errors import Errors
from ..tokens import Doc
from ..attrs import ID, HEAD
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
from .._ml import masked_language_model, get_cossim_loss
from ..ml.component_models import Tok2Vec
from ..ml.component_models import masked_language_model
from .. import util
from ..util import create_default_optimizer
from .train import _load_pretrained_tok2vec
@ -99,7 +101,7 @@ def pretrain(
with msg.loading(f"Loading model '{vectors_model}'..."):
nlp = util.load_model(vectors_model)
msg.good(f"Loaded model '{vectors_model}'")
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors
model = create_pretraining_model(
nlp,
Tok2Vec(
@ -136,7 +138,7 @@ def pretrain(
# Without '--init-tok2vec' the '--epoch-start' argument is ignored
epoch_start = 0
optimizer = create_default_optimizer(model.ops)
optimizer = create_default_optimizer()
tracker = ProgressTracker(frequency=10000)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
@ -251,13 +253,14 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
# TODO: this code originally didn't normalize, but shouldn't normalize=True ?
if objective == "L2":
d_target = prediction - target
loss = (d_target ** 2).sum()
distance = L2Distance(normalize=False)
elif objective == "cosine":
loss, d_target = get_cossim_loss(prediction, target)
distance = CosineDistance(normalize=False)
else:
raise ValueError(Errors.E142.format(loss_func=objective))
d_target, loss = distance(prediction, target)
return loss, d_target
@ -269,18 +272,18 @@ def create_pretraining_model(nlp, tok2vec):
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
# the shape of the models' components exactly. So what we cann
# "tok2vec" has to be the same set of processes as what the components do.
tok2vec = chain(tok2vec, flatten)
tok2vec = chain(tok2vec, list2array())
model = chain(tok2vec, output_layer)
model = masked_language_model(nlp.vocab, model)
model.tok2vec = tok2vec
model.output_layer = output_layer
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
model.set_ref("tok2vec", tok2vec)
model.set_ref("output_layer", output_layer)
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
return model
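
The rewritten create_pretraining_model above shows the general thinc 8 pattern this PR adopts: build layers with keyword arguments (layer norm and dropout folded into Maxout), register named references with set_ref instead of ad-hoc attributes, and call initialize with sample data instead of begin_training. A stripped-down, hedged sketch of that pattern with arbitrary dimensions:

    from thinc.layers import Maxout, Linear, chain

    output_size = 300
    model = chain(
        Maxout(300, pieces=3, normalize=True, dropout=0.0),  # layer norm is now an option on Maxout
        Linear(output_size),
    )
    model.set_ref("output_layer", model.layers[1])           # named refs replace model.output_layer = ...
    model.initialize(X=model.ops.alloc((5, 96), dtype="f"))  # infer missing dims from a sample batch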


@ -5,7 +5,7 @@ import cProfile
import pstats
import sys
import itertools
import thinc.extra.datasets
import ml_datasets
from wasabi import msg
from ..util import load_model
@ -29,7 +29,7 @@ def profile(
if inputs is None:
n_inputs = 25000
with msg.loading("Loading IMDB dataset via Thinc..."):
imdb_train, _ = thinc.extra.datasets.imdb()
imdb_train, _ = ml_datasets.imdb()
inputs, _ = zip(*imdb_train)
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
inputs = inputs[:n_inputs]


@ -1,7 +1,7 @@
import os
import tqdm
from pathlib import Path
from thinc.neural._classes.model import Model
from thinc.backends import use_ops
from timeit import default_timer as timer
import shutil
import srsly
@ -9,7 +9,7 @@ from wasabi import msg
import contextlib
import random
from .._ml import create_default_optimizer
from ..util import create_default_optimizer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus
from .. import util
@ -200,7 +200,7 @@ def train(
if base_model:
# Start with an existing model, use default optimizer
optimizer = create_default_optimizer(Model.ops)
optimizer = create_default_optimizer()
else:
# Start with a blank model, call begin_training
optimizer = nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
@ -367,7 +367,7 @@ def train(
cpu_wps = nwords / (end_time - start_time)
else:
gpu_wps = nwords / (end_time - start_time)
with Model.use_device("cpu"):
with use_ops("numpy"):
nlp_loaded = util.load_model_from_path(epoch_model_path)
for name, component in nlp_loaded.pipeline:
if hasattr(component, "cfg"):
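
use_ops("numpy") is the thinc 8 replacement for Model.use_device("cpu") here: models created or loaded inside the block are bound to the numpy backend, which is what lets the CPU words-per-second check above run on CPU even during GPU training. A minimal sketch, assuming thinc 8:

    from thinc.backends import use_ops
    from thinc.layers import Linear

    with use_ops("numpy"):
        cpu_model = Linear(nO=2, nI=4)  # picks up NumpyOps from the context manager
    cpu_model.initialize(X=cpu_model.ops.alloc((3, 4), dtype="f"))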


@ -0,0 +1,445 @@
import plac
from thinc.util import require_gpu
from wasabi import msg
from pathlib import Path
import thinc
import thinc.schedules
from thinc.model import Model
from spacy.gold import GoldCorpus
import spacy
from spacy.pipeline.tok2vec import Tok2VecListener
from typing import Optional, Dict, List, Union, Sequence
from pydantic import BaseModel, FilePath, StrictInt
import tqdm
from ..ml import component_models
from .. import util
registry = util.registry
CONFIG_STR = """
[training]
patience = 10
eval_frequency = 10
dropout = 0.2
init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
gold_preproc = false
max_length = 0
use_gpu = 0
scores = ["ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1.0}
limit = 0
[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = ${training:vectors}
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.ner]
factory = "ner"
[nlp.pipeline.ner.model]
@architectures = "transition_based_ner.v1"
nr_feature_tokens = 3
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.ner.model.tok2vec]
@architectures = "tok2vec_tensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "hash_embed_cnn.v1"
pretrained_vectors = ${nlp:vectors}
width = 128
depth = 4
window_size = 1
embed_size = 10000
maxout_pieces = 3
"""
class PipelineComponent(BaseModel):
factory: str
model: Model
class Config:
arbitrary_types_allowed = True
class ConfigSchema(BaseModel):
optimizer: Optional["Optimizer"]
class training(BaseModel):
patience: int = 10
eval_frequency: int = 100
dropout: float = 0.2
init_tok2vec: Optional[FilePath] = None
vectors: Optional[str] = None
max_epochs: int = 100
orth_variant_level: float = 0.0
gold_preproc: bool = False
max_length: int = 0
use_gpu: int = 0
scores: List[str] = ["ents_p", "ents_r", "ents_f"]
score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
limit: int = 0
batch_size: Union[Sequence[int], int]
class nlp(BaseModel):
lang: str
vectors: Optional[str]
pipeline: Optional[Dict[str, PipelineComponent]]
class Config:
extra = "allow"
# Of course, these would normally decorate the functions where they're defined.
# But for now...
@registry.architectures.register("hash_embed_cnn.v1")
def hash_embed_cnn(
pretrained_vectors, width, depth, embed_size, maxout_pieces, window_size
):
return component_models.Tok2Vec(
width=width,
embed_size=embed_size,
pretrained_vectors=pretrained_vectors,
conv_depth=depth,
cnn_maxout_pieces=maxout_pieces,
bilstm_depth=0,
window_size=window_size,
)
@registry.architectures.register("hash_embed_bilstm.v1")
def hash_embed_bilstm_v1(pretrained_vectors, width, depth, embed_size):
return component_models.Tok2Vec(
width=width,
embed_size=embed_size,
pretrained_vectors=pretrained_vectors,
bilstm_depth=depth,
conv_depth=0,
cnn_maxout_pieces=0,
)
@registry.architectures.register("tagger_model.v1")
def build_tagger_model_v1(tok2vec):
return component_models.build_tagger_model(nr_class=None, tok2vec=tok2vec)
@registry.architectures.register("transition_based_parser.v1")
def create_tb_parser_model(
tok2vec: Model,
nr_feature_tokens: StrictInt = 3,
hidden_width: StrictInt = 64,
maxout_pieces: StrictInt = 3,
):
from thinc.layers import Linear, chain, list2array
from spacy.ml._layers import PrecomputableAffine
from spacy.syntax._parser_model import ParserModel
from thinc.api import use_ops, zero_init
token_vector_width = tok2vec.get_dim("nO")
tok2vec = chain(tok2vec, list2array())
tok2vec.set_dim("nO", token_vector_width)
lower = PrecomputableAffine(
hidden_width, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), nP=maxout_pieces
)
lower.set_dim("nP", maxout_pieces)
with use_ops("numpy"):
# Initialize weights at zero, as it's a classification layer.
upper = Linear(init_W=zero_init)
return ParserModel(tok2vec, lower, upper)
@plac.annotations(
# fmt: off
train_path=("Location of JSON-formatted training data", "positional", None, Path),
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
output_path=("Output directory to store model in", "option", "o", Path),
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
# fmt: on
)
def train_from_config_cli(
train_path,
dev_path,
config_path,
output_path=None,
meta_path=None,
raw_text=None,
debug=False,
verbose=False,
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
JSON format. To convert data from other formats, use the `spacy convert`
command.
"""
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if not train_path or not train_path.exists():
msg.fail("Training data not found", train_path, exits=1)
if not dev_path or not dev_path.exists():
msg.fail("Development data not found", dev_path, exits=1)
if meta_path is not None and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
if output_path is not None and not output_path.exists():
output_path.mkdir()
try:
train_from_config(
config_path,
{"train": train_path, "dev": dev_path},
output_path=output_path,
meta_path=meta_path,
raw_text=raw_text,
)
except KeyboardInterrupt:
msg.warn("Cancelled.")
def train_from_config(
config_path,
data_paths,
raw_text=None,
meta_path=None,
output_path=None,
):
msg.info("Loading config from: {}".format(config_path))
config = util.load_from_config(config_path, create_objects=True)
use_gpu = config["training"]["use_gpu"]
if use_gpu >= 0:
msg.info("Using GPU")
else:
msg.info("Using CPU")
msg.info("Creating nlp from config")
nlp = create_nlp_from_config(**config["nlp"])
optimizer = config["optimizer"]
limit = config["training"]["limit"]
msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
msg.info("Initializing the nlp pipeline")
nlp.begin_training(
lambda: corpus.train_examples, device=use_gpu
)
train_batches = create_train_batches(nlp, corpus, config["training"])
evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
# Create iterator, which yields out info after each optimization step.
msg.info("Start training")
training_step_iterator = train_while_improving(
nlp,
optimizer,
train_batches,
evaluate,
config["training"]["dropout"],
config["training"]["patience"],
config["training"]["eval_frequency"],
)
msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate))
print_row = setup_printer(config)
try:
progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False)
for batch, info, is_best_checkpoint in training_step_iterator:
progress.update(1)
if is_best_checkpoint is not None:
progress.close()
print_row(info)
if is_best_checkpoint and output_path is not None:
nlp.to_disk(output_path)
progress = tqdm.tqdm(
total=config["training"]["eval_frequency"], leave=False
)
finally:
if output_path is not None:
with nlp.use_params(optimizer.averages):
final_model_path = output_path / "model-final"
nlp.to_disk(final_model_path)
msg.good("Saved model to output directory", final_model_path)
# with msg.loading("Creating best model..."):
# best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
# msg.good("Created best model", best_model_path)
def create_nlp_from_config(lang, vectors, pipeline):
lang_class = spacy.util.get_lang_class(lang)
nlp = lang_class()
if vectors is not None:
spacy.cli.train._load_vectors(nlp, vectors)
for name, component_cfg in pipeline.items():
factory = component_cfg.pop("factory")
component = nlp.create_pipe(factory, config=component_cfg)
nlp.add_pipe(component, name=name)
return nlp
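# Illustrative shape of the config["nlp"] block consumed above (values are
# assumptions, not taken from this PR's example configs):
#   {"lang": "en", "vectors": None,
#    "pipeline": {"tagger": {"factory": "tagger"}, "parser": {"factory": "parser"}}}
# Each component config's "factory" key is popped; the remaining keys are passed
# to nlp.create_pipe() as that component's config.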
def create_train_batches(nlp, corpus, cfg):
while True:
train_examples = corpus.train_dataset(
nlp,
noise_level=0.0,
orth_variant_level=cfg["orth_variant_level"],
gold_preproc=cfg["gold_preproc"],
max_length=cfg["max_length"],
ignore_misaligned=True,
)
for batch in util.minibatch_by_words(train_examples, size=cfg["batch_size"]):
yield batch
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
def evaluate():
with nlp.use_params(optimizer.averages):
dev_examples = list(
corpus.dev_dataset(
nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
)
)
scorer = nlp.evaluate(dev_examples)
scores = scorer.scores
# Calculate a weighted sum based on score_weights for the main score
weights = cfg["score_weights"]
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
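# For example, with weights = {"ents_f": 0.8, "tags_acc": 0.2} and
# scores = {"ents_f": 0.5, "tags_acc": 0.9}, the weighted score is
# 0.5 * 0.8 + 0.9 * 0.2 = 0.58.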
return weighted_score, scorer.scores
return evaluate
def train_while_improving(
nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
where info is a dict, and is_best_checkpoint is in [True, False, None] --
None indicating that the iteration was not evaluated as a checkpoint.
The evaluation is conducted by calling the evaluate callback (see `evaluate` below).
Positional arguments:
nlp: The spaCy pipeline to train and evaluate.
train_data (Iterable[Batch]): A generator of batches, with the training
data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
data iterable needs to take care of iterating over the epochs and
shuffling.
evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
The callback should take no arguments and return a tuple
`(main_score, other_scores)`. The main_score should be a float where
higher is better. other_scores can be any object.
Every iteration, the function yields out a tuple with:
* batch: A zipped sequence of Tuple[Doc, GoldParse] pairs.
* info: A dict with various information about the last update (see below).
* is_best_checkpoint: A value in None, False, True, indicating whether this
was the best evaluation so far. You should use this to save the model
checkpoints during training. If None, evaluation was not conducted on
that iteration. False means evaluation was conducted, but a previous
evaluation was better.
The info dict provides the following information:
epoch (int): How many passes over the data have been completed.
step (int): How many steps have been completed.
score (float): The main score from the last evaluation.
other_scores: The other scores from the last evaluation.
losses: The accumulated losses throughout training.
checkpoints: A list of previous results, where each result is a
(score, step, epoch) tuple.
"""
if isinstance(dropout, float):
dropouts = thinc.schedules.constant(dropout)
else:
dropouts = dropout
results = []
losses = {}
for step, batch in enumerate(train_data):
dropout = next(dropouts)
for subbatch in subdivide_batch(batch):
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
for name, proc in nlp.pipeline:
if hasattr(proc, "model"):
proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
score, other_scores = evaluate()
results.append((score, step))
is_best_checkpoint = score == max(results)[0]
else:
score, other_scores = (None, None)
is_best_checkpoint = None
info = {
"step": step,
"score": score,
"other_scores": other_scores,
"losses": losses,
"checkpoints": results,
}
yield batch, info, is_best_checkpoint
if is_best_checkpoint is not None:
losses = {}
# Stop if no improvement in `patience` updates
best_score, best_step = max(results)
if (step - best_step) >= patience:
break
def subdivide_batch(batch):
return [batch]
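# Placeholder: the whole batch is passed through as a single sub-batch. A
# fuller version might split oversized batches, e.g. (untested sketch):
# return list(util.minibatch_by_words(batch, size=1000))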
def setup_printer(config):
score_cols = config["training"]["scores"]
score_widths = [max(len(col), 6) for col in score_cols]
loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]]
loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header]
table_widths = [6] + loss_widths + score_widths + [6]
table_aligns = ["r" for _ in table_widths]
msg.row(table_header, widths=table_widths)
msg.row(["-" * width for width in table_widths])
def print_row(info):
losses = [
"{0:.2f}".format(info["losses"].get(col, 0.0))
for col in config["nlp"]["pipeline"]
]
scores = [
"{0:.2f}".format(info["other_scores"].get(col, 0.0))
for col in config["training"]["scores"]
]
data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])]
msg.row(data, widths=table_widths, aligns=table_aligns)
return print_row
@registry.architectures.register("tok2vec_tensors.v1")
def tok2vec_tensors_v1(width):
tok2vec = Tok2VecListener("tok2vec", width=width)
return tok2vec

View File

@ -8,7 +8,7 @@ DOCS: https://spacy.io/api/top-level#compat
import os
import sys
from thinc.neural.util import copy_array
from thinc.util import copy_array
try:
import cPickle as pickle
@ -30,10 +30,7 @@ try:
except ImportError:
cupy = None
try:
from thinc.neural.optimizers import Optimizer # noqa: F401
except ImportError:
from thinc.neural.optimizers import Adam as Optimizer # noqa: F401
from thinc.optimizers import Optimizer # noqa: F401
pickle = pickle
copy_reg = copy_reg

View File

@ -4,7 +4,8 @@ import weakref
import functools
from contextlib import contextmanager
from copy import copy, deepcopy
from thinc.neural import Model
from thinc.model import Model
from thinc.backends import get_current_ops
import srsly
import multiprocessing as mp
from itertools import chain, cycle
@ -16,7 +17,7 @@ from .lookups import Lookups
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .gold import Example
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
from .util import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
@ -468,30 +469,27 @@ class Language(object):
if sgd is None:
if self._optimizer is None:
self._optimizer = create_default_optimizer(Model.ops)
self._optimizer = create_default_optimizer()
sgd = self._optimizer
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
get_grads.alpha = sgd.alpha
get_grads.b1 = sgd.b1
get_grads.b2 = sgd.b2
pipes = list(self.pipeline)
random.shuffle(pipes)
if component_cfg is None:
component_cfg = {}
for name, proc in pipes:
# Determine whether component should set annotations. In theory I guess
# we should do this by inspecting the meta? Or we could just always
# say "yes"
for name, proc in self.pipeline:
component_cfg.setdefault(name, {})
component_cfg[name].setdefault("drop", drop)
component_cfg[name].setdefault("set_annotations", False)
grads = {}
for name, proc in self.pipeline:
if not hasattr(proc, "update"):
continue
grads = {}
kwargs = component_cfg.get(name, {})
kwargs.setdefault("drop", drop)
proc.update(examples, sgd=get_grads, losses=losses, **kwargs)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
if sgd is not False:
for name, proc in self.pipeline:
if hasattr(proc, "model"):
proc.model.finish_update(sgd)
def rehearse(self, examples, sgd=None, losses=None, config=None):
"""Make a "rehearsal" update to the models in the pipeline, to prevent
@ -518,7 +516,7 @@ class Language(object):
examples = Example.to_example_objects(examples, make_doc=self.make_doc)
if sgd is None:
if self._optimizer is None:
self._optimizer = create_default_optimizer(Model.ops)
self._optimizer = create_default_optimizer()
sgd = self._optimizer
pipes = list(self.pipeline)
random.shuffle(pipes)
@ -529,7 +527,7 @@ class Language(object):
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
get_grads.alpha = sgd.alpha
get_grads.learn_rate = sgd.learn_rate
get_grads.b1 = sgd.b1
get_grads.b2 = sgd.b2
for name, proc in pipes:
@ -537,8 +535,8 @@ class Language(object):
continue
grads = {}
proc.rehearse(examples, sgd=get_grads, losses=losses, **config.get(name, {}))
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
return losses
def preprocess_gold(self, examples):
@ -577,12 +575,13 @@ class Language(object):
if cfg.get("device", -1) >= 0:
util.use_gpu(cfg["device"])
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data)
ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
link_vectors_to_models(self.vocab)
if self.vocab.vectors.data.shape[1]:
cfg["pretrained_vectors"] = self.vocab.vectors.name
cfg["pretrained_vectors"] = self.vocab.vectors
if sgd is None:
sgd = create_default_optimizer(Model.ops)
sgd = create_default_optimizer()
self._optimizer = sgd
if component_cfg is None:
component_cfg = {}
@ -596,6 +595,7 @@ class Language(object):
sgd=self._optimizer,
**kwargs
)
self._link_components()
return self._optimizer
def resume_training(self, sgd=None, **cfg):
@ -609,13 +609,14 @@ class Language(object):
"""
if cfg.get("device", -1) >= 0:
util.use_gpu(cfg["device"])
ops = get_current_ops()
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data)
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
link_vectors_to_models(self.vocab)
if self.vocab.vectors.data.shape[1]:
cfg["pretrained_vectors"] = self.vocab.vectors.name
cfg["pretrained_vectors"] = self.vocab.vectors
if sgd is None:
sgd = create_default_optimizer(Model.ops)
sgd = create_default_optimizer()
self._optimizer = sgd
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
@ -736,7 +737,7 @@ class Language(object):
disable=disable,
n_process=n_process,
component_cfg=component_cfg,
as_example=False
as_example=False # TODO: shouldn't this be as_example=as_example ?
)
for doc, context in zip(docs, contexts):
yield (doc, context)
@ -838,6 +839,16 @@ class Language(object):
for proc in procs:
proc.terminate()
def _link_components(self):
"""Register 'listeners' within pipeline components, to allow them to
effectively share weights.
"""
for i, (name1, proc1) in enumerate(self.pipeline):
if hasattr(proc1, "find_listeners"):
for name2, proc2 in self.pipeline[i:]:
if hasattr(proc2, "model"):
proc1.find_listeners(proc2.model)
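# For example, an upstream component that implements find_listeners() can
# locate listener layers (such as the Tok2VecListener registered as
# "tok2vec_tensors.v1" above) inside downstream components' models, so the
# components effectively share tok2vec weights (sketch of the intent described
# in the docstring above).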
def to_disk(self, path, exclude=tuple(), disable=None):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
@ -906,6 +917,7 @@ class Language(object):
exclude = list(exclude) + ["vocab"]
util.from_disk(path, deserializers, exclude)
self._path = path
self._link_components()
return self
def to_bytes(self, exclude=tuple(), disable=None, **kwargs):
@ -962,6 +974,7 @@ class Language(object):
)
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_bytes(bytes_data, deserializers, exclude)
self._link_components()
return self

View File

@ -6,7 +6,7 @@ cimport numpy as np
np.import_array()
import numpy
from thinc.neural.util import get_array_module
from thinc.util import get_array_module
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE

View File

@ -1,2 +0,0 @@
from .tok2vec import Tok2Vec # noqa: F401
from .common import FeedForward, LayerNormalizedMaxout # noqa: F401

View File

@ -0,0 +1,52 @@
from thinc.api import Model
def CharacterEmbed(nM, nC):
# nM: Number of dimensions per character. nC: Number of characters.
nO = nM*nC if (nM is not None and nC is not None) else None
return Model(
"charembed",
forward,
init=init,
dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
params={"E": None}
).initialize()
def init(model, X=None, Y=None):
vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM"))
model.set_param("E", vectors_table)
def forward(model, docs, is_train):
if not docs:
return []
ids = []
output = []
E = model.get_param("E")
nC = model.get_dim("nC")
nM = model.get_dim("nM")
nO = model.get_dim("nO")
# This assists in indexing; it's like looping over this dimension.
# Still consider this weird witchcraft... but thanks to Mark Neumann
# for the tip.
nCv = model.ops.xp.arange(nC)
for doc in docs:
doc_ids = doc.to_utf8_array(nr_char=nC)
doc_vectors = model.ops.alloc3f(len(doc), nC, nM)
# Let's say I have a 2d array of indices, and a 3d table of data. What numpy
# incantation do I chant to get
# output[i, j, k] == data[j, ids[i, j], k]?
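# Answer (as used below): with nCv = arange(nC), the fancy index
# E[nCv, doc_ids[:, nCv]] broadcasts over the token axis, so
# result[i, j, :] == E[j, doc_ids[i, j], :].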
doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]]
output.append(doc_vectors.reshape((len(doc), nO)))
ids.append(doc_ids)
def backprop(d_output):
dE = model.ops.alloc(E.shape, dtype=E.dtype)
for doc_ids, d_doc_vectors in zip(ids, d_output):
d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM))
dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
model.inc_grad("E", dE)
return []
return output, backprop

spacy/ml/_layers.py
View File

@ -0,0 +1,165 @@
from thinc.model import Model
from thinc.api import normal_init
def PrecomputableAffine(nO, nI, nF, nP):
model = Model(
"precomputable_affine",
forward,
init=init,
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
params={"W": None, "b": None, "pad": None},
)
model.initialize()
return model
def forward(model, X, is_train):
nF = model.get_dim("nF")
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.get_param("W")
Yf = model.ops.gemm(
X, W.reshape((nF * nO * nP, nI)), trans2=True
)
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
def backward(dY_ids):
# This backprop is particularly tricky, because we get back a different
# thing from what we put out. We put out an array of shape:
# (nB, nF, nO, nP), and get back:
# (nB, nO, nP) and ids (nB, nF)
# The ids tell us the values of nF, so we would have:
#
# dYf = zeros((nB, nF, nO, nP))
# for b in range(nB):
# for f in range(nF):
# dYf[b, ids[b, f]] += dY[b]
#
# However, we avoid building that array for efficiency -- and just pass
# in the indices.
dY, ids = dY_ids
assert dY.ndim == 3
assert dY.shape[1] == nO, dY.shape
assert dY.shape[2] == nP, dY.shape
nB = dY.shape[0]
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
Xf = X[ids]
Xf = Xf.reshape((Xf.shape[0], nF * nI))
model.inc_grad("b", dY.sum(axis=0))
dY = dY.reshape((dY.shape[0], nO * nP))
Wopfi = W.transpose((1, 2, 0, 3))
Wopfi = model.ops.xp.ascontiguousarray(Wopfi)
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
# Reuse the buffer
dWopfi = Wopfi
dWopfi.fill(0.0)
model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
# (o, p, f, i) --> (f, o, p, i)
model.inc_grad("W", dWopfi.transpose((2, 0, 1, 3)))
return dXf.reshape((dXf.shape[0], nF, nI))
return Yf, backward
def _backprop_precomputable_affine_padding(model, dY, ids):
nB = dY.shape[0]
nF = model.get_dim("nF")
nP = model.get_dim("nP")
nO = model.get_dim("nO")
# Backprop the "padding", used as a filler for missing values.
# Values that are missing are set to -1, and each state vector could
# have multiple missing values. The padding has different values for
# different missing features. The gradient of the padding vector is:
#
# for b in range(nB):
# for f in range(nF):
# if ids[b, f] < 0:
# d_padding[0, f] += dY[b]
#
# Which can be rewritten as:
#
# for b in range(nB):
# d_pad[0, ids[b] < 0] += dY[b]
#
# I don't know how to avoid the loop without building a whole array :(.
# Cursed numpy.
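# One possible vectorized alternative (untested sketch): treat the mask of
# missing features as a float matrix and use a matrix multiply:
#   mask = model.ops.asarray((ids < 0), dtype="f")            # (nB, nF)
#   d_pad = (mask.T @ dY.reshape((nB, nO * nP))).reshape((1, nF, nO, nP))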
d_pad = model.ops.alloc((1, nF, nO, nP))
for b in range(nB):
d_pad[0, ids[b] < 0] += dY[b]
return d_pad
def init(model, X=None, Y=None):
"""This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
"""
if model.has_param("W") and model.get_param("W").any():
return
nF = model.get_dim("nF")
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.ops.alloc4f(nF, nO, nP, nI)
b = model.ops.alloc2f(nO, nP)
pad = model.ops.alloc4f(1, nF, nO, nP)
ops = model.ops
W = normal_init(ops, W.shape, fan_in=nF*nI)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
ids = ops.alloc((5000, nF), dtype="f")
ids += ops.xp.random.uniform(0, 1000, ids.shape)
ids = ops.asarray(ids, dtype="i")
tokvecs = ops.alloc((5000, nI), dtype="f")
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
tokvecs.shape
)
def predict(ids, tokvecs):
# nS ids. nW tokvecs. Exclude the padding array.
hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p)
vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
# need nS vectors
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
vectors = vectors.reshape((vectors.shape[0], nO, nP))
vectors += b
vectors = model.ops.asarray(vectors)
if nP >= 2:
return model.ops.maxout(vectors)[0]
else:
return vectors * (vectors >= 0)
tol_var = 0.01
tol_mean = 0.01
t_max = 10
W = model.get_param("W").copy()
b = model.get_param("b").copy()
for t_i in range(t_max):
acts1 = predict(ids, tokvecs)
var = model.ops.xp.var(acts1)
mean = model.ops.xp.mean(acts1)
if abs(var - 1.0) >= tol_var:
W /= model.ops.xp.sqrt(var)
model.set_param("W", W)
elif abs(mean) >= tol_mean:
b -= mean
model.set_param("b", b)
else:
break

View File

@ -1,129 +0,0 @@
from thinc.v2v import Model, Maxout
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow
from thinc.misc import Residual
from thinc.misc import LayerNorm as LN
from thinc.misc import FeatureExtracter
from thinc.api import layerize, chain, clone, concatenate, with_flatten
from thinc.api import uniqued, wrap, noop
from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
def Tok2Vec(width, embed_size, **kwargs):
# Circular imports :(
from .._ml import CharacterEmbed
from .._ml import PyTorchBiLSTM
pretrained_vectors = kwargs.get("pretrained_vectors", None)
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
subword_features = kwargs.get("subword_features", True)
char_embed = kwargs.get("char_embed", False)
if char_embed:
subword_features = False
conv_depth = kwargs.get("conv_depth", 4)
bilstm_depth = kwargs.get("bilstm_depth", 0)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
if subword_features:
prefix = HashEmbed(
width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
)
suffix = HashEmbed(
width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
)
shape = HashEmbed(
width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
)
else:
prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None:
glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
if subword_features:
embed = uniqued(
(glove | norm | prefix | suffix | shape)
>> LN(Maxout(width, width * 5, pieces=3)),
column=cols.index(ORTH),
)
else:
embed = uniqued(
(glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
column=cols.index(ORTH),
)
elif subword_features:
embed = uniqued(
(norm | prefix | suffix | shape)
>> LN(Maxout(width, width * 4, pieces=3)),
column=cols.index(ORTH),
)
elif char_embed:
embed = concatenate_lists(
CharacterEmbed(nM=64, nC=8),
FeatureExtracter(cols) >> with_flatten(norm),
)
reduce_dimensions = LN(
Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
)
else:
embed = norm
convolution = Residual(
ExtractWindow(nW=1)
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
)
if char_embed:
tok2vec = embed >> with_flatten(
reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
)
else:
tok2vec = FeatureExtracter(cols) >> with_flatten(
embed >> convolution ** conv_depth, pad=conv_depth
)
if bilstm_depth >= 1:
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.nO = width
tok2vec.embed = embed
return tok2vec
@layerize
def flatten(seqs, drop=0.0):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=0)
X = ops.flatten(seqs, pad=0)
return X, finish_update
def concatenate_lists(*layers, **kwargs): # pragma: no cover
"""Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
"""
if not layers:
return noop()
drop_factor = kwargs.get("drop_factor", 1.0)
ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.0):
if drop is not None:
drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model

View File

@ -1,41 +0,0 @@
from thinc.api import layerize, wrap, noop, chain, concatenate
from thinc.v2v import Model
def concatenate_lists(*layers, **kwargs): # pragma: no cover
"""Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
"""
if not layers:
return layerize(noop())
drop_factor = kwargs.get("drop_factor", 1.0)
ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.0):
if drop is not None:
drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model
@layerize
def flatten(seqs, drop=0.0):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=0)
X = ops.flatten(seqs, pad=0)
return X, finish_update

View File

@ -1,21 +0,0 @@
from thinc.api import chain
from thinc.v2v import Maxout
from thinc.misc import LayerNorm
from ..util import registry, make_layer
@registry.architectures.register("thinc.FeedForward.v1")
def FeedForward(config):
layers = [make_layer(layer_cfg) for layer_cfg in config["layers"]]
model = chain(*layers)
model.cfg = config
return model
@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
def LayerNormalizedMaxout(config):
width = config["width"]
pieces = config["pieces"]
layer = LayerNorm(Maxout(width, pieces=pieces))
layer.nO = width
return layer

View File

@ -0,0 +1,222 @@
from spacy import util
from spacy.ml.extract_ngrams import extract_ngrams
from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
from ..errors import Errors
from ._character_embed import CharacterEmbed
from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
from thinc.api import zero_init, glorot_uniform_init
def build_text_classifier(arch, config):
if arch == "cnn":
return build_simple_cnn_text_classifier(**config)
elif arch == "bow":
return build_bow_text_classifier(**config)
else:
raise ValueError("Unexpected textcat arch")
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg):
"""
Build a simple CNN text classifier, given a token-to-vector model as input.
If exclusive_classes=True, a softmax non-linearity is applied, so that the
outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
is applied instead, so that outputs are in the range [0, 1].
"""
with Model.define_operators({">>": chain}):
if exclusive_classes:
output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
else:
# TODO: experiment with init_W=zero_init
output_layer = (
Linear(nO=nr_class, nI=tok2vec.get_dim("nO"))
>> Logistic()
)
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nr_class)
return model
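# Usage sketch (illustrative, not part of this diff): build the classifier
# from the Tok2Vec factory defined later in this module, e.g.
#   tok2vec = Tok2Vec(width=96, embed_size=2000)
#   textcat_model = build_simple_cnn_text_classifier(tok2vec, nr_class=3, exclusive_classes=True)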
def build_bow_text_classifier(
nr_class, exclusive_classes, ngram_size=1, no_output_layer=False, **cfg
):
with Model.define_operators({">>": chain}):
model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nr_class)
model.to_cpu()
if not no_output_layer:
output_layer = (
Softmax(nO=nr_class) if exclusive_classes else Logistic(nO=nr_class)
)
output_layer.to_cpu()
model = model >> output_layer
model.set_dim("nO", nr_class)
return model
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
if "entity_width" not in cfg:
raise ValueError(Errors.E144.format(param="entity_width"))
conv_depth = cfg.get("conv_depth", 2)
cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
pretrained_vectors = cfg.get("pretrained_vectors", None)
context_width = cfg.get("entity_width")
with Model.define_operators({">>": chain, "**": clone}):
nel_tok2vec = Tok2Vec(
width=hidden_width,
embed_size=embed_width,
pretrained_vectors=pretrained_vectors,
cnn_maxout_pieces=cnn_maxout_pieces,
subword_features=True,
conv_depth=conv_depth,
bilstm_depth=0,
)
model = (
nel_tok2vec
>> list2ragged()
>> reduce_mean()
>> residual(Maxout(nO=hidden_width, nI=hidden_width, nP=2, dropout=0.0))
>> Linear(nO=context_width, nI=hidden_width)
)
model.initialize()
model.set_ref("tok2vec", nel_tok2vec)
model.set_dim("nO", context_width)
return model
def masked_language_model(*args, **kwargs):
raise NotImplementedError
def build_tagger_model(nr_class, tok2vec):
token_vector_width = tok2vec.get_dim("nO")
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
softmax = with_array(Softmax(nO=nr_class, nI=token_vector_width, init_W=zero_init))
model = chain(tok2vec, softmax)
model.set_ref("tok2vec", tok2vec)
model.set_ref("softmax", softmax)
return model
def build_morphologizer_model(class_nums, **cfg):
embed_size = util.env_opt("embed_size", 7000)
if "token_vector_width" in cfg:
token_vector_width = cfg["token_vector_width"]
else:
token_vector_width = util.env_opt("token_vector_width", 128)
pretrained_vectors = cfg.get("pretrained_vectors")
char_embed = cfg.get("char_embed", True)
with Model.define_operators({">>": chain, "+": add, "**": clone}):
if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"]
else:
tok2vec = Tok2Vec(
token_vector_width,
embed_size,
char_embed=char_embed,
pretrained_vectors=pretrained_vectors,
)
softmax = with_array(MultiSoftmax(nOs=class_nums, nI=token_vector_width))
model = tok2vec >> softmax
model.set_ref("tok2vec", tok2vec)
model.set_ref("softmax", softmax)
return model
def Tok2Vec(
width,
embed_size,
pretrained_vectors=None,
window_size=1,
cnn_maxout_pieces=3,
subword_features=True,
char_embed=False,
conv_depth=4,
bilstm_depth=0,
):
if char_embed:
subword_features = False
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
if subword_features:
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0)
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0)
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0)
else:
prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None:
glove = StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0)
if subword_features:
embed = uniqued(
(glove | norm | prefix | suffix | shape)
>> Maxout(
nO=width, nI=width * 5, nP=3, dropout=0.0, normalize=True
),
column=cols.index(ORTH),
)
else:
embed = uniqued(
(glove | norm)
>> Maxout(
nO=width, nI=width * 2, nP=3, dropout=0.0, normalize=True
),
column=cols.index(ORTH),
)
elif subword_features:
embed = uniqued(
concatenate(norm, prefix, suffix, shape)
>> Maxout(nO=width, nI=width * 4, nP=3, dropout=0.0, normalize=True),
column=cols.index(ORTH),
)
elif char_embed:
embed = CharacterEmbed(nM=64, nC=8) | FeatureExtractor(cols) >> with_array(
norm
)
reduce_dimensions = Maxout(
nO=width,
nI=64 * 8 + width,
nP=cnn_maxout_pieces,
dropout=0.0,
normalize=True,
)
else:
embed = norm
convolution = residual(
expand_window(window_size=window_size)
>> Maxout(
nO=width,
nI=width * 3,
nP=cnn_maxout_pieces,
dropout=0.0,
normalize=True,
)
)
if char_embed:
tok2vec = embed >> with_array(
reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
)
else:
tok2vec = FeatureExtractor(cols) >> with_array(
embed >> convolution ** conv_depth, pad=conv_depth
)
if bilstm_depth >= 1:
tok2vec = tok2vec >> PyTorchLSTM(
nO=width, nI=width, depth=bilstm_depth, bi=True
)
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.set_dim("nO", width)
tok2vec.set_ref("embed", embed)
return tok2vec

View File

@ -0,0 +1,39 @@
import numpy
from thinc.model import Model
from ..attrs import LOWER
def extract_ngrams(ngram_size, attr=LOWER) -> Model:
model = Model("extract_ngrams", forward)
model.attrs["ngram_size"] = ngram_size
model.attrs["attr"] = attr
return model
def forward(self, docs, is_train: bool):
batch_keys = []
batch_vals = []
for doc in docs:
unigrams = doc.to_array([self.attrs["attr"]])
ngrams = [unigrams]
for n in range(2, self.attrs["ngram_size"] + 1):
ngrams.append(self.ops.ngrams(n, unigrams))
keys = self.ops.xp.concatenate(ngrams)
keys, vals = self.ops.xp.unique(keys, return_counts=True)
batch_keys.append(keys)
batch_vals.append(vals)
# The dtype here matches what thinc is expecting -- which differs per
# platform (by int definition). This should be fixed once the problem
# is fixed on Thinc's side.
lengths = self.ops.asarray(
[arr.shape[0] for arr in batch_keys], dtype=numpy.int_
)
batch_keys = self.ops.xp.concatenate(batch_keys)
batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
def backprop(dY):
return dY
return (batch_keys, batch_vals, lengths), backprop

View File

@ -1,11 +1,12 @@
from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued
from thinc.api import noop, with_square_sequences
from thinc.v2v import Maxout, Model
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow
from thinc.misc import Residual, LayerNorm, FeatureExtracter
from thinc.layers import chain, clone, concatenate, with_array, uniqued
from thinc.model import Model
from thinc.layers import noop, with_padded
from thinc.layers import Maxout, expand_window
from thinc.layers import HashEmbed, StaticVectors
from thinc.layers import residual, LayerNorm, FeatureExtractor
from spacy.ml import _character_embed
from ..util import make_layer, registry
from ._wire import concatenate_lists
@registry.architectures.register("spacy.Tok2Vec.v1")
@ -13,19 +14,21 @@ def Tok2Vec(config):
doc2feats = make_layer(config["@doc2feats"])
embed = make_layer(config["@embed"])
encode = make_layer(config["@encode"])
field_size = getattr(encode, "receptive_field", 0)
tok2vec = chain(doc2feats, with_flatten(chain(embed, encode), pad=field_size))
tok2vec.cfg = config
tok2vec.nO = encode.nO
tok2vec.embed = embed
tok2vec.encode = encode
field_size = 0
if encode.has_attr("receptive_field"):
field_size = encode.attrs["receptive_field"]
tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size))
tok2vec.attrs["cfg"] = config
tok2vec.set_dim("nO", encode.get_dim("nO"))
tok2vec.set_ref("embed", embed)
tok2vec.set_ref("encode", encode)
return tok2vec
@registry.architectures.register("spacy.Doc2Feats.v1")
def Doc2Feats(config):
columns = config["columns"]
return FeatureExtracter(columns)
return FeatureExtractor(columns)
@registry.architectures.register("spacy.MultiHashEmbed.v1")
@ -40,55 +43,47 @@ def MultiHashEmbed(config):
width = config["width"]
rows = config["rows"]
norm = HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")
norm = HashEmbed(width, rows, column=cols.index("NORM"), dropout=0.0)
if config["use_subwords"]:
prefix = HashEmbed(
width, rows // 2, column=cols.index("PREFIX"), name="embed_prefix"
)
suffix = HashEmbed(
width, rows // 2, column=cols.index("SUFFIX"), name="embed_suffix"
)
shape = HashEmbed(
width, rows // 2, column=cols.index("SHAPE"), name="embed_shape"
)
prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"), dropout=0.0)
suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"), dropout=0.0)
shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"), dropout=0.0)
if config.get("@pretrained_vectors"):
glove = make_layer(config["@pretrained_vectors"])
mix = make_layer(config["@mix"])
with Model.define_operators({">>": chain, "|": concatenate}):
if config["use_subwords"] and config["@pretrained_vectors"]:
mix._layers[0].nI = width * 5
mix._layers[0].set_dim("nI", width * 5)
layer = uniqued(
(glove | norm | prefix | suffix | shape) >> mix,
column=cols.index("ORTH"),
)
elif config["use_subwords"]:
mix._layers[0].nI = width * 4
mix._layers[0].set_dim("nI", width * 4)
layer = uniqued(
(norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
)
elif config["@pretrained_vectors"]:
mix._layers[0].nI = width * 2
mix._layers[0].set_dim("nI", width * 2)
layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
else:
layer = norm
layer.cfg = config
layer.attrs["cfg"] = config
return layer
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(config):
from .. import _ml
width = config["width"]
chars = config["chars"]
chr_embed = _ml.CharacterEmbedModel(nM=width, nC=chars)
chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars)
other_tables = make_layer(config["@embed_features"])
mix = make_layer(config["@mix"])
model = chain(concatenate_lists(chr_embed, other_tables), mix)
model.cfg = config
model = chain(concatenate(chr_embed, other_tables), mix)
model.attrs["cfg"] = config
return model
@ -99,48 +94,49 @@ def MaxoutWindowEncoder(config):
nP = config["pieces"]
depth = config["depth"]
cnn = chain(
ExtractWindow(nW=nW), LayerNorm(Maxout(nO, nO * ((nW * 2) + 1), pieces=nP))
)
model = clone(Residual(cnn), depth)
model.nO = nO
model.receptive_field = nW * depth
cnn = chain(expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True))
model = clone(residual(cnn), depth)
model.set_dim("nO", nO)
model.attrs["receptive_field"] = nW * depth
return model
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(config):
from thinc.v2v import Mish
from thinc.layers import Mish
nO = config["width"]
nW = config["window_size"]
depth = config["depth"]
cnn = chain(ExtractWindow(nW=nW), LayerNorm(Mish(nO, nO * ((nW * 2) + 1))))
model = clone(Residual(cnn), depth)
model.nO = nO
cnn = chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO))
model = clone(residual(cnn), depth)
model.set_dim("nO", nO)
return model
@registry.architectures.register("spacy.PretrainedVectors.v1")
def PretrainedVectors(config):
return StaticVectors(config["vectors_name"], config["width"], config["column"])
# TODO: actual vectors instead of name
return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0)
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config):
import torch.nn
from thinc.extra.wrappers import PyTorchWrapperRNN
# TODO FIX
from thinc.layers import PyTorchRNNWrapper
width = config["width"]
depth = config["depth"]
if depth == 0:
return layerize(noop())
return with_square_sequences(
PyTorchWrapperRNN(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
return noop()
return with_padded(
PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
)
# TODO: update
_EXAMPLE_CONFIG = {
"@doc2feats": {
"arch": "Doc2Feats",

View File

@ -3,6 +3,7 @@ from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
from .morphologizer import Morphologizer
from .entityruler import EntityRuler
from .tok2vec import Tok2Vec
from .hooks import SentenceSegmenter, SimilarityHook
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
@ -13,6 +14,7 @@ __all__ = [
"EntityLinker",
"TextCategorizer",
"Tensorizer",
"Tok2Vec",
"Pipe",
"Morphologizer",
"EntityRuler",

View File

@ -1,9 +1,8 @@
from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
from .pipes import Pipe
from ..language import component
from .._ml import link_vectors_to_models
from ..util import link_vectors_to_models
@component("sentencizer_hook", assigns=["doc.user_hooks"])
@ -63,7 +62,10 @@ class SimilarityHook(Pipe):
@classmethod
def Model(cls, length):
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
return siamese(
concatenate(reduce_max(), reduce_mean()),
CauchySimilarity(length * 2)
)
def __call__(self, doc):
"""Install similarity hook"""
@ -80,7 +82,7 @@ class SimilarityHook(Pipe):
def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
self.require_model()
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
sims, bp_sims = self.model.begin_update(doc1_doc2)
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
"""Allocate model, using width from tensorizer in pipeline.
@ -89,7 +91,7 @@ class SimilarityHook(Pipe):
pipeline (list): The pipeline the model is part of.
"""
if self.model is True:
self.model = self.Model(pipeline[0].model.nO)
self.model = self.Model(pipeline[0].model.get_dim("nO"))
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()

View File

@ -3,19 +3,20 @@ from collections import defaultdict
import numpy
cimport numpy as np
from thinc.api import chain
from thinc.neural.util import to_categorical, copy_array, get_array_module
from thinc.layers import chain, list2array
from thinc.util import to_categorical, copy_array, get_array_module
from .. import util
from .pipes import Pipe
from ..language import component
from .._ml import Tok2Vec, build_morphologizer_model
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import create_default_optimizer
from ..util import link_vectors_to_models, create_default_optimizer
from ..errors import Errors, TempErrors
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
from ..ml.component_models import build_morphologizer_model
@component("morphologizer", assigns=["token.morph", "token.pos"])
class Morphologizer(Pipe):
@ -43,7 +44,7 @@ class Morphologizer(Pipe):
if self.model in (None, True, False):
return None
else:
return chain(self.model.tok2vec, flatten)
return chain(self.model.get_ref("tok2vec"), list2array())
def __call__(self, doc):
features, tokvecs = self.predict([doc])
@ -60,9 +61,9 @@ class Morphologizer(Pipe):
def predict(self, docs):
if not any(len(doc) for doc in docs):
# Handle case where there are no tokens in any docs.
n_labels = self.model.nO
guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
n_labels = self.model.get_dim("nO")
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO")))
return guesses, tokvecs
tokvecs = self.model.tok2vec(docs)
scores = self.model.softmax(tokvecs)
@ -77,7 +78,7 @@ class Morphologizer(Pipe):
for field in self._class_map.fields]
for i, doc in enumerate(docs):
doc_scores = batch_scores[i]
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
doc_guesses = scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"])
# Convert the neuron indices into feature IDs.
doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
for j in range(len(doc)):
@ -110,7 +111,7 @@ class Morphologizer(Pipe):
def get_loss(self, examples, scores):
guesses = []
for doc_scores in scores:
guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes))
guesses.append(scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"]))
guesses = self.model.ops.xp.vstack(guesses)
scores = self.model.ops.xp.vstack(scores)
if not isinstance(scores, numpy.ndarray):
@ -120,7 +121,7 @@ class Morphologizer(Pipe):
cdef int idx = 0
# Do this on CPU, as we can't vectorize easily.
target = numpy.zeros(scores.shape, dtype='f')
field_sizes = self.model.softmax.out_sizes
field_sizes = self.model.get_ref("softmax").attrs["nOs"]
for example in examples:
doc = example.doc
gold = example.gold

View File

@ -3,11 +3,11 @@
import numpy
import srsly
import random
from thinc.api import chain
from thinc.v2v import Affine, Maxout, Softmax
from thinc.misc import LayerNorm
from thinc.neural.util import to_categorical
from thinc.neural.util import get_array_module
from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array
from thinc.initializers import zero_init
from thinc.loss import CosineDistance
from thinc.util import to_categorical, get_array_module
from thinc.model import set_dropout_rate
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
@ -21,13 +21,14 @@ from ..language import Language, component
from ..syntax import nonproj
from ..gold import Example
from ..attrs import POS, ID
from ..util import link_vectors_to_models, create_default_optimizer
from ..parts_of_speech import X
from ..kb import KnowledgeBase
from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
from .._ml import build_text_classifier, build_simple_cnn_text_classifier
from .._ml import build_bow_text_classifier, build_nel_encoder
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
from ..ml.component_models import Tok2Vec, build_tagger_model
from ..ml.component_models import build_text_classifier
from ..ml.component_models import build_simple_cnn_text_classifier
from ..ml.component_models import build_bow_text_classifier, build_nel_encoder
from ..ml.component_models import masked_language_model
from ..errors import Errors, TempErrors, user_warning, Warnings
from .. import util
@ -126,13 +127,15 @@ class Pipe(object):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError
def update(self, examples, drop=0.0, sgd=None, losses=None):
def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model.
Delegates to predict() and get_loss().
"""
pass
if set_annotations:
docs = (self._get_doc(ex) for ex in examples)
docs = list(self.pipe(docs))
def rehearse(self, examples, sgd=None, losses=None, **config):
pass
@ -152,7 +155,7 @@ class Pipe(object):
raise NotImplementedError
def create_optimizer(self):
return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))
return create_default_optimizer()
def begin_training(
self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
@ -163,10 +166,30 @@ class Pipe(object):
self.model = self.Model(**self.cfg)
if hasattr(self, "vocab"):
link_vectors_to_models(self.vocab)
self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
def get_gradients(self):
"""Get non-zero gradients of the model's parameters, as a dictionary
keyed by the parameter ID. The values are (weights, gradients) tuples.
"""
gradients = {}
if self.model in (None, True, False):
return gradients
queue = [self.model]
seen = set()
for node in queue:
if node.id in seen:
continue
seen.add(node.id)
if hasattr(node, "_mem") and node._mem.gradient.any():
gradients[node.id] = [node._mem.weights, node._mem.gradient]
if hasattr(node, "_layers"):
queue.extend(node._layers)
return gradients
def use_params(self, params):
"""Modify the pipe's model, to use the given parameter values."""
with self.model.use_params(params):
@ -193,7 +216,7 @@ class Pipe(object):
def load_model(b):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
self.cfg["pretrained_vectors"] = self.vocab.vectors.name
self.cfg["pretrained_vectors"] = self.vocab.vectors
if self.model is True:
self.model = self.Model(**self.cfg)
try:
@ -226,7 +249,7 @@ class Pipe(object):
def load_model(p):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
self.cfg["pretrained_vectors"] = self.vocab.vectors.name
self.cfg["pretrained_vectors"] = self.vocab.vectors
if self.model is True:
self.model = self.Model(**self.cfg)
try:
@ -254,10 +277,10 @@ class Tensorizer(Pipe):
width (int): Output size of the model.
embed_size (int): Number of vectors in the embedding table.
**cfg: Config parameters.
RETURNS (Model): A `thinc.neural.Model` or similar instance.
RETURNS (Model): A `thinc.model.Model` or similar instance.
"""
input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96))
return zero_init(Affine(output_size, input_size, drop_factor=0.0))
return Linear(output_size, input_size, init_W=zero_init)
def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on
@ -277,7 +300,6 @@ class Tensorizer(Pipe):
self.model = model
self.input_models = []
self.cfg = dict(cfg)
self.cfg.setdefault("cnn_maxout_pieces", 3)
def __call__(self, example):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@ -337,7 +359,7 @@ class Tensorizer(Pipe):
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
doc.tensor = tensor
def update(self, examples, state=None, drop=0.0, sgd=None, losses=None):
def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
"""Update the model.
docs (iterable): A batch of `Doc` objects.
@ -350,17 +372,23 @@ class Tensorizer(Pipe):
examples = Example.to_example_objects(examples)
inputs = []
bp_inputs = []
set_dropout_rate(self.model, drop)
for tok2vec in self.input_models:
tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples], drop=drop)
set_dropout_rate(tok2vec, drop)
tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
inputs.append(tensor)
bp_inputs.append(bp_tensor)
inputs = self.model.ops.xp.hstack(inputs)
scores, bp_scores = self.model.begin_update(inputs, drop=drop)
scores, bp_scores = self.model.begin_update(inputs)
loss, d_scores = self.get_loss(examples, scores)
d_inputs = bp_scores(d_scores, sgd=sgd)
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
for d_input, bp_input in zip(d_inputs, bp_inputs):
bp_input(d_input, sgd=sgd)
bp_input(d_input)
if sgd is not None:
for tok2vec in self.input_models:
tok2vec.finish_update(sgd)
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
@ -387,6 +415,7 @@ class Tensorizer(Pipe):
self.input_models.append(model.tok2vec)
if self.model is True:
self.model = self.Model(**self.cfg)
self.model.initialize()
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
@ -405,7 +434,6 @@ class Tagger(Pipe):
self.model = model
self._rehearsal_model = None
self.cfg = dict(sorted(cfg.items()))
self.cfg.setdefault("cnn_maxout_pieces", 2)
@property
def labels(self):
@ -416,12 +444,12 @@ class Tagger(Pipe):
if self.model in (None, True, False):
return None
else:
return chain(self.model.tok2vec, flatten)
return chain(self.model.get_ref("tok2vec"), list2array())
def __call__(self, example):
doc = self._get_doc(example)
tags, tokvecs = self.predict([doc])
self.set_annotations([doc], tags, tensors=tokvecs)
tags = self.predict([doc])
self.set_annotations([doc], tags)
if isinstance(example, Example):
example.doc = doc
return example
@ -430,8 +458,10 @@ class Tagger(Pipe):
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
tag_ids, tokvecs = self.predict(docs)
self.set_annotations(docs, tag_ids, tensors=tokvecs)
tag_ids = self.predict(docs)
assert len(docs) == len(examples)
assert len(tag_ids) == len(examples)
self.set_annotations(docs, tag_ids)
if as_example:
annotated_examples = []
@ -447,20 +477,25 @@ class Tagger(Pipe):
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
n_labels = len(self.labels)
guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
return guesses, tokvecs
tokvecs = self.model.tok2vec(docs)
scores = self.model.softmax(tokvecs)
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
assert len(guesses) == len(docs)
return guesses
scores = self.model.predict(docs)
assert len(scores) == len(docs), (len(scores), len(docs))
guesses = self._scores2guesses(scores)
assert len(guesses) == len(docs)
return guesses
def _scores2guesses(self, scores):
guesses = []
for doc_scores in scores:
doc_guesses = doc_scores.argmax(axis=1)
if not isinstance(doc_guesses, numpy.ndarray):
doc_guesses = doc_guesses.get()
guesses.append(doc_guesses)
return guesses, tokvecs
return guesses
def set_annotations(self, docs, batch_tag_ids, tensors=None):
def set_annotations(self, docs, batch_tag_ids):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
@ -483,15 +518,9 @@ class Tagger(Pipe):
else:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
idx += 1
if tensors is not None and len(tensors):
if isinstance(doc.tensor, numpy.ndarray) \
and not isinstance(tensors[i], numpy.ndarray):
doc.extend_tensor(tensors[i].get())
else:
doc.extend_tensor(tensors[i])
doc.is_tagged = True
def update(self, examples, drop=0., sgd=None, losses=None):
def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
self.require_model()
examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
@ -500,13 +529,18 @@ class Tagger(Pipe):
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
# Handle cases where there are no tokens in any docs.
return
tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
loss, d_tag_scores = self.get_loss(examples, tag_scores)
bp_tag_scores(d_tag_scores, sgd=sgd)
bp_tag_scores(d_tag_scores)
if sgd not in (None, False):
self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
if set_annotations:
docs = [ex.doc for ex in examples]
self.set_annotations(docs, self._scores2guesses(tag_scores))
def rehearse(self, examples, drop=0., sgd=None, losses=None):
"""Perform a 'rehearsal' update, where we try to match the output of
@ -519,10 +553,12 @@ class Tagger(Pipe):
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
guesses, backprop = self.model.begin_update(docs, drop=drop)
set_dropout_rate(self.model, drop)
guesses, backprop = self.model.begin_update(docs)
target = self._rehearsal_model(examples)
gradient = guesses - target
backprop(gradient, sgd=sgd)
backprop(gradient)
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
@ -546,7 +582,7 @@ class Tagger(Pipe):
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [ex.doc for ex in examples]
@ -566,6 +602,7 @@ class Tagger(Pipe):
new_tag_map[tag] = orig_tag_map[tag]
else:
new_tag_map[tag] = {POS: X}
cdef Vocab vocab = self.vocab
if new_tag_map:
vocab.morphology = Morphology(vocab.strings, new_tag_map,
@ -577,16 +614,39 @@ class Tagger(Pipe):
if hp in kwargs:
self.cfg[hp] = kwargs[hp]
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
# Get batch of example docs, example outputs to call begin_training().
# This lets the model infer shapes.
n_tags = self.vocab.morphology.n_tags
for node in self.model.walk():
# TODO: softmax hack ?
if node.name == "softmax" and node.has_dim("nO") is None:
node.set_dim("nO", n_tags)
link_vectors_to_models(self.vocab)
self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
@classmethod
def Model(cls, n_tags, **cfg):
def Model(cls, n_tags=None, **cfg):
if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"):
raise ValueError(TempErrors.T008)
return build_tagger_model(n_tags, **cfg)
if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"]
else:
config = {
"width": cfg.get("token_vector_width", 96),
"embed_size": cfg.get("embed_size", 2000),
"pretrained_vectors": cfg.get("pretrained_vectors", None),
"window_size": cfg.get("window_size", 1),
"cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3),
"subword_features": cfg.get("subword_features", True),
"char_embed": cfg.get("char_embed", False),
"conv_depth": cfg.get("conv_depth", 4),
"bilstm_depth": cfg.get("bilstm_depth", 0),
}
tok2vec = Tok2Vec(**config)
return build_tagger_model(n_tags, tok2vec)
def add_label(self, label, values=None):
if not isinstance(label, str):
@ -633,12 +693,12 @@ class Tagger(Pipe):
def load_model(b):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
self.cfg["pretrained_vectors"] = self.vocab.vectors.name
self.cfg["pretrained_vectors"] = self.vocab.vectors
if self.model is True:
token_vector_width = util.env_opt(
"token_vector_width",
self.cfg.get("token_vector_width", 96))
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
self.model = self.Model(**self.cfg)
try:
self.model.from_bytes(b)
except AttributeError:
@ -676,9 +736,9 @@ class Tagger(Pipe):
def load_model(p):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
self.cfg["pretrained_vectors"] = self.vocab.vectors.name
self.cfg["pretrained_vectors"] = self.vocab.vectors
if self.model is True:
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
self.model = self.Model(**self.cfg)
with p.open("rb") as file_:
try:
self.model.from_bytes(file_.read())
@ -753,10 +813,12 @@ class SentenceRecognizer(Tagger):
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
# Handle cases where there are no tokens in any docs.
return
tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
loss, d_tag_scores = self.get_loss(examples, tag_scores)
bp_tag_scores(d_tag_scores, sgd=sgd)
bp_tag_scores(d_tag_scores)
if sgd is not None:
self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
@ -780,7 +842,7 @@ class SentenceRecognizer(Tagger):
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [ex.doc for ex in examples]
@ -797,6 +859,7 @@ class SentenceRecognizer(Tagger):
self.model = self.Model(len(self.labels), **self.cfg)
if sgd is None:
sgd = self.create_optimizer()
self.model.initialize()
return sgd
@classmethod
@ -918,6 +981,7 @@ class MultitaskObjective(Tagger):
token_vector_width = util.env_opt("token_vector_width")
self.model = self.Model(len(self.labels), tok2vec=tok2vec)
link_vectors_to_models(self.vocab)
self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
@ -925,14 +989,12 @@ class MultitaskObjective(Tagger):
@classmethod
def Model(cls, n_tags, tok2vec=None, **cfg):
token_vector_width = util.env_opt("token_vector_width", 96)
softmax = Softmax(n_tags, token_vector_width*2)
model = chain(
tok2vec,
LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)),
softmax
Maxout(nO=token_vector_width*2, nI=token_vector_width, nP=3, dropout=0.0),
LayerNorm(token_vector_width*2),
Softmax(nO=n_tags, nI=token_vector_width*2)
)
model.tok2vec = tok2vec
model.softmax = softmax
return model
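The rewritten model above summarizes the layer API changes: dimensions are keyword arguments (nO/nI/nP), LayerNorm is chained after Maxout instead of wrapping it, and dropout is a layer argument. A self-contained sketch with illustrative sizes, assuming Maxout, LayerNorm and Softmax are exposed from thinc.layers like Linear:

from thinc.layers import chain, Maxout, LayerNorm, Softmax

width = 96
n_tags = 17  # illustrative
model = chain(
    Maxout(nO=width * 2, nI=width, nP=3, dropout=0.0),
    LayerNorm(width * 2),
    Softmax(nO=n_tags, nI=width * 2),
)
model.initialize()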
def predict(self, docs):
@ -958,7 +1020,7 @@ class MultitaskObjective(Tagger):
correct[idx] = self.labels[label]
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
loss = (d_scores**2).sum()
return float(loss), d_scores
@ -1047,19 +1109,18 @@ class ClozeMultitask(Pipe):
def Model(cls, vocab, tok2vec, **cfg):
output_size = vocab.vectors.data.shape[1]
output_layer = chain(
LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
zero_init(Affine(output_size, output_size, drop_factor=0.0))
Maxout(nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0),
Linear(nO=output_size, nI=output_size, init_W=zero_init)
)
model = chain(tok2vec, output_layer)
model = masked_language_model(vocab, model)
model.tok2vec = tok2vec
model.output_layer = output_layer
return model
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = cfg
self.distance = CosineDistance(ignore_zeros=True, normalize=False)
def set_annotations(self, docs, dep_ids, tensors=None):
pass
@ -1069,7 +1130,8 @@ class ClozeMultitask(Pipe):
link_vectors_to_models(self.vocab)
if self.model is True:
self.model = self.Model(self.vocab, tok2vec)
X = self.model.ops.allocate((5, self.model.tok2vec.nO))
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.initialize()
self.model.output_layer.begin_training(X)
if sgd is None:
sgd = self.create_optimizer()
@ -1088,10 +1150,11 @@ class ClozeMultitask(Pipe):
# and look them up all at once. This prevents data copying.
ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
target = vectors[ids]
loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
return float(loss), gradient
gradient = self.distance.get_grad(prediction, target)
loss = self.distance.get_loss(prediction, target)
return loss, gradient
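get_cossim_loss is replaced by thinc's CosineDistance objective, created once in __init__ and queried separately for gradient and loss. A tiny numeric sketch; the import path is an assumption for this pre-release, and ignore_zeros is expected to skip all-zero target rows (e.g. tokens without a vector):

import numpy
from thinc.loss import CosineDistance

distance = CosineDistance(ignore_zeros=True, normalize=False)
prediction = numpy.asarray([[1.0, 0.0], [0.5, 0.5]], dtype="f")
target = numpy.asarray([[0.9, 0.1], [0.0, 0.0]], dtype="f")  # second row all zeros
d_prediction = distance.get_grad(prediction, target)
loss = distance.get_loss(prediction, target)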
def update(self, examples, drop=0., sgd=None, losses=None):
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
pass
def rehearse(self, examples, drop=0., sgd=None, losses=None):
@ -1099,9 +1162,12 @@ class ClozeMultitask(Pipe):
examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples], drop=drop)
set_dropout_rate(self.model, drop)
predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
bp_predictions(d_predictions, sgd=sgd)
bp_predictions(d_predictions)
if sgd is not None:
self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
@ -1115,19 +1181,45 @@ class TextCategorizer(Pipe):
"""
@classmethod
def Model(cls, nr_class=1, **cfg):
embed_size = util.env_opt("embed_size", 2000)
if "token_vector_width" in cfg:
token_vector_width = cfg["token_vector_width"]
def Model(cls, nr_class=1, exclusive_classes=None, **cfg):
if nr_class == 1:
exclusive_classes = False
if exclusive_classes is None:
raise ValueError(
"TextCategorizer Model must specify 'exclusive_classes'. "
"This setting determines whether the model will output "
"scores that sum to 1 for each example. If only one class "
"is true for each example, you should set exclusive_classes=True. "
"For 'multi_label' classification, set exclusive_classes=False."
)
if "embed_size" not in cfg:
cfg["embed_size"] = util.env_opt("embed_size", 2000)
if "token_vector_width" not in cfg:
cfg["token_vector_width"] = util.env_opt("token_vector_width", 96)
if cfg.get("architecture") == "bow":
return build_bow_text_classifier(nr_class, exclusive_classes, **cfg)
else:
token_vector_width = util.env_opt("token_vector_width", 96)
if cfg.get("architecture") == "simple_cnn":
tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
elif cfg.get("architecture") == "bow":
return build_bow_text_classifier(nr_class, **cfg)
else:
return build_text_classifier(nr_class, **cfg)
if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"]
else:
config = {
"width": cfg.get("token_vector_width", 96),
"embed_size": cfg.get("embed_size", 2000),
"pretrained_vectors": cfg.get("pretrained_vectors", None),
"window_size": cfg.get("window_size", 1),
"cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3),
"subword_features": cfg.get("subword_features", True),
"char_embed": cfg.get("char_embed", False),
"conv_depth": cfg.get("conv_depth", 4),
"bilstm_depth": cfg.get("bilstm_depth", 0),
}
tok2vec = Tok2Vec(**config)
return build_simple_cnn_text_classifier(
tok2vec,
nr_class,
exclusive_classes,
**cfg
)
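exclusive_classes is now an explicit, required choice whenever more than one label is possible: architecture picks "bow" versus the default CNN, while exclusive_classes controls whether the scores sum to one or are independent per label. A hedged usage sketch (nr_class and the keyword values are illustrative):

from spacy.pipeline import TextCategorizer

# Mutually exclusive labels: per-document scores sum to 1
bow_model = TextCategorizer.Model(nr_class=3, exclusive_classes=True, architecture="bow")
# Multi-label classification with the default CNN: independent per-label scores
cnn_model = TextCategorizer.Model(nr_class=3, exclusive_classes=False)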
@property
def tok2vec(self):
@ -1141,6 +1233,8 @@ class TextCategorizer(Pipe):
self.model = model
self._rehearsal_model = None
self.cfg = dict(cfg)
if "exclusive_classes" not in cfg:
self.cfg["exclusive_classes"] = True
@property
def labels(self):
@ -1180,7 +1274,7 @@ class TextCategorizer(Pipe):
scores = xp.zeros((len(docs), len(self.labels)))
return scores, tensors
scores = self.model(docs)
scores = self.model.predict(docs)
scores = self.model.ops.asarray(scores)
return scores, tensors
@ -1189,18 +1283,24 @@ class TextCategorizer(Pipe):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
def update(self, examples, state=None, drop=0., sgd=None, losses=None):
def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
self.require_model()
examples = Example.to_example_objects(examples)
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
# Handle cases where there are no tokens in any docs.
return
scores, bp_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
loss, d_scores = self.get_loss(examples, scores)
bp_scores(d_scores, sgd=sgd)
bp_scores(d_scores)
if sgd is not None:
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
if set_annotations:
docs = [ex.doc for ex in examples]
self.set_annotations(docs, scores=scores)
def rehearse(self, examples, drop=0., sgd=None, losses=None):
if self._rehearsal_model is None:
@ -1210,10 +1310,13 @@ class TextCategorizer(Pipe):
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
scores, bp_scores = self.model.begin_update(docs, drop=drop)
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update(docs)
target = self._rehearsal_model(examples)
gradient = scores - target
bp_scores(gradient, sgd=sgd)
bp_scores(gradient)
if sgd is not None:
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
@ -1247,7 +1350,7 @@ class TextCategorizer(Pipe):
# - a huge problem.
raise ValueError(Errors.E116)
# smaller = self.model._layers[-1]
# larger = Affine(len(self.labels)+1, smaller.nI)
# larger = Linear(len(self.labels)+1, smaller.nI)
# copy_array(larger.W[:smaller.nO], smaller.W)
# copy_array(larger.b[:smaller.nO], smaller.b)
# self.model._layers[-1] = larger
@ -1259,12 +1362,15 @@ class TextCategorizer(Pipe):
for cat in example.doc_annotation.cats:
self.add_label(cat)
if self.model is True:
self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
self.cfg.update(kwargs)
self.require_labels()
self.model = self.Model(len(self.labels), **self.cfg)
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
# TODO: use get_examples instead
docs = [Doc(Vocab(), words=["hello"])]
self.model.initialize(X=docs)
return sgd
@ -1382,6 +1488,7 @@ class EntityLinker(Pipe):
self.model = True
self.kb = None
self.cfg = dict(cfg)
self.distance = CosineDistance(normalize=False)
def set_kb(self, kb):
self.kb = kb
@ -1399,16 +1506,14 @@ class EntityLinker(Pipe):
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
self.require_kb()
self.cfg["entity_width"] = self.kb.entity_vector_length
if self.model is True:
self.model = self.Model(**self.cfg)
self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
def update(self, examples, state=None, drop=0.0, sgd=None, losses=None):
def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
self.require_model()
self.require_kb()
if losses is not None:
@ -1416,9 +1521,12 @@ class EntityLinker(Pipe):
if not examples:
return 0
examples = Example.to_example_objects(examples)
sentence_docs = []
docs = [ex.doc for ex in examples]
if set_annotations:
# This seems simpler than other ways to get that exact output -- but
# it does run the model twice :(
predictions = self.model.predict(docs)
golds = [ex.gold for ex in examples]
for doc, gold in zip(docs, golds):
@ -1443,13 +1551,17 @@ class EntityLinker(Pipe):
except AttributeError:
# Catch the exception when ent.sent is None and provide a user-friendly warning
raise RuntimeError(Errors.E030)
sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
set_dropout_rate(self.model, drop)
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds)
bp_context(d_scores, sgd=sgd)
bp_context(d_scores)
if sgd is not None:
self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
if set_annotations:
self.set_annotations(docs, predictions)
return loss
def get_similarity_loss(self, golds, scores):
@ -1467,7 +1579,8 @@ class EntityLinker(Pipe):
if scores.shape != entity_encodings.shape:
raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up"))
loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings)
gradients = self.distance.get_grad(scores, entity_encodings)
loss = self.distance.get_loss(scores, entity_encodings)
loss = loss / len(entity_encodings)
return loss, gradients
@ -1533,7 +1646,7 @@ class EntityLinker(Pipe):
for sent in doc.sents:
sent_doc = sent.as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
sentence_encoding = self.model([sent_doc])[0]
sentence_encoding = self.model.predict([sent_doc])[0]
xp = get_array_module(sentence_encoding)
sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t)
@ -1720,7 +1833,6 @@ class Sentencizer(Pipe):
self.set_annotations(docs, scores, tensors=tensors)
else:
self.set_annotations(docs, predictions)
if as_example:
annotated_examples = []
for ex, doc in zip(examples, docs):
@ -1729,7 +1841,7 @@ class Sentencizer(Pipe):
yield from annotated_examples
else:
yield from docs
def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without
modifying them.

spacy/pipeline/tok2vec.py (new file, 188 lines)

@ -0,0 +1,188 @@
from .pipes import Pipe
from ..gold import Example
from ..tokens import Doc
from ..vocab import Vocab
from ..language import component
from ..util import link_vectors_to_models, minibatch, registry, eg2doc
from thinc.model import Model, set_dropout_rate
@component("tok2vec", assigns=["doc.tensor"])
class Tok2Vec(Pipe):
@classmethod
def from_nlp(cls, nlp, **cfg):
return cls(nlp.vocab, **cfg)
@classmethod
def Model(cls, architecture, **cfg):
"""Create a new statistical model for the class.
architecture (str): The registered model architecture to use.
**cfg: Config parameters.
RETURNS (Model): A `thinc.model.Model` or similar instance.
"""
model = registry.architectures.get(architecture)
return model(**cfg)
def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
instance with the `Doc` objects it will process.
model (Model): A `Model` instance or `True` to allocate one later.
**cfg: Config parameters.
"""
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
self.listeners = []
def create_listener(self):
listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO"))
self.listeners.append(listener)
def add_listener(self, listener):
self.listeners.append(listener)
def find_listeners(self, model):
for node in model.walk():
if isinstance(node, Tok2VecListener) and node.upstream_name == self.name:
self.add_listener(node)
def __call__(self, doc):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
model. Vectors are set to the `Doc.tensor` attribute.
doc (Doc): The document to add context-sensitive vectors to.
RETURNS (Doc): The processed `Doc`.
"""
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
"""Process `Doc` objects as a stream.
stream (iterator): A sequence of `Doc` objects to process.
batch_size (int): Number of `Doc` objects to group.
n_threads (int): Number of threads.
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
"""
for batch in minibatch(stream, batch_size):
batch = list(batch)
if as_example:
docs = [eg2doc(doc) for doc in batch]
else:
docs = batch
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
yield from batch
def predict(self, docs):
"""Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the documents.
"""
tokvecs = self.model.predict(docs)
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners:
listener.receive(batch_id, tokvecs, None)
return tokvecs
def set_annotations(self, docs, tokvecses):
"""Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
tokvecses (iterable): Vector representations for each token in the documents.
"""
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False):
"""Update the model.
examples (iterable): A batch of examples
drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
"""
if losses is None:
losses = {}
examples = Example.to_example_objects(examples)
docs = [eg.doc for eg in examples]
if isinstance(docs, Doc):
docs = [docs]
set_dropout_rate(self.model, drop)
tokvecs, bp_tokvecs = self.model.begin_update(docs)
def capture_losses(d_tokvecs):
"""Accumulate tok2vec loss before doing backprop."""
l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs)
if self.name in losses:
losses[self.name] += l2_loss / len(d_tokvecs)
else:
losses[self.name] = l2_loss / len(d_tokvecs)
return bp_tokvecs(d_tokvecs)
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners:
listener.receive(batch_id, tokvecs, capture_losses)
if sgd is not None:
self.model.finish_update(sgd)
if set_annotations:
self.set_annotations(docs, tokvecs)
def get_loss(self, docs, golds, scores):
pass
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
"""Allocate models and pre-process training data
get_examples (function): Function returning example training data.
pipeline (list): The pipeline the model is part of.
"""
if self.model is True:
self.model = self.Model(**self.cfg)
# TODO: use examples instead ?
docs = [Doc(Vocab(), words=["hello"])]
self.model.initialize(X=docs)
link_vectors_to_models(self.vocab)
class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection,
for instance from a component earlier in the pipeline.
"""
name = "tok2vec-listener"
def __init__(self, upstream_name, width):
Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
self.upstream_name = upstream_name
self._batch_id = None
self._outputs = None
self._backprop = None
@classmethod
def get_batch_id(cls, inputs):
return sum(sum(token.orth for token in doc) for doc in inputs)
def receive(self, batch_id, outputs, backprop):
self._batch_id = batch_id
self._outputs = outputs
self._backprop = backprop
def verify_inputs(self, inputs):
if self._batch_id is None and self._outputs is None:
raise ValueError
else:
batch_id = self.get_batch_id(inputs)
if batch_id != self._batch_id:
raise ValueError(f"Mismatched IDs! {batch_id} vs {self._batch_id}")
else:
return True
def forward(model: Tok2VecListener, inputs, is_train):
if is_train:
model.verify_inputs(inputs)
return model._outputs, model._backprop
else:
return [doc.tensor for doc in inputs], lambda dX: []
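The listener machinery above is easiest to see end to end: a downstream model embeds a Tok2VecListener, the Tok2Vec component registers it, and every predict()/update() call broadcasts the shared token vectors (plus, when training, the loss-capturing backprop callback) to all listeners before the downstream forward() runs. A hedged wiring sketch; the Softmax head and the sizes are illustrative, and the builder is the separate Tok2Vec model factory from spacy.ml.component_models:

from thinc.layers import chain, Softmax
from spacy.vocab import Vocab
from spacy.ml.component_models import Tok2Vec as build_tok2vec_model

width, n_tags = 96, 17
vocab = Vocab()
tok2vec = Tok2Vec(vocab, model=build_tok2vec_model(width, 2000))  # component above
listener = Tok2VecListener(upstream_name="tok2vec", width=width)
tagger_head = chain(listener, Softmax(nO=n_tags, nI=width))
tok2vec.add_listener(listener)   # or: tok2vec.find_listeners(tagger_head)
# After tok2vec.predict(docs) or tok2vec.update(examples), the listener holds
# this batch's vectors, so tagger_head's forward pass reuses them instead of
# recomputing the tok2vec layer.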


@ -1,4 +1,4 @@
from thinc.typedefs cimport class_t, hash_t
from ..typedefs cimport hash_t, class_t
# These are passed as callbacks to thinc.search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1


@ -5,9 +5,9 @@ import numpy
from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
from thinc.extra.search cimport MaxViolation
from ..typedefs cimport hash_t, class_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParse
from ..errors import Errors


@ -1,6 +1,6 @@
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from thinc.typedefs cimport weight_t, class_t, hash_t
from ..typedefs cimport weight_t, class_t, hash_t
from ._state cimport StateC


@ -10,18 +10,14 @@ from libcpp.vector cimport vector
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t, class_t, hash_t
from thinc.extra.search cimport Beam
from thinc.api import chain, clone
from thinc.v2v import Model, Maxout, Affine
from thinc.misc import LayerNorm
from thinc.neural.ops import CupyOps, NumpyOps
from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec
from thinc.layers import Linear
from thinc.model import Model
from thinc.backends import CupyOps, NumpyOps, use_ops
from thinc.backends.linalg cimport Vec, VecVec
cimport blis.cy
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer
from ..typedefs cimport weight_t, class_t, hash_t
from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
@ -31,6 +27,7 @@ from .stateclass cimport StateClass
from .transition_system cimport Transition
from . import _beam_utils
from . import nonproj
from ..util import link_vectors_to_models, create_default_optimizer
cdef WeightsC get_c_weights(model) except *:
@ -44,8 +41,8 @@ cdef WeightsC get_c_weights(model) except *:
output.hidden_weights = NULL
output.hidden_bias = NULL
else:
vec2scores_W = model.vec2scores.W
vec2scores_b = model.vec2scores.b
vec2scores_W = model.vec2scores.get_param("W")
vec2scores_b = model.vec2scores.get_param("b")
output.hidden_weights = <const float*>vec2scores_W.data
output.hidden_bias = <const float*>vec2scores_b.data
cdef np.ndarray class_mask = model._class_mask
@ -57,12 +54,12 @@ cdef SizesC get_c_sizes(model, int batch_size) except *:
cdef SizesC output
output.states = batch_size
if model.vec2scores is None:
output.classes = model.state2vec.nO
output.classes = model.state2vec.get_dim("nO")
else:
output.classes = model.vec2scores.nO
output.hiddens = model.state2vec.nO
output.pieces = model.state2vec.nP
output.feats = model.state2vec.nF
output.classes = model.vec2scores.get_dim("nO")
output.hiddens = model.state2vec.get_dim("nO")
output.pieces = model.state2vec.get_dim("nP")
output.feats = model.state2vec.get_dim("nF")
output.embed_width = model.tokvecs.shape[1]
return output
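The mechanical change running through this file is that thinc v8 models no longer expose dimensions and parameters as attributes; a short, runnable sketch of the equivalences on a plain Linear layer:

from thinc.layers import Linear

model = Linear(nI=4)                 # output size not known yet
if model.has_dim("nO") is None:      # registered but unset
    model.set_dim("nO", 8)           # was: model.nO = 8
model.initialize()
W = model.get_param("W")             # was: model.W
b = model.get_param("b")             # was: model.b
model.set_param("b", b + 1.0)        # was: model.b += 1.0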
@ -226,7 +223,7 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
class ParserModel(Model):
def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
Model.__init__(self)
Model.__init__(self, name="parser_model", forward=forward)
self._layers = [tok2vec, lower_model]
if upper_model is not None:
self._layers.append(upper_model)
@ -235,41 +232,47 @@ class ParserModel(Model):
for class_ in unseen_classes:
self.unseen_classes.add(class_)
def begin_update(self, docs, drop=0.):
step_model = ParserStepModel(docs, self._layers, drop=drop,
unseen_classes=self.unseen_classes)
def finish_parser_update(golds, sgd=None):
step_model.make_updates(sgd)
return None
return step_model, finish_parser_update
def predict(self, docs):
step_model = ParserStepModel(docs, self._layers,
unseen_classes=self.unseen_classes, train=False)
return step_model
def resize_output(self, new_output):
def resize_output(self, new_nO):
if len(self._layers) == 2:
return
if new_output == self.upper.nO:
if new_nO == self.upper.get_dim("nO"):
return
smaller = self.upper
with Model.use_device('cpu'):
larger = Affine(new_output, smaller.nI)
larger.W.fill(0.0)
larger.b.fill(0.0)
# It seems very unhappy if I pass these as smaller.W?
# Seems to segfault. Maybe it's a descriptor protocol thing?
smaller_W = smaller.W
larger_W = larger.W
smaller_b = smaller.b
larger_b = larger.b
nI = smaller.get_dim("nI")
with use_ops('numpy'):
larger = Linear(new_nO, nI)
larger_W = larger.ops.alloc2f(new_nO, nI)
larger_b = larger.ops.alloc1f(new_nO)
smaller_W = smaller.get_param("W")
smaller_b = smaller.get_param("b")
# Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here.
larger_W[:smaller.nO] = smaller_W
larger_b[:smaller.nO] = smaller_b
larger_W[:smaller.get_dim("nO")] = smaller_W
larger_b[:smaller.get_dim("nO")] = smaller_b
larger.set_param("W", larger_W)
larger.set_param("b", larger_b)
self._layers[-1] = larger
for i in range(smaller.nO, new_output):
for i in range(smaller.get_dim("nO"), new_nO):
self.unseen_classes.add(i)
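The same resize pattern, pulled out as a standalone helper for clarity (this mirrors the method above rather than adding a new API; set_param is assumed to accept values before initialize(), as in the diff):

from thinc.layers import Linear
from thinc.backends import use_ops

def resize_linear(smaller, new_nO):
    nI = smaller.get_dim("nI")
    with use_ops("numpy"):           # keep the scoring layer on CPU ops
        larger = Linear(new_nO, nI)
    larger_W = larger.ops.alloc2f(new_nO, nI)
    larger_b = larger.ops.alloc1f(new_nO)
    # Weights are stored as (nr_out, nr_in), so growing the output adds rows
    larger_W[: smaller.get_dim("nO")] = smaller.get_param("W")
    larger_b[: smaller.get_dim("nO")] = smaller.get_param("b")
    larger.set_param("W", larger_W)
    larger.set_param("b", larger_b)
    return larger

upper = Linear(5, 64)
upper.initialize()
upper = resize_linear(upper, 8)      # three new classes, their rows start at zero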
def begin_training(self, X, y=None):
self.lower.begin_training(X, y=y)
def initialize(self, X=None, Y=None):
self.tok2vec.initialize()
self.lower.initialize(X=X, Y=Y)
if self.upper is not None:
# In case we need to trigger the callbacks
statevecs = self.ops.alloc((2, self.lower.get_dim("nO")))
self.upper.initialize(X=statevecs)
def finish_update(self, optimizer):
self.tok2vec.finish_update(optimizer)
self.lower.finish_update(optimizer)
if self.upper is not None:
self.upper.finish_update(optimizer)
@property
def tok2vec(self):
@ -284,17 +287,25 @@ class ParserModel(Model):
return self._layers[2]
def forward(model:ParserModel, X, is_train):
step_model = ParserStepModel(X, model._layers, unseen_classes=model.unseen_classes,
train=is_train)
return step_model, step_model.finish_steps
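ParserModel, ParserStepModel and Tok2VecListener all follow the same v8 pattern: the subclass only wires up state in __init__ and hands a free forward function to Model.__init__. A stripped-down sketch (MyStepLayer and its pass-through forward are illustrative):

import numpy
from thinc.model import Model

def passthrough_forward(model, X, is_train):
    def backprop(dY):
        return dY
    return X, backprop

class MyStepLayer(Model):
    def __init__(self):
        Model.__init__(self, name="my_step_layer", forward=passthrough_forward)

layer = MyStepLayer()
Y, backprop = layer(numpy.asarray([1.0, 2.0], dtype="f"), is_train=True)
dX = backprop(Y)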
class ParserStepModel(Model):
def __init__(self, docs, layers, unseen_classes=None, drop=0.):
self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
if layers[1].nP >= 2:
def __init__(self, docs, layers, unseen_classes=None, train=True):
Model.__init__(self, name="parser_step_model", forward=step_forward)
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
if layers[1].get_dim("nP") >= 2:
activation = "maxout"
elif len(layers) == 2:
activation = None
else:
activation = "relu"
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
activation=activation, drop=drop)
activation=activation, train=train)
if len(layers) == 3:
self.vec2scores = layers[-1]
else:
@ -304,7 +315,7 @@ class ParserStepModel(Model):
if self.vec2scores is None:
self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f')
else:
self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f')
self._class_mask = numpy.zeros((self.vec2scores.get_dim("nO"),), dtype='f')
self._class_mask.fill(1)
if unseen_classes is not None:
for class_ in unseen_classes:
@ -323,40 +334,6 @@ class ParserStepModel(Model):
def mark_class_seen(self, class_):
self._class_mask[class_] = 1
def begin_update(self, states, drop=0.):
token_ids = self.get_token_ids(states)
vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
if self.vec2scores is not None:
mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
if mask is not None:
vector *= mask
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
else:
scores = NumpyOps().asarray(vector)
get_d_vector = lambda d_scores, sgd=None: d_scores
mask = None
# If the class is unseen, make sure its score is minimum
scores[:, self._class_mask == 0] = numpy.nanmin(scores)
def backprop_parser_step(d_scores, sgd=None):
# Zero vectors for unseen classes
d_scores *= self._class_mask
d_vector = get_d_vector(d_scores, sgd=sgd)
if mask is not None:
d_vector *= mask
if isinstance(self.state2vec.ops, CupyOps) \
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
self.backprops.append((
util.get_async(self.cuda_stream, token_ids),
util.get_async(self.cuda_stream, d_vector),
get_d_tokvecs
))
else:
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
return None
return scores, backprop_parser_step
def get_token_ids(self, batch):
states = _beam_utils.collect_states(batch)
cdef StateClass state
@ -370,25 +347,56 @@ class ParserStepModel(Model):
c_ids += ids.shape[1]
return ids
def make_updates(self, sgd):
def finish_steps(self, golds):
# Add a padding vector to the d_tokvecs gradient, so that missing
# values don't affect the real gradient.
d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
# Tells CUDA to block, so our async copies complete.
if self.cuda_stream is not None:
self.cuda_stream.synchronize()
for ids, d_vector, bp_vector in self.backprops:
d_state_features = bp_vector((d_vector, ids), sgd=sgd)
d_state_features = bp_vector((d_vector, ids))
ids = ids.flatten()
d_state_features = d_state_features.reshape(
(ids.size, d_state_features.shape[2]))
self.ops.scatter_add(d_tokvecs, ids,
d_state_features)
# Padded -- see update()
self.bp_tokvecs(d_tokvecs[:-1], sgd=sgd)
if isinstance(self.ops, CupyOps):
d_tokvecs = self.ops.to_numpy(d_tokvecs)
self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
def step_forward(model: ParserStepModel, states, is_train):
token_ids = model.get_token_ids(states)
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
if model.vec2scores is not None:
scores, get_d_vector = model.vec2scores(vector, is_train)
else:
scores = NumpyOps().asarray(vector)
get_d_vector = lambda d_scores: d_scores
# If the class is unseen, make sure its score is minimum
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
def backprop_parser_step(d_scores):
# Zero vectors for unseen classes
d_scores *= model._class_mask
d_vector = get_d_vector(d_scores)
if isinstance(model.state2vec.ops, CupyOps) \
and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
model.backprops.append((
util.get_async(model.cuda_stream, token_ids),
util.get_async(model.cuda_stream, d_vector),
get_d_tokvecs
))
else:
model.backprops.append((token_ids, d_vector, get_d_tokvecs))
return None
return scores, backprop_parser_step
cdef class precompute_hiddens:
"""Allow a model to be "primed" by pre-computing input features in bulk.
@ -406,7 +414,7 @@ cdef class precompute_hiddens:
we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU.
"""
cdef readonly int nF, nO, nP
cdef readonly int nF, nO, nP # TODO: make these more like the dimensions in thinc
cdef bint _is_synchronized
cdef public object ops
cdef np.ndarray _features
@ -417,8 +425,8 @@ cdef class precompute_hiddens:
cdef object activation
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
activation="maxout", drop=0.):
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
activation="maxout", train=False):
gpu_cached, bp_features = lower_model(tokvecs, train)
cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray):
# Note the passing of cuda_stream here: it lets
@ -427,12 +435,16 @@ cdef class precompute_hiddens:
cached = gpu_cached.get(stream=cuda_stream)
else:
cached = gpu_cached
if not isinstance(lower_model.b, numpy.ndarray):
self.bias = lower_model.b.get()
if not isinstance(lower_model.get_param("b"), numpy.ndarray):
# self.bias = lower_model.get_param("b").get(stream=cuda_stream) ???
self.bias = lower_model.get_param("b")
else:
self.bias = lower_model.b
self.bias = lower_model.get_param("b")
self.nF = cached.shape[1]
self.nP = getattr(lower_model, 'nP', 1)
if lower_model.has_dim("nP"):
self.nP = lower_model.get_dim("nP")
else:
self.nP = 1
self.nO = cached.shape[2]
self.ops = lower_model.ops
assert activation in (None, "relu", "maxout")
@ -448,10 +460,26 @@ cdef class precompute_hiddens:
self._is_synchronized = True
return <float*>self._cached.data
def __call__(self, X):
return self.begin_update(X, drop=None)[0]
def get_dim(self, name):
if name == "nF":
return self.nF
elif name == "nP":
return self.nP
elif name == "nO":
return self.nO
else:
raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
def begin_update(self, token_ids, drop=0.):
def __call__(self, X, bint is_train):
if is_train:
return self.begin_update(X)
else:
return self.predict(X), lambda X: X
def predict(self, X):
return self.begin_update(X)[0]
def begin_update(self, token_ids):
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO, self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
@ -466,13 +494,13 @@ cdef class precompute_hiddens:
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector += self.bias
state_vector = state_vector + self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector_ids, sgd=None):
def backward(d_state_vector_ids):
d_state_vector, token_ids = d_state_vector_ids
d_state_vector = bp_nonlinearity(d_state_vector, sgd)
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
d_state_vector = bp_nonlinearity(d_state_vector)
d_tokens = bp_hiddens((d_state_vector, token_ids))
return d_tokens
return state_vector, backward
@ -492,7 +520,7 @@ cdef class precompute_hiddens:
else:
mask = None
def backprop_nonlinearity(d_best, sgd=None):
def backprop_nonlinearity(d_best):
if isinstance(d_best, numpy.ndarray):
ops = NumpyOps()
else:


@ -1,6 +1,6 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from ..typedefs cimport weight_t
from .stateclass cimport StateClass
from ..typedefs cimport attr_t


@ -1,7 +1,7 @@
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import Counter
from ..typedefs cimport weight_t
from .stateclass cimport StateClass
from ._state cimport StateC
from .transition_system cimport Transition


@ -1,5 +1,3 @@
from thinc.typedefs cimport atom_t
from .stateclass cimport StateClass
from .arc_eager cimport TransitionSystem
from ..vocab cimport Vocab


@ -13,24 +13,23 @@ from libcpp.vector cimport vector
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t, class_t, hash_t
from thinc.extra.search cimport Beam
from thinc.api import chain, clone
from thinc.v2v import Model, Maxout, Affine
from thinc.misc import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec
from thinc.layers import chain, clone, Linear, list2array
from thinc.backends import NumpyOps, CupyOps, use_ops
from thinc.util import get_array_module
from thinc.backends.linalg cimport Vec, VecVec
from thinc.initializers import zero_init
from thinc.model import set_dropout_rate
import srsly
from spacy.gold import Example
from ..typedefs cimport weight_t, class_t, hash_t
from ._parser_model cimport alloc_activations, free_activations
from ._parser_model cimport predict_states, arg_max_if_valid
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ._parser_model cimport get_c_weights, get_c_sizes
from ._parser_model import ParserModel
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer
from ..util import link_vectors_to_models, create_default_optimizer
from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
@ -44,6 +43,10 @@ from . import _beam_utils
from . import nonproj
from ..ml._layers import PrecomputableAffine
from ..ml.component_models import Tok2Vec
cdef class Parser:
"""
Base class of the DependencyParser and EntityRecognizer.
@ -54,7 +57,7 @@ cdef class Parser:
subword_features = util.env_opt('subword_features',
cfg.get('subword_features', True))
conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4))
conv_window = util.env_opt('conv_window', cfg.get('conv_depth', 1))
window_size = util.env_opt('window_size', cfg.get('window_size', 1))
t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
@ -71,23 +74,23 @@ cdef class Parser:
parser_maxout_pieces = 1
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
pretrained_vectors = cfg.get('pretrained_vectors', None)
tok2vec = Tok2Vec(token_vector_width, embed_size,
tok2vec = Tok2Vec(width=token_vector_width,
embed_size=embed_size,
conv_depth=conv_depth,
conv_window=conv_window,
window_size=window_size,
cnn_maxout_pieces=t2v_pieces,
subword_features=subword_features,
pretrained_vectors=pretrained_vectors,
bilstm_depth=bilstm_depth)
tok2vec = chain(tok2vec, flatten)
tok2vec.nO = token_vector_width
tok2vec = chain(tok2vec, list2array())
tok2vec.set_dim("nO", token_vector_width)
lower = PrecomputableAffine(hidden_width,
nF=nr_feature_tokens, nI=token_vector_width,
nP=parser_maxout_pieces)
lower.nP = parser_maxout_pieces
lower.set_dim("nP", parser_maxout_pieces)
if depth == 1:
with Model.use_device('cpu'):
upper = Affine(nr_class, hidden_width, drop_factor=0.0)
upper.W *= 0
with use_ops('numpy'):
upper = Linear(nr_class, hidden_width, init_W=zero_init)
else:
upper = None
@ -102,11 +105,13 @@ cdef class Parser:
'bilstm_depth': bilstm_depth,
'self_attn_depth': self_attn_depth,
'conv_depth': conv_depth,
'conv_window': conv_window,
'window_size': window_size,
'embed_size': embed_size,
'cnn_maxout_pieces': t2v_pieces
}
return ParserModel(tok2vec, lower, upper), cfg
model = ParserModel(tok2vec, lower, upper)
model.initialize()
return model, cfg
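The parser model is assembled from the same v8 pieces: the tok2vec column flattened with list2array(), the reimplemented PrecomputableAffine from spacy.ml._layers as the hidden layer, and a zero-initialised Linear kept on CPU ops as the scorer. A hedged sketch with illustrative sizes (the real values come from cfg and env options above):

from thinc.layers import chain, list2array, Linear
from thinc.backends import use_ops
from thinc.initializers import zero_init
from spacy.ml.component_models import Tok2Vec
from spacy.ml._layers import PrecomputableAffine

token_vector_width, hidden_width, nr_feature_tokens, pieces, nr_class = 96, 64, 6, 2, 34
tok2vec = chain(Tok2Vec(width=token_vector_width, embed_size=2000), list2array())
tok2vec.set_dim("nO", token_vector_width)
lower = PrecomputableAffine(hidden_width, nF=nr_feature_tokens,
                            nI=token_vector_width, nP=pieces)
with use_ops("numpy"):               # keep the output layer on CPU
    upper = Linear(nr_class, hidden_width, init_W=zero_init)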
name = 'base_parser'
@ -283,12 +288,13 @@ cdef class Parser:
def greedy_parse(self, docs, drop=0.):
cdef vector[StateC*] states
cdef StateClass state
set_dropout_rate(self.model, drop)
batch = self.moves.init_batch(docs)
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self._resize()
model = self.model(docs)
model = self.model.predict(docs)
weights = get_c_weights(model)
for state in batch:
if not state.is_final():
@ -303,18 +309,19 @@ cdef class Parser:
cdef Beam beam
cdef Doc doc
cdef np.ndarray token_ids
set_dropout_rate(self.model, drop)
beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density)
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self._resize()
model = self.model(docs)
model = self.model.predict(docs)
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
dtype='i', order='C')
cdef int* c_ids
cdef int nr_feature = self.cfg["nr_feature_tokens"]
cdef int n_states
model = self.model(docs)
model = self.model.predict(docs)
todo = [beam for beam in beams if not beam.is_done]
while todo:
token_ids.fill(-1)
@ -331,8 +338,8 @@ cdef class Parser:
n_states += 1
if n_states == 0:
break
vectors = model.state2vec(token_ids[:n_states])
scores = model.vec2scores(vectors)
vectors = model.state2vec.predict(token_ids[:n_states])
scores = model.vec2scores.predict(vectors)
todo = self.transition_beams(todo, scores)
return beams
@ -424,7 +431,7 @@ cdef class Parser:
beam.check_done(_beam_utils.check_final_state, NULL)
return [b for b in beams if not b.is_done]
def update(self, examples, drop=0., sgd=None, losses=None):
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
self.require_model()
examples = Example.to_example_objects(examples)
@ -438,8 +445,10 @@ cdef class Parser:
beam_update_prob = self.cfg.get('beam_update_prob', 0.5)
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob:
return self.update_beam(examples, self.cfg.get('beam_width', 1),
drop=drop, sgd=sgd, losses=losses,
drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations,
beam_density=self.cfg.get('beam_density', 0.001))
set_dropout_rate(self.model, drop)
# Chop sequences into lengths of this many transitions, to make the
# batch uniform length.
cut_gold = numpy.random.choice(range(20, 100))
@ -448,19 +457,24 @@ cdef class Parser:
if not s.is_final() and g is not None]
# Prepare the stepwise model, and get the callback for finishing the batch
model, finish_update = self.model.begin_update([ex.doc for ex in examples], drop=drop)
model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples])
all_states = list(states)
for _ in range(max_steps):
if not states_golds:
break
states, golds = zip(*states_golds)
scores, backprop = model.begin_update(states, drop=drop)
scores, backprop = model.begin_update(states)
d_scores = self.get_batch_loss(states, golds, scores, losses)
backprop(d_scores, sgd=sgd)
backprop(d_scores)
# Follow the predicted action
self.transition_states(states, scores)
states_golds = [eg for eg in states_golds if not eg[0].is_final()]
# Do the backprop
finish_update(golds, sgd=sgd)
backprop_tok2vec(golds)
if sgd is not None:
self.model.finish_update(sgd)
if set_annotations:
docs = [ex.doc for ex in examples]
self.set_annotations(docs, all_states)
return losses
def rehearse(self, examples, sgd=None, losses=None, **cfg):
@ -482,13 +496,15 @@ cdef class Parser:
# expand our model output.
self._resize()
# Prepare the stepwise model, and get the callback for finishing the batch
tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
model, finish_update = self.model.begin_update(docs, drop=0.0)
set_dropout_rate(self._rehearsal_model, 0.0)
set_dropout_rate(self.model, 0.0)
tutor, _ = self._rehearsal_model.begin_update(docs)
model, finish_update = self.model.begin_update(docs)
n_scores = 0.
loss = 0.
while states:
targets, _ = tutor.begin_update(states, drop=0.)
guesses, backprop = model.begin_update(states, drop=0.)
targets, _ = tutor.begin_update(states)
guesses, backprop = model.begin_update(states)
d_scores = (guesses - targets) / targets.shape[0]
# If all weights for an output are 0 in the original model, don't
# supervise that output. This allows us to add classes.
@ -499,12 +515,14 @@ cdef class Parser:
states = [state for state in states if not state.is_final()]
n_scores += d_scores.size
# Do the backprop
finish_update(docs, sgd=sgd)
finish_update(docs)
if sgd is not None:
self.model.finish_update(sgd)
losses[self.name] += loss / n_scores
return losses
def update_beam(self, examples, width, drop=0., sgd=None, losses=None,
beam_density=0.0):
set_annotations=False, beam_density=0.0):
examples = Example.to_example_objects(examples)
docs = [ex.doc for ex in examples]
golds = [ex.gold for ex in examples]
@ -514,15 +532,16 @@ cdef class Parser:
for gold in golds:
self.moves.preprocess_gold(gold)
new_golds.append(gold)
model, finish_update = self.model.begin_update(docs, drop=drop)
set_dropout_rate(self.model, drop)
model, backprop_tok2vec = self.model.begin_update(docs)
states_d_scores, backprops, beams = _beam_utils.update_beam(
self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec,
model.vec2scores, width, drop=drop, losses=losses,
self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds,
model.state2vec, model.vec2scores, width, losses=losses,
beam_density=beam_density)
for i, d_scores in enumerate(states_d_scores):
losses[self.name] += (d_scores**2).mean()
ids, bp_vectors, bp_scores = backprops[i]
d_vector = bp_scores(d_scores, sgd=sgd)
d_vector = bp_scores(d_scores)
if isinstance(model.ops, CupyOps) \
and not isinstance(ids, model.state2vec.ops.xp.ndarray):
model.backprops.append((
@ -531,11 +550,34 @@ cdef class Parser:
bp_vectors))
else:
model.backprops.append((ids, d_vector, bp_vectors))
model.make_updates(sgd)
backprop_tok2vec(golds)
if sgd is not None:
self.model.finish_update(sgd)
if set_annotations:
self.set_annotations(docs, beams)
cdef Beam beam
for beam in beams:
_beam_utils.cleanup_beam(beam)
def get_gradients(self):
"""Get non-zero gradients of the model's parameters, as a dictionary
keyed by the parameter ID. The values are (weights, gradients) tuples.
"""
gradients = {}
if self.model in (None, True, False):
return gradients
queue = [self.model]
seen = set()
for node in queue:
if node.id in seen:
continue
seen.add(node.id)
if hasattr(node, "_mem") and node._mem.gradient.any():
gradients[node.id] = [node._mem.weights, node._mem.gradient]
if hasattr(node, "_layers"):
queue.extend(node._layers)
return gradients
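A sketch of how the collected gradients might be consumed, e.g. to log gradient norms after an update (log_gradient_norms is an illustrative helper, not part of this patch):

def log_gradient_norms(pipe):
    # pipe is any component exposing get_gradients(), such as the parser above
    for param_id, (weights, gradient) in pipe.get_gradients().items():
        grad_norm = float((gradient ** 2).sum()) ** 0.5
        print(f"{param_id}: {weights.size} weights, grad L2 norm {grad_norm:.4f}")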
def _init_gold_batch(self, whole_examples, min_length=5, max_length=500):
"""Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
@ -605,8 +647,7 @@ cdef class Parser:
return d_scores
def create_optimizer(self):
return create_default_optimizer(self.model.ops,
**self.cfg.get('optimizer', {}))
return create_default_optimizer()
def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg):
if 'model' in cfg:
@ -636,14 +677,16 @@ cdef class Parser:
for doc, gold in parses:
doc_sample.append(doc)
gold_sample.append(gold)
self.model.begin_training(doc_sample, gold_sample)
self.model.initialize(doc_sample, gold_sample)
if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg)
link_vectors_to_models(self.vocab)
else:
if sgd is None:
sgd = self.create_optimizer()
self.model.begin_training([])
if self.model.upper.has_dim("nO") is None:
self.model.upper.set_dim("nO", self.moves.n_moves)
self.model.initialize()
self.cfg.update(cfg)
return sgd
@ -709,7 +752,7 @@ cdef class Parser:
if 'model' not in exclude:
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
self.cfg['pretrained_vectors'] = self.vocab.vectors.name
self.cfg['pretrained_vectors'] = self.vocab.vectors
if self.model is True:
self.model, cfg = self.Model(**self.cfg)
else:


@ -1,7 +1,6 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from ..typedefs cimport attr_t
from ..typedefs cimport attr_t, weight_t
from ..structs cimport TokenC
from ..gold cimport GoldParse
from ..gold cimport GoldParseC


@ -1,7 +1,7 @@
# cython: infer_types=True
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from ..typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import Counter
import srsly


@ -1,6 +1,6 @@
import pytest
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
from thinc.optimizers import Adam
from thinc.backends import NumpyOps
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
@ -28,7 +28,7 @@ def _train_parser(parser):
fix_random_seed(1)
parser.add_label("left")
parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001)
sgd = Adam(0.001, ops=NumpyOps())
for i in range(5):
losses = {}
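The optimizer construction also changes with thinc v8: the learning rate comes first and the ops backend moves to a keyword. A minimal sketch:

from thinc.optimizers import Adam
from thinc.backends import NumpyOps

sgd = Adam(0.001, ops=NumpyOps())   # thinc v8; previously Adam(NumpyOps(), 0.001)
sgd_default = Adam(0.001)           # ops may also be omitted, as in other tests here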
@ -41,8 +41,8 @@ def _train_parser(parser):
def test_add_label(parser):
parser = _train_parser(parser)
parser.add_label("right")
sgd = Adam(NumpyOps(), 0.001)
for i in range(10):
sgd = Adam(0.001, ops=NumpyOps())
for i in range(100):
losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = GoldParse(


@ -7,6 +7,11 @@ from spacy.syntax.ner import BiluoPushDown
from spacy.gold import GoldParse
from spacy.tokens import Doc
TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
@pytest.fixture
def vocab():
@ -263,7 +268,7 @@ def test_change_number_features():
nlp.add_pipe(ner)
ner.add_label("PERSON")
nlp.begin_training()
assert ner.model.lower.nF == ner.nr_feature
assert ner.model.lower.get_dim("nF") == ner.nr_feature
# Test we can change it
nlp = English()
ner = nlp.create_pipe("ner")
@ -272,11 +277,36 @@ def test_change_number_features():
nlp.begin_training(
component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
)
assert ner.model.lower.nF == 3
assert ner.model.lower.get_dim("nF") == 3
# Test the model runs
nlp("hello world")
def test_overfitting():
# Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly
nlp = English()
ner = nlp.create_pipe("ner")
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
nlp.add_pipe(ner)
optimizer = nlp.begin_training()
for i in range(50):
losses = {}
nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.00001
# test the trained model
test_text = "I like London."
doc = nlp(test_text)
ents = doc.ents
assert len(ents) == 1
assert ents[0].text == "London"
assert ents[0].label_ == "LOC"
class BlockerComponent1(object):
name = "my_blocker"


@ -1,5 +1,5 @@
import pytest
from spacy._ml import Tok2Vec
from spacy.ml.component_models import Tok2Vec
from spacy.vocab import Vocab
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.nn_parser import Parser
@ -20,7 +20,9 @@ def arc_eager(vocab):
@pytest.fixture
def tok2vec():
return Tok2Vec(8, 100)
tok2vec = Tok2Vec(8, 100)
tok2vec.initialize()
return tok2vec
@pytest.fixture
@ -30,7 +32,7 @@ def parser(vocab, arc_eager):
@pytest.fixture
def model(arc_eager, tok2vec):
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.get_dim("nO"))[0]
@pytest.fixture
@ -53,7 +55,7 @@ def test_build_model(parser):
def test_predict_doc(parser, tok2vec, model, doc):
doc.tensor = tok2vec([doc])[0]
doc.tensor = tok2vec.predict([doc])[0]
parser.model = model
parser(doc)
@ -61,8 +63,9 @@ def test_predict_doc(parser, tok2vec, model, doc):
def test_update_doc(parser, model, doc, gold):
parser.model = model
def optimize(weights, gradient, key=None):
def optimize(key, weights, gradient):
weights -= 0.001 * gradient
return weights, gradient
parser.update((doc, gold), sgd=optimize)


@ -1,7 +1,25 @@
import pytest
from spacy.lang.en import English
from ..util import get_doc, apply_transition_sequence
TRAIN_DATA = [
(
"They trade mortgage-backed securities.",
{
"heads": [1, 1, 4, 4, 5, 1, 1],
"deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
},
),
(
"I like London and Berlin.",
{
"heads": [1, 1, 1, 2, 2, 1],
"deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
},
),
]
def test_parser_root(en_tokenizer):
text = "i don't have other assistance"
@ -162,3 +180,27 @@ def test_parser_set_sent_starts(en_vocab):
for sent in doc.sents:
for token in sent:
assert token.head in sent
def test_overfitting():
# Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
nlp = English()
parser = nlp.create_pipe("parser")
for _, annotations in TRAIN_DATA:
for dep in annotations.get("deps", []):
parser.add_label(dep)
nlp.add_pipe(parser)
optimizer = nlp.begin_training()
for i in range(50):
losses = {}
nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
assert losses["parser"] < 0.00001
# test the trained model
test_text = "I like securities."
doc = nlp(test_text)
assert doc[0].dep_ is "nsubj"
assert doc[2].dep_ is "dobj"
assert doc[3].dep_ is "punct"


@ -1,6 +1,6 @@
import pytest
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
from thinc.optimizers import Adam
from thinc.backends import NumpyOps
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
@ -21,7 +21,7 @@ def parser(vocab):
# parser.add_label('right')
parser.add_label("left")
parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001)
sgd = Adam(0.001)
for i in range(10):
losses = {}


@ -1,4 +1,5 @@
import pytest
import srsly
from spacy.language import Language
@ -8,3 +9,35 @@ def test_label_types():
nlp.get_pipe("tagger").add_label("A")
with pytest.raises(ValueError):
nlp.get_pipe("tagger").add_label(9)
TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),
]
def test_overfitting():
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
nlp = Language()
tagger = nlp.create_pipe("tagger")
for tag, values in TAG_MAP.items():
tagger.add_label(tag, values)
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()
for i in range(50):
losses = {}
nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
assert losses["tagger"] < 0.00001
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
assert doc[0].tag_ is "N"
assert doc[1].tag_ is "V"
assert doc[2].tag_ is "J"
assert doc[3].tag_ is "N"


@ -6,6 +6,11 @@ from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc
from spacy.gold import GoldParse
TRAIN_DATA = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
@pytest.mark.skip(reason="Test is flakey when run with others")
def test_simple_train():
@ -67,3 +72,26 @@ def test_label_types():
nlp.get_pipe("textcat").add_label("answer")
with pytest.raises(ValueError):
nlp.get_pipe("textcat").add_label(9)
def test_overfitting():
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
nlp = Language()
textcat = nlp.create_pipe("textcat")
for _, annotations in TRAIN_DATA:
for label, value in annotations.get("cats").items():
textcat.add_label(label)
nlp.add_pipe(textcat)
optimizer = nlp.begin_training()
for i in range(50):
losses = {}
nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
assert losses["textcat"] < 0.00001
# test the trained model
test_text = "I am happy."
doc = nlp(test_text)
cats = doc.cats
assert cats["POSITIVE"] > 0.9
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)


@ -8,7 +8,7 @@ from spacy.matcher import Matcher
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.compat import pickle
from spacy._ml import link_vectors_to_models
from spacy.util import link_vectors_to_models
import numpy
import random


@ -32,7 +32,7 @@ def test_issue3611():
# training the network
with nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]):
optimizer = nlp.begin_training()
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))


@ -1,12 +1,12 @@
import pytest
from spacy import registry
from thinc.v2v import Affine
from thinc.layers import Linear
from catalogue import RegistryError
@registry.architectures.register("my_test_function")
def create_model(nr_in, nr_out):
return Affine(nr_in, nr_out)
return Linear(nr_in, nr_out)
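This register/get round-trip is the same mechanism the new Tok2Vec.Model classmethod relies on; retrieving the function registered above might look like:

from spacy import registry

build = registry.architectures.get("my_test_function")
model = build(nr_in=16, nr_out=16)
model.initialize()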
def test_get_architecture():


@ -5,7 +5,8 @@ from pathlib import Path
from spacy import util
from spacy import prefer_gpu, require_gpu
from spacy.compat import symlink_to, symlink_remove, is_windows
from spacy._ml import PrecomputableAffine
from spacy.ml._layers import PrecomputableAffine
from spacy.ml._layers import _backprop_precomputable_affine_padding
from subprocess import CalledProcessError
@ -67,28 +68,30 @@ def test_util_get_package_path(package):
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
assert model.W.shape == (nF, nO, nP, nI)
tensor = model.ops.allocate((10, nI))
assert model.get_param("W").shape == (nF, nO, nP, nI)
tensor = model.ops.alloc((10, nI))
Y, get_dX = model.begin_update(tensor)
assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
assert model.d_pad.shape == (1, nF, nO, nP)
dY = model.ops.allocate((15, nO, nP))
ids = model.ops.allocate((15, nF))
dY = model.ops.alloc((15, nO, nP))
ids = model.ops.alloc((15, nF))
ids[1, 2] = -1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.0
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 1.0
model.d_pad.fill(0.0)
assert not model.has_grad("pad")
d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
assert d_pad[0, 2, 0, 0] == 1.0
ids.fill(0.0)
dY.fill(0.0)
ids[1, 2] = -1
dY[0] = 0
ids[1, 2] = 0
ids[1, 1] = -1
ids[1, 0] = -1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.0
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 3.0
ids[2, 0] = -1
dY[2] = 5
d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
assert d_pad[0, 0, 0, 0] == 6
assert d_pad[0, 1, 0, 0] == 1
assert d_pad[0, 2, 0, 0] == 0
def test_prefer_gpu():


@ -1,6 +1,6 @@
import pytest
from spacy._ml import Tok2Vec
from spacy.ml.component_models import Tok2Vec
from spacy.vocab import Vocab
from spacy.tokens import Doc
@ -10,7 +10,7 @@ def get_batch(batch_size):
docs = []
start = 0
for size in range(1, batch_size + 1):
    # Make the words numbers, so that they're distnct
    # Make the words numbers, so that they're distinct
    # across the batch, and easy to track.
    numbers = [str(i) for i in range(start, start + size)]
    docs.append(Doc(vocab, words=numbers))
@ -37,6 +37,7 @@ def test_empty_doc():
def test_tok2vec_batch_sizes(batch_size, width, embed_size):
    batch = get_batch(batch_size)
    tok2vec = Tok2Vec(width, embed_size)
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update(batch)
    assert len(vectors) == len(batch)
    for doc_vec, doc in zip(vectors, batch):
@ -56,6 +57,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
def test_tok2vec_configs(tok2vec_config):
    docs = get_batch(3)
    tok2vec = Tok2Vec(**tok2vec_config)
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update(docs)
    assert len(vectors) == len(docs)
    assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"])
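The tok2vec.initialize() calls added above follow the general thinc 8 pattern: a Model must be initialized, optionally with sample data so it can infer missing dimensions, before predict or begin_update can be called. A minimal sketch of that pattern with a plain thinc layer, independent of spaCy:

import numpy
from thinc.layers import Linear

model = Linear(nO=2, nI=4)
X = numpy.zeros((5, 4), dtype="f")
model.initialize(X=X)                # allocate and validate parameters before first use
Y, backprop = model.begin_update(X)  # now the model can be run and backpropped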

View File

@ -1,14 +1,13 @@
import pytest
import numpy
from numpy.testing import assert_allclose
from spacy._ml import cosine
from spacy.vocab import Vocab
from spacy.vectors import Vectors
from spacy.tokenizer import Tokenizer
from spacy.strings import hash_string
from spacy.tokens import Doc
from ..util import add_vecs_to_vocab
from ..util import add_vecs_to_vocab, get_cosine
@pytest.fixture
@ -311,4 +310,4 @@ def test_vocab_prune_vectors():
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
    assert_allclose(similarity, get_cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)

View File

@ -4,7 +4,7 @@
from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, free
from cymem.cymem cimport Pool
from thinc.neural.util import get_array_module
from thinc.util import get_array_module
import numpy

View File

@ -1,7 +1,7 @@
import numpy
import zlib
import srsly
from thinc.neural.ops import NumpyOps
from thinc.backends import NumpyOps
from ..compat import copy_reg
from ..tokens import Doc

View File

@ -11,7 +11,7 @@ import numpy
import numpy.linalg
import struct
import srsly
from thinc.neural.util import get_array_module, copy_array
from thinc.util import get_array_module, copy_array
from .span cimport Span
from .token cimport Token

View File

@ -3,7 +3,7 @@ from libc.math cimport sqrt
import numpy
import numpy.linalg
from thinc.neural.util import get_array_module
from thinc.util import get_array_module
from collections import defaultdict
from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix

View File

@ -7,7 +7,7 @@ cimport numpy as np
np.import_array()
import numpy
from thinc.neural.util import get_array_module
from thinc.util import get_array_module
from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme

View File

@ -2,7 +2,9 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
from libc.stdint cimport uint8_t
ctypedef float weight_t
ctypedef uint64_t hash_t
ctypedef uint64_t class_t
ctypedef char* utf8_t
ctypedef uint64_t attr_t
ctypedef uint64_t flags_t

View File

@ -4,8 +4,14 @@ import importlib.util
import re
from pathlib import Path
import random
from thinc.neural._classes.model import Model
from thinc.neural.ops import NumpyOps
from typing import List
import thinc
import thinc.config
from thinc.backends import NumpyOps, get_current_ops
from thinc.optimizers import Adam
from thinc.util import require_gpu
import functools
import itertools
import numpy.random
@ -13,6 +19,7 @@ import srsly
import catalogue
import sys
try:
    import cupy.random
except ImportError:
@ -20,14 +27,13 @@ except ImportError:
from .symbols import ORTH
from .compat import cupy, CudaStream
from .errors import Errors, Warnings, deprecation_warning
from .errors import Errors, Warnings, deprecation_warning, user_warning
_data_path = Path(__file__).parent / "data"
_PRINT_ENV = False
class registry(object):
class registry(thinc.registry):
    languages = catalogue.create("spacy", "languages", entry_points=True)
    architectures = catalogue.create("spacy", "architectures", entry_points=True)
    lookups = catalogue.create("spacy", "lookups", entry_points=True)
@ -219,6 +225,23 @@ def load_model_from_init_py(init_file, **overrides):
    return load_model_from_path(data_path, meta, **overrides)
def load_from_config(path, create_objects=False):
    """Load a Thinc-formatted config file, optionally filling in objects where
    the config references registry entries. See "Thinc config files" for details.
    path (unicode or Path): Path to the config file
    create_objects (bool): Whether to automatically create objects when the config
        references registry entries. Defaults to False.
    RETURNS (dict): The objects from the config file.
    """
    config = thinc.config.Config().from_disk(path)
    if create_objects:
        return registry.make_from_config(config, validate=True)
    else:
        return config
def get_model_meta(path):
    """Get model meta.json from a directory path and validate its contents.
@ -293,9 +316,10 @@ def get_component_name(component):
def get_cuda_stream(require=False, non_blocking=True):
    ops = get_current_ops()
    if CudaStream is None:
        return None
    elif isinstance(Model.ops, NumpyOps):
    elif isinstance(ops, NumpyOps):
        return None
    else:
        return CudaStream(non_blocking=non_blocking)
@ -310,6 +334,14 @@ def get_async(stream, numpy_array):
    return array
def eg2doc(example):
    """Get a Doc object from an Example (or if it's a Doc, use it directly)"""
    # Put the import here to avoid circular import problems
    from .tokens.doc import Doc
    return example if isinstance(example, Doc) else example.doc
def env_opt(name, default=None):
    if type(default) is float:
        type_convert = float
@ -532,6 +564,8 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len):
"""Create minibatches of a given number of words."""
if isinstance(size, int):
size_ = itertools.repeat(size)
if isinstance(size, List):
size_ = iter(size)
else:
size_ = size
examples = iter(examples)
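The change above lets the batch size be given as a plain int, a list of sizes, or any iterator such as a schedule. Illustrative calls only; the training examples are assumed:

from spacy.util import minibatch_by_words

examples = []  # placeholder; normally an iterable of training examples
batches = minibatch_by_words(examples, size=1000)                # constant word budget per batch
batches = minibatch_by_words(examples, size=[1000, 2000, 4000])  # per-batch sizes taken from a list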
@ -680,17 +714,7 @@ def escape_html(text):
def use_gpu(gpu_id):
    try:
        import cupy.cuda.device
    except ImportError:
        return None
    from thinc.neural.ops import CupyOps
    device = cupy.cuda.device.Device(gpu_id)
    device.use()
    Model.ops = CupyOps()
    Model.Ops = CupyOps
    return device
    return require_gpu(gpu_id)
def fix_random_seed(seed=0):
@ -747,3 +771,33 @@ class DummyTokenizer(object):
    def from_disk(self, _path, **kwargs):
        return self
def link_vectors_to_models(vocab):
    vectors = vocab.vectors
    if vectors.name is None:
        vectors.name = VECTORS_KEY
        if vectors.data.size != 0:
            user_warning(Warnings.W020.format(shape=vectors.data.shape))
    for word in vocab:
        if word.orth in vectors.key2row:
            word.rank = vectors.key2row[word.orth]
        else:
            word.rank = 0
VECTORS_KEY = "spacy_pretrained_vectors"
def create_default_optimizer():
    ops = get_current_ops()
    learn_rate = env_opt("learn_rate", 0.001)
    beta1 = env_opt("optimizer_B1", 0.9)
    beta2 = env_opt("optimizer_B2", 0.999)
    eps = env_opt("optimizer_eps", 1e-8)
    L2 = env_opt("L2_penalty", 1e-6)
    max_grad_norm = env_opt("grad_norm_clip", 1.0)
    optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops)
    optimizer.max_grad_norm = max_grad_norm
    optimizer.device = ops.device_type
    return optimizer
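The hyperparameters above come from env_opt, so they can be tuned through environment variables rather than code changes. A minimal sketch of using the default optimizer explicitly; the nlp object and training examples are assumed to exist elsewhere:

from spacy.util import create_default_optimizer

optimizer = create_default_optimizer()
# Typical use (assuming `nlp` and `train_examples` exist):
# nlp.update(train_examples, sgd=optimizer, losses={})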

View File

@ -5,8 +5,8 @@ from libcpp.set cimport set as cppset
import functools
import numpy
import srsly
from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model
from thinc.util import get_array_module
from thinc.backends import get_current_ops
from .strings cimport StringStore
@ -426,9 +426,9 @@ cdef class Vectors:
self.add(key, row=i)
def load_vectors(path):
xp = Model.ops.xp
ops = get_current_ops()
if path.exists():
self.data = xp.load(str(path))
self.data = ops.xp.load(str(path))
serializers = {
"key2row": load_key2row,

View File

@ -2,7 +2,7 @@
from libc.string cimport memcpy
import srsly
from thinc.neural.util import get_array_module
from thinc.util import get_array_module
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
@ -16,7 +16,7 @@ from .errors import Errors
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs, NORM
from .vectors import Vectors
from ._ml import link_vectors_to_models
from .util import link_vectors_to_models
from .lookups import Lookups
from . import util