diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg
index 6c3a21f4b..f76336d84 100644
--- a/examples/experiments/onto-joint/defaults.cfg
+++ b/examples/experiments/onto-joint/defaults.cfg
@@ -9,6 +9,7 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
+noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
@@ -24,8 +25,8 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
 score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-vectors = null
 discard_oversize = false
+omit_extra_lookups = false

 [training.batch_size]
 @schedules = "compounding.v1"
@@ -52,7 +53,7 @@ learn_rate = 0.001

 [nlp]
 lang = "en"
-vectors = ${training:vectors}
+vectors = null

 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
@@ -62,12 +63,20 @@ factory = "senter"

 [nlp.pipeline.ner]
 factory = "ner"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger]
 factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.senter.model]
 @architectures = "spacy.Tagger.v1"
diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg
index 4f1898d69..40885b6e8 100644
--- a/examples/experiments/onto-joint/pretrain.cfg
+++ b/examples/experiments/onto-joint/pretrain.cfg
@@ -9,6 +9,7 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
+noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
@@ -24,7 +25,6 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
 score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-vectors = null
 discard_oversize = false

 [training.batch_size]
@@ -72,7 +72,7 @@ normalize = true

 [nlp]
 lang = "en"
-vectors = ${training:vectors}
+vectors = null

 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
@@ -82,12 +82,20 @@ factory = "senter"

 [nlp.pipeline.ner]
 factory = "ner"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger]
 factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.senter.model]
 @architectures = "spacy.Tagger.v1"
diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
index acbcc8d41..905b5b4e0 100644
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@@ -6,6 +6,7 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
+noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = 0
@@ -40,6 +41,10 @@ factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger.model]
 @architectures = "spacy.Tagger.v1"
diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
index c305c015c..7383116e7 100644
--- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -6,6 +6,7 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
+noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = -1
@@ -40,6 +41,10 @@ factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger.model]
 @architectures = "spacy.Tagger.v1"
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 65acadb07..c5e679467 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -120,13 +120,22 @@ def load_data(dataset, threshold, limit=0, split=0.8):
     random.shuffle(train_data)
     texts, labels = zip(*train_data)

-    unique_labels = sorted(set([l for label_set in labels for l in label_set]))
+    unique_labels = set()
+    for label_set in labels:
+        if isinstance(label_set, int) or isinstance(label_set, str):
+            unique_labels.add(label_set)
+        elif isinstance(label_set, list) or isinstance(label_set, set):
+            unique_labels.update(label_set)
+    unique_labels = sorted(unique_labels)
     print(f"# of unique_labels: {len(unique_labels)}")

     count_values_train = dict()
     for text, annot_list in train_data:
-        for annot in annot_list:
-            count_values_train[annot] = count_values_train.get(annot, 0) + 1
+        if isinstance(annot_list, int) or isinstance(annot_list, str):
+            count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1
+        else:
+            for annot in annot_list:
+                count_values_train[annot] = count_values_train.get(annot, 0) + 1
     for value, count in sorted(count_values_train.items(), key=lambda item: item[1]):
         if count < threshold:
             unique_labels.remove(value)
@@ -138,7 +147,7 @@ def load_data(dataset, threshold, limit=0, split=0.8):
     else:
         cats = []
         for y in labels:
-            if isinstance(y, str):
+            if isinstance(y, str) or isinstance(y, int):
                 cats.append({str(label): (label == y) for label in unique_labels})
             elif isinstance(y, set):
                 cats.append({str(label): (label in y) for label in 
unique_labels}) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 735e304f9..bae252b1c 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -54,7 +54,8 @@ def evaluate( "NER P": f"{scorer.ents_p:.2f}", "NER R": f"{scorer.ents_r:.2f}", "NER F": f"{scorer.ents_f:.2f}", - "Textcat": f"{scorer.textcat_score:.2f}", + "Textcat AUC": f"{scorer.textcat_auc:.2f}", + "Textcat F": f"{scorer.textcat_f:.2f}", "Sent P": f"{scorer.sent_p:.2f}", "Sent R": f"{scorer.sent_r:.2f}", "Sent F": f"{scorer.sent_f:.2f}", diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index d37426b5a..4f4707b52 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -266,17 +266,15 @@ def create_pretraining_model(nlp, tok2vec): the tok2vec input model. The tok2vec input model needs to be a model that takes a batch of Doc objects (as a list), and returns a list of arrays. Each array in the output needs to have one row per token in the doc. + The actual tok2vec layer is stored as a reference, and only this bit will be + serialized to file and read back in when calling the 'train' command. """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size) ) - # This is annoying, but the parser etc have the flatten step after - # the tok2vec. To load the weights in cleanly, we need to match - # the shape of the models' components exactly. So what we cann - # "tok2vec" has to be the same set of processes as what the components do. - tok2vec = chain(tok2vec, list2array()) - model = chain(tok2vec, output_layer) + model = chain(tok2vec, list2array()) + model = chain(model, output_layer) model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) mlm_model = build_masked_language_model(nlp.vocab, model) mlm_model.set_ref("tok2vec", tok2vec) diff --git a/spacy/cli/train.py b/spacy/cli/train.py deleted file mode 100644 index cbe977cad..000000000 --- a/spacy/cli/train.py +++ /dev/null @@ -1,773 +0,0 @@ -import os -import tqdm -from pathlib import Path -from thinc.api import use_ops -from timeit import default_timer as timer -import shutil -import srsly -from wasabi import msg -import contextlib -import random - -from ..util import create_default_optimizer -from ..util import use_gpu as set_gpu -from ..gold import GoldCorpus -from ..lookups import Lookups -from .. import util -from .. 
import about - - -def train( - # fmt: off - lang: ("Model language", "positional", None, str), - output_path: ("Output directory to store model in", "positional", None, Path), - train_path: ("Location of JSON-formatted training data", "positional", None, Path), - dev_path: ("Location of JSON-formatted development data", "positional", None, Path), - raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, - base_model: ("Name of model to update (optional)", "option", "b", str) = None, - pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner", - vectors: ("Model to load vectors from", "option", "v", str) = None, - replace_components: ("Replace components from base model", "flag", "R", bool) = False, - n_iter: ("Number of iterations", "option", "n", int) = 30, - n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None, - n_examples: ("Number of examples", "option", "ns", int) = 0, - use_gpu: ("Use GPU", "option", "g", int) = -1, - version: ("Model version", "option", "V", str) = "0.0.0", - meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None, - init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, - parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "", - entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "", - noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0, - orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0, - eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "", - gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, - learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False, - textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False, - textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow", - textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None, - tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, - omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, - verbose: ("Display more information for debug", "flag", "VV", bool) = False, - debug: ("Run data diagnostics before training", "flag", "D", bool) = False, - # fmt: on -): - """ - Train or update a spaCy model. Requires data to be formatted in spaCy's - JSON format. To convert data from other formats, use the `spacy convert` - command. 
- """ - util.fix_random_seed() - util.set_env_log(verbose) - - # Make sure all files and paths exists if they are needed - train_path = util.ensure_path(train_path) - dev_path = util.ensure_path(dev_path) - meta_path = util.ensure_path(meta_path) - output_path = util.ensure_path(output_path) - if raw_text is not None: - raw_text = list(srsly.read_jsonl(raw_text)) - if not train_path or not train_path.exists(): - msg.fail("Training data not found", train_path, exits=1) - if not dev_path or not dev_path.exists(): - msg.fail("Development data not found", dev_path, exits=1) - if meta_path is not None and not meta_path.exists(): - msg.fail("Can't find model meta.json", meta_path, exits=1) - meta = srsly.read_json(meta_path) if meta_path else {} - if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: - msg.warn( - "Output directory is not empty", - "This can lead to unintended side effects when saving the model. " - "Please use an empty directory or a different path instead. If " - "the specified output path doesn't exist, the directory will be " - "created for you.", - ) - if not output_path.exists(): - output_path.mkdir() - msg.good(f"Created output directory: {output_path}") - - tag_map = {} - if tag_map_path is not None: - tag_map = srsly.read_json(tag_map_path) - # Take dropout and batch size as generators of values -- dropout - # starts high and decays sharply, to force the optimizer to explore. - # Batch size starts at 1 and grows, so that we make updates quickly - # at the beginning of training. - dropout_rates = util.decaying( - util.env_opt("dropout_from", 0.2), - util.env_opt("dropout_to", 0.2), - util.env_opt("dropout_decay", 0.0), - ) - batch_sizes = util.compounding( - util.env_opt("batch_from", 100.0), - util.env_opt("batch_to", 1000.0), - util.env_opt("batch_compound", 1.001), - ) - - if not eval_beam_widths: - eval_beam_widths = [1] - else: - eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] - if 1 not in eval_beam_widths: - eval_beam_widths.append(1) - eval_beam_widths.sort() - has_beam_widths = eval_beam_widths != [1] - - default_dir = Path(__file__).parent.parent / "pipeline" / "defaults" - - # Set up the base model and pipeline. If a base model is specified, load - # the model and make sure the pipeline matches the pipeline setting. If - # training starts from a blank model, intitalize the language class. - pipeline = [p.strip() for p in pipeline.split(",")] - msg.text(f"Training pipeline: {pipeline}") - disabled_pipes = None - pipes_added = False - if use_gpu >= 0: - activated_gpu = None - try: - activated_gpu = set_gpu(use_gpu) - except Exception as e: - msg.warn(f"Exception: {e}") - if activated_gpu is not None: - msg.text(f"Using GPU: {use_gpu}") - else: - msg.warn(f"Unable to activate GPU: {use_gpu}") - msg.text("Using CPU only") - use_gpu = -1 - if base_model: - msg.text(f"Starting with base model '{base_model}'") - nlp = util.load_model(base_model) - if nlp.lang != lang: - msg.fail( - f"Model language ('{nlp.lang}') doesn't match language " - f"specified as `lang` argument ('{lang}') ", - exits=1, - ) - if vectors: - msg.text(f"Loading vectors from model '{vectors}'") - _load_vectors(nlp, vectors) - - nlp.select_pipes(disable=[p for p in nlp.pipe_names if p not in pipeline]) - for pipe in pipeline: - # first, create the model. 
- # Bit of a hack after the refactor to get the vectors into a default config - # use train-from-config instead :-) - if pipe == "parser": - config_loc = default_dir / "parser_defaults.cfg" - elif pipe == "tagger": - config_loc = default_dir / "tagger_defaults.cfg" - elif pipe == "ner": - config_loc = default_dir / "ner_defaults.cfg" - elif pipe == "textcat": - config_loc = default_dir / "textcat_defaults.cfg" - elif pipe == "senter": - config_loc = default_dir / "senter_defaults.cfg" - else: - raise ValueError(f"Component {pipe} currently not supported.") - pipe_cfg = util.load_config(config_loc, create_objects=False) - if vectors: - pretrained_config = { - "@architectures": "spacy.VocabVectors.v1", - "name": vectors, - } - pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config - - if pipe == "parser": - pipe_cfg["learn_tokens"] = learn_tokens - elif pipe == "textcat": - pipe_cfg["exclusive_classes"] = not textcat_multilabel - pipe_cfg["architecture"] = textcat_arch - pipe_cfg["positive_label"] = textcat_positive_label - - if pipe not in nlp.pipe_names: - msg.text(f"Adding component to base model '{pipe}'") - nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) - pipes_added = True - elif replace_components: - msg.text(f"Replacing component from base model '{pipe}'") - nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg)) - pipes_added = True - else: - if pipe == "textcat": - textcat_cfg = nlp.get_pipe("textcat").cfg - base_cfg = { - "exclusive_classes": textcat_cfg["exclusive_classes"], - "architecture": textcat_cfg["architecture"], - "positive_label": textcat_cfg["positive_label"], - } - if base_cfg != pipe_cfg: - msg.fail( - f"The base textcat model configuration does" - f"not match the provided training options. " - f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}", - exits=1, - ) - msg.text(f"Extending component from base model '{pipe}'") - disabled_pipes = nlp.select_pipes( - disable=[p for p in nlp.pipe_names if p not in pipeline] - ) - else: - msg.text(f"Starting with blank model '{lang}'") - lang_cls = util.get_lang_class(lang) - nlp = lang_cls() - - if vectors: - msg.text(f"Loading vectors from model '{vectors}'") - _load_vectors(nlp, vectors) - - for pipe in pipeline: - # first, create the model. 
- # Bit of a hack after the refactor to get the vectors into a default config - # use train-from-config instead :-) - if pipe == "parser": - config_loc = default_dir / "parser_defaults.cfg" - elif pipe == "tagger": - config_loc = default_dir / "tagger_defaults.cfg" - elif pipe == "morphologizer": - config_loc = default_dir / "morphologizer_defaults.cfg" - elif pipe == "ner": - config_loc = default_dir / "ner_defaults.cfg" - elif pipe == "textcat": - config_loc = default_dir / "textcat_defaults.cfg" - elif pipe == "senter": - config_loc = default_dir / "senter_defaults.cfg" - else: - raise ValueError(f"Component {pipe} currently not supported.") - pipe_cfg = util.load_config(config_loc, create_objects=False) - if vectors: - pretrained_config = { - "@architectures": "spacy.VocabVectors.v1", - "name": vectors, - } - pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config - - if pipe == "parser": - pipe_cfg["learn_tokens"] = learn_tokens - elif pipe == "textcat": - pipe_cfg["exclusive_classes"] = not textcat_multilabel - pipe_cfg["architecture"] = textcat_arch - pipe_cfg["positive_label"] = textcat_positive_label - - pipe = nlp.create_pipe(pipe, config=pipe_cfg) - nlp.add_pipe(pipe) - - # Update tag map with provided mapping - nlp.vocab.morphology.tag_map.update(tag_map) - - # Create empty extra lexeme tables so the data from spacy-lookups-data - # isn't loaded if these features are accessed - if omit_extra_lookups: - nlp.vocab.lookups_extra = Lookups() - nlp.vocab.lookups_extra.add_table("lexeme_cluster") - nlp.vocab.lookups_extra.add_table("lexeme_prob") - nlp.vocab.lookups_extra.add_table("lexeme_settings") - - if vectors: - msg.text("Loading vector from model '{}'".format(vectors)) - _load_vectors(nlp, vectors) - - # Multitask objectives - multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] - for pipe_name, multitasks in multitask_options: - if multitasks: - if pipe_name not in pipeline: - msg.fail( - f"Can't use multitask objective without '{pipe_name}' in " - f"the pipeline" - ) - pipe = nlp.get_pipe(pipe_name) - for objective in multitasks.split(","): - pipe.add_multitask_objective(objective) - - # Prepare training corpus - msg.text(f"Counting training words (limit={n_examples})") - corpus = GoldCorpus(train_path, dev_path, limit=n_examples) - n_train_words = corpus.count_train() - - if base_model and not pipes_added: - # Start with an existing model, use default optimizer - optimizer = create_default_optimizer() - else: - # Start with a blank model, call begin_training - cfg = {"device": use_gpu} - optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg) - nlp._optimizer = None - - # Load in pretrained weights (TODO: this may be broken in the config rewrite) - if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text(f"Loaded pretrained tok2vec for: {components}") - - # Verify textcat config - if "textcat" in pipeline: - textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) - if textcat_positive_label and textcat_positive_label not in textcat_labels: - msg.fail( - f"The textcat_positive_label (tpl) '{textcat_positive_label}' " - f"does not match any label in the training data.", - exits=1, - ) - if textcat_positive_label and len(textcat_labels) != 2: - msg.fail( - "A textcat_positive_label (tpl) '{textcat_positive_label}' was " - "provided for training data that does not appear to be a " - "binary classification problem with two labels.", - exits=1, - ) - train_data = 
corpus.train_data( - nlp, - noise_level=noise_level, - gold_preproc=gold_preproc, - max_length=0, - ignore_misaligned=True, - ) - train_labels = set() - if textcat_multilabel: - multilabel_found = False - for ex in train_data: - train_labels.update(ex.gold.cats.keys()) - if list(ex.gold.cats.values()).count(1.0) != 1: - multilabel_found = True - if not multilabel_found and not base_model: - msg.warn( - "The textcat training instances look like they have " - "mutually-exclusive classes. Remove the flag " - "'--textcat-multilabel' to train a classifier with " - "mutually-exclusive classes." - ) - if not textcat_multilabel: - for ex in train_data: - train_labels.update(ex.gold.cats.keys()) - if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model: - msg.warn( - "Some textcat training instances do not have exactly " - "one positive label. Modifying training options to " - "include the flag '--textcat-multilabel' for classes " - "that are not mutually exclusive." - ) - nlp.get_pipe("textcat").cfg["exclusive_classes"] = False - textcat_multilabel = True - break - if base_model and set(textcat_labels) != train_labels: - msg.fail( - f"Cannot extend textcat model using data with different " - f"labels. Base model labels: {textcat_labels}, training data " - f"labels: {list(train_labels)}", - exits=1, - ) - if textcat_multilabel: - msg.text( - f"Textcat evaluation score: ROC AUC score macro-averaged across " - f"the labels '{', '.join(textcat_labels)}'" - ) - elif textcat_positive_label and len(textcat_labels) == 2: - msg.text( - f"Textcat evaluation score: F1-score for the " - f"label '{textcat_positive_label}'" - ) - elif len(textcat_labels) > 1: - if len(textcat_labels) == 2: - msg.warn( - "If the textcat component is a binary classifier with " - "exclusive classes, provide '--textcat-positive-label' for " - "an evaluation on the positive class." - ) - msg.text( - f"Textcat evaluation score: F1-score macro-averaged across " - f"the labels '{', '.join(textcat_labels)}'" - ) - else: - msg.fail( - "Unsupported textcat configuration. Use `spacy debug-data` " - "for more information." - ) - - # fmt: off - row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths) - row_widths = [len(w) for w in row_head] - row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2} - # fmt: on - print("") - msg.row(row_head, **row_settings) - msg.row(["-" * width for width in row_settings["widths"]], **row_settings) - try: - iter_since_best = 0 - best_score = 0.0 - for i in range(n_iter): - train_data = corpus.train_dataset( - nlp, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - gold_preproc=gold_preproc, - max_length=0, - ignore_misaligned=True, - ) - if raw_text: - random.shuffle(raw_text) - raw_batches = util.minibatch( - (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 - ) - words_seen = 0 - with tqdm.tqdm(total=n_train_words, leave=False) as pbar: - losses = {} - for batch in util.minibatch_by_words(train_data, size=batch_sizes): - if not batch: - continue - try: - nlp.update( - batch, - sgd=optimizer, - drop=next(dropout_rates), - losses=losses, - ) - except ValueError as e: - err = "Error during training" - if init_tok2vec: - err += " Did you provide the same parameters during 'train' as during 'pretrain'?" - msg.fail(err, f"Original error message: {e}", exits=1) - if raw_text: - # If raw text is available, perform 'rehearsal' updates, - # which use unlabelled data to reduce overfitting. 
- raw_batch = list(next(raw_batches)) - nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) - docs = [ex.doc for ex in batch] - if not int(os.environ.get("LOG_FRIENDLY", 0)): - pbar.update(sum(len(doc) for doc in docs)) - words_seen += sum(len(doc) for doc in docs) - with nlp.use_params(optimizer.averages): - util.set_env_log(False) - epoch_model_path = output_path / f"model{i}" - nlp.to_disk(epoch_model_path) - nlp_loaded = util.load_model_from_path(epoch_model_path) - for beam_width in eval_beam_widths: - for name, component in nlp_loaded.pipeline: - if hasattr(component, "cfg"): - component.cfg["beam_width"] = beam_width - dev_dataset = list( - corpus.dev_dataset( - nlp_loaded, - gold_preproc=gold_preproc, - ignore_misaligned=True, - ) - ) - nwords = sum(len(ex.doc) for ex in dev_dataset) - start_time = timer() - scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) - end_time = timer() - if use_gpu < 0: - gpu_wps = None - cpu_wps = nwords / (end_time - start_time) - else: - gpu_wps = nwords / (end_time - start_time) - # Evaluate on CPU in the first iteration only (for - # timing) when GPU is enabled - if i == 0: - with use_ops("numpy"): - nlp_loaded = util.load_model_from_path(epoch_model_path) - for name, component in nlp_loaded.pipeline: - if hasattr(component, "cfg"): - component.cfg["beam_width"] = beam_width - dev_dataset = list( - corpus.dev_dataset( - nlp_loaded, - gold_preproc=gold_preproc, - ignore_misaligned=True, - ) - ) - start_time = timer() - scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) - end_time = timer() - cpu_wps = nwords / (end_time - start_time) - acc_loc = output_path / f"model{i}" / "accuracy.json" - srsly.write_json(acc_loc, scorer.scores) - - # Update model meta.json - meta["lang"] = nlp.lang - meta["pipeline"] = nlp.pipe_names - if beam_width == 1: - meta["speed"] = { - "nwords": nwords, - "cpu": cpu_wps, - "gpu": gpu_wps, - } - meta.setdefault("accuracy", {}) - for component in nlp.pipe_names: - for metric in _get_metrics(component): - meta["accuracy"][metric] = scorer.scores[metric] - else: - meta.setdefault("beam_accuracy", {}) - meta.setdefault("beam_speed", {}) - for component in nlp.pipe_names: - for metric in _get_metrics(component): - meta["beam_accuracy"][metric] = scorer.scores[metric] - meta["beam_speed"][beam_width] = { - "nwords": nwords, - "cpu": cpu_wps, - "gpu": gpu_wps, - } - meta["vectors"] = { - "width": nlp.vocab.vectors_length, - "vectors": len(nlp.vocab.vectors), - "keys": nlp.vocab.vectors.n_keys, - "name": nlp.vocab.vectors.name, - } - meta.setdefault("name", f"model{i}") - meta.setdefault("version", version) - meta["labels"] = nlp.meta["labels"] - meta_loc = output_path / f"model{i}" / "meta.json" - srsly.write_json(meta_loc, meta) - util.set_env_log(verbose) - - progress = _get_progress( - i, - losses, - scorer.scores, - output_stats, - beam_width=beam_width if has_beam_widths else None, - cpu_wps=cpu_wps, - gpu_wps=gpu_wps, - ) - if i == 0 and "textcat" in pipeline: - textcats_per_cat = scorer.scores.get("textcats_per_cat", {}) - for cat, cat_score in textcats_per_cat.items(): - if cat_score.get("roc_auc_score", 0) < 0: - msg.warn( - f"Textcat ROC AUC score is undefined due to " - f"only one value in label '{cat}'." 
- ) - msg.row(progress, **row_settings) - # Early stopping - if n_early_stopping is not None: - current_score = _score_for_model(meta) - if current_score < best_score: - iter_since_best += 1 - else: - iter_since_best = 0 - best_score = current_score - if iter_since_best >= n_early_stopping: - msg.text( - f"Early stopping, best iteration is: {i - iter_since_best}" - ) - msg.text( - f"Best score = {best_score}; Final iteration score = {current_score}" - ) - break - except Exception as e: - msg.warn(f"Aborting and saving final best model. Encountered exception: {e}", exits=1) - finally: - best_pipes = nlp.pipe_names - if disabled_pipes: - disabled_pipes.restore() - with nlp.use_params(optimizer.averages): - final_model_path = output_path / "model-final" - nlp.to_disk(final_model_path) - meta_loc = output_path / "model-final" / "meta.json" - final_meta = srsly.read_json(meta_loc) - final_meta.setdefault("accuracy", {}) - final_meta["accuracy"].update(meta.get("accuracy", {})) - final_meta.setdefault("speed", {}) - final_meta["speed"].setdefault("cpu", None) - final_meta["speed"].setdefault("gpu", None) - meta.setdefault("speed", {}) - meta["speed"].setdefault("cpu", None) - meta["speed"].setdefault("gpu", None) - # combine cpu and gpu speeds with the base model speeds - if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: - speed = _get_total_speed( - [final_meta["speed"]["cpu"], meta["speed"]["cpu"]] - ) - final_meta["speed"]["cpu"] = speed - if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: - speed = _get_total_speed( - [final_meta["speed"]["gpu"], meta["speed"]["gpu"]] - ) - final_meta["speed"]["gpu"] = speed - # if there were no speeds to update, overwrite with meta - if ( - final_meta["speed"]["cpu"] is None - and final_meta["speed"]["gpu"] is None - ): - final_meta["speed"].update(meta["speed"]) - # note: beam speeds are not combined with the base model - if has_beam_widths: - final_meta.setdefault("beam_accuracy", {}) - final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {})) - final_meta.setdefault("beam_speed", {}) - final_meta["beam_speed"].update(meta.get("beam_speed", {})) - srsly.write_json(meta_loc, final_meta) - msg.good("Saved model to output directory", final_model_path) - with msg.loading("Creating best model..."): - best_model_path = _collate_best_model(final_meta, output_path, best_pipes) - msg.good("Created best model", best_model_path) - - -def _score_for_model(meta): - """ Returns mean score between tasks in pipeline that can be used for early stopping. """ - mean_acc = list() - pipes = meta["pipeline"] - acc = meta["accuracy"] - if "tagger" in pipes: - mean_acc.append(acc["tags_acc"]) - if "morphologizer" in pipes: - mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2) - if "parser" in pipes: - mean_acc.append((acc["uas"] + acc["las"]) / 2) - if "ner" in pipes: - mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3) - if "textcat" in pipes: - mean_acc.append(acc["textcat_score"]) - if "senter" in pipes: - mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3) - return sum(mean_acc) / len(mean_acc) - - -@contextlib.contextmanager -def _create_progress_bar(total): - if int(os.environ.get("LOG_FRIENDLY", 0)): - yield - else: - pbar = tqdm.tqdm(total=total, leave=False) - yield pbar - - -def _load_vectors(nlp, vectors): - util.load_model(vectors, vocab=nlp.vocab) - - -def _load_pretrained_tok2vec(nlp, loc): - """Load pretrained weights for the 'token-to-vector' part of the component - models, which is typically a CNN. 
See 'spacy pretrain'. Experimental. - """ - with loc.open("rb") as file_: - weights_data = file_.read() - loaded = [] - for name, component in nlp.pipeline: - if hasattr(component, "model") and component.model.has_ref("tok2vec"): - component.get_ref("tok2vec").from_bytes(weights_data) - loaded.append(name) - return loaded - - -def _collate_best_model(meta, output_path, components): - bests = {} - meta.setdefault("accuracy", {}) - for component in components: - bests[component] = _find_best(output_path, component) - best_dest = output_path / "model-best" - shutil.copytree(str(output_path / "model-final"), str(best_dest)) - for component, best_component_src in bests.items(): - shutil.rmtree(str(best_dest / component)) - shutil.copytree(str(best_component_src / component), str(best_dest / component)) - accs = srsly.read_json(best_component_src / "accuracy.json") - for metric in _get_metrics(component): - meta["accuracy"][metric] = accs[metric] - srsly.write_json(best_dest / "meta.json", meta) - return best_dest - - -def _find_best(experiment_dir, component): - accuracies = [] - for epoch_model in experiment_dir.iterdir(): - if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": - accs = srsly.read_json(epoch_model / "accuracy.json") - scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)] - # remove per_type dicts from score list for max() comparison - scores = [score for score in scores if isinstance(score, float)] - accuracies.append((scores, epoch_model)) - if accuracies: - return max(accuracies)[1] - else: - return None - - -def _get_metrics(component): - if component == "parser": - return ("las", "uas", "las_per_type", "sent_f", "token_acc") - elif component == "tagger": - return ("tags_acc", "token_acc") - elif component == "morphologizer": - return ("morphs_acc", "pos_acc", "token_acc") - elif component == "ner": - return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc") - elif component == "senter": - return ("sent_f", "sent_p", "sent_r", "token_acc") - elif component == "textcat": - return ("textcat_score", "token_acc") - return ("token_acc",) - - -def _configure_training_output(pipeline, use_gpu, has_beam_widths): - row_head = ["Itn"] - output_stats = [] - for pipe in pipeline: - if pipe == "tagger": - row_head.extend(["Tag Loss ", " Tag % "]) - output_stats.extend(["tag_loss", "tags_acc"]) - elif pipe == "morphologizer" or pipe == "morphologizertagger": - row_head.extend(["Morph Loss ", " Morph % ", " POS % "]) - output_stats.extend(["morph_loss", "morphs_acc", "pos_acc"]) - elif pipe == "parser": - row_head.extend( - ["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"] - ) - output_stats.extend( - ["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"] - ) - elif pipe == "ner": - row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "]) - output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"]) - elif pipe == "textcat": - row_head.extend(["Textcat Loss", "Textcat"]) - output_stats.extend(["textcat_loss", "textcat_score"]) - elif pipe == "senter": - row_head.extend(["Senter Loss", "Sent P", "Sent R", "Sent F"]) - output_stats.extend(["senter_loss", "sent_p", "sent_r", "sent_f"]) - row_head.extend(["Token %", "CPU WPS"]) - output_stats.extend(["token_acc", "cpu_wps"]) - - if use_gpu >= 0: - row_head.extend(["GPU WPS"]) - output_stats.extend(["gpu_wps"]) - - if has_beam_widths: - row_head.insert(1, "Beam W.") - # remove duplicates - row_head_dict = {k: 1 for k in row_head} - output_stats_dict = {k: 1 for k in output_stats} - 
return row_head_dict.keys(), output_stats_dict.keys() - - -def _get_progress( - itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0 -): - scores = {} - for stat in output_stats: - scores[stat] = 0.0 - scores["dep_loss"] = losses.get("parser", 0.0) - scores["ner_loss"] = losses.get("ner", 0.0) - scores["tag_loss"] = losses.get("tagger", 0.0) - scores["morph_loss"] = losses.get("morphologizer", 0.0) - scores["textcat_loss"] = losses.get("textcat", 0.0) - scores["senter_loss"] = losses.get("senter", 0.0) - scores["cpu_wps"] = cpu_wps - scores["gpu_wps"] = gpu_wps or 0.0 - scores.update(dev_scores) - formatted_scores = [] - for stat in output_stats: - format_spec = "{:.3f}" - if stat.endswith("_wps"): - format_spec = "{:.0f}" - formatted_scores.append(format_spec.format(scores[stat])) - result = [itn + 1] - result.extend(formatted_scores) - if beam_width is not None: - result.insert(1, beam_width) - return result - - -def _get_total_speed(speeds): - seconds_per_word = 0.0 - for words_per_second in speeds: - if words_per_second is None: - return None - seconds_per_word += 1.0 / words_per_second - return 1.0 / seconds_per_word diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index a6d0a0abc..ec099b294 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,5 +1,7 @@ from typing import Optional, Dict, List, Union, Sequence from timeit import default_timer as timer + +import srsly from pydantic import BaseModel, FilePath import plac import tqdm @@ -11,9 +13,10 @@ from thinc.api import Model, use_pytorch_for_gpu_memory import random from ..gold import GoldCorpus +from ..lookups import Lookups from .. import util from ..errors import Errors -from ..ml import models # don't remove - required to load the built-in architectures +from ..ml import models # don't remove - required to load the built-in architectures registry = util.registry @@ -23,7 +26,6 @@ patience = 10 eval_frequency = 10 dropout = 0.2 init_tok2vec = null -vectors = null max_epochs = 100 orth_variant_level = 0.0 gold_preproc = false @@ -47,7 +49,7 @@ beta2 = 0.999 [nlp] lang = "en" -vectors = ${training:vectors} +vectors = null [nlp.pipeline.tok2vec] factory = "tok2vec" @@ -93,7 +95,6 @@ class ConfigSchema(BaseModel): eval_frequency: int = 100 dropout: float = 0.2 init_tok2vec: Optional[FilePath] = None - vectors: Optional[str] = None max_epochs: int = 100 orth_variant_level: float = 0.0 gold_preproc: bool = False @@ -119,9 +120,14 @@ class ConfigSchema(BaseModel): dev_path=("Location of JSON-formatted development data", "positional", None, Path), config_path=("Path to config file", "positional", None, Path), output_path=("Output directory to store model in", "option", "o", Path), - meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), + init_tok2vec=( + "Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental.", "option", "t2v", + Path), raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), + verbose=("Display more information for debugging purposes", "flag", "VV", bool), use_gpu=("Use GPU", "option", "g", int), + tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), + omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), # fmt: on ) def train_cli( @@ -129,30 +135,53 @@ def train_cli( dev_path, config_path, output_path=None, - meta_path=None, + init_tok2vec=None, raw_text=None, - debug=False, verbose=False, use_gpu=-1, + tag_map_path=None, + omit_extra_lookups=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. To convert data from other formats, use the `spacy convert` command. """ + util.set_env_log(verbose) + + # Make sure all files and paths exists if they are needed if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) - if meta_path is not None and not meta_path.exists(): - msg.fail("Can't find model meta.json", meta_path, exits=1) if output_path is not None and not output_path.exists(): output_path.mkdir() + msg.good(f"Created output directory: {output_path}") + elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: + msg.warn( + "Output directory is not empty.", + "This can lead to unintended side effects when saving the model. " + "Please use an empty directory or a different path instead. If " + "the specified output path doesn't exist, the directory will be " + "created for you.", + ) + if raw_text is not None: + raw_text = list(srsly.read_jsonl(raw_text)) + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) + + weights_data = None + if init_tok2vec is not None: + if not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + with init_tok2vec.open("rb") as file_: + weights_data = file_.read() if use_gpu >= 0: - msg.info("Using GPU") + msg.info("Using GPU: {use_gpu}") util.use_gpu(use_gpu) else: msg.info("Using CPU") @@ -161,13 +190,21 @@ def train_cli( config_path, {"train": train_path, "dev": dev_path}, output_path=output_path, - meta_path=meta_path, raw_text=raw_text, + tag_map=tag_map, + weights_data=weights_data, + omit_extra_lookups=omit_extra_lookups, ) def train( - config_path, data_paths, raw_text=None, meta_path=None, output_path=None, + config_path, + data_paths, + raw_text=None, + output_path=None, + tag_map=None, + weights_data=None, + omit_extra_lookups=False, ): msg.info(f"Loading config from: {config_path}") # Read the config first without creating objects, to get to the original nlp_config @@ -177,15 +214,104 @@ def train( use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) + training = config["training"] msg.info("Creating nlp from config") nlp = util.load_model_from_config(nlp_config) - training = config["training"] optimizer = training["optimizer"] limit = training["limit"] msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) - msg.info("Initializing the nlp pipeline") - nlp.begin_training(lambda: corpus.train_examples) + + # verify textcat config + if "textcat" in 
nlp_config["pipeline"]: + textcat_labels = set(nlp.get_pipe("textcat").labels) + textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] + + # check whether the setting 'exclusive_classes' corresponds to the provided training data + if textcat_multilabel: + multilabel_found = False + for ex in corpus.train_examples: + cats = ex.doc_annotation.cats + textcat_labels.update(cats.keys()) + if list(cats.values()).count(1.0) != 1: + multilabel_found = True + if not multilabel_found: + msg.warn( + "The textcat training instances look like they have " + "mutually exclusive classes. Set 'exclusive_classes' " + "to 'true' in the config to train a classifier with " + "mutually exclusive classes more accurately." + ) + else: + for ex in corpus.train_examples: + cats = ex.doc_annotation.cats + textcat_labels.update(cats.keys()) + if list(cats.values()).count(1.0) != 1: + msg.fail( + "Some textcat training instances do not have exactly " + "one positive label. Set 'exclusive_classes' " + "to 'false' in the config to train a classifier with classes " + "that are not mutually exclusive." + ) + msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") + nlp.get_pipe("textcat").labels = tuple(textcat_labels) + + # if 'positive_label' is provided: double check whether it's in the data and the task is binary + if nlp_config["pipeline"]["textcat"].get("positive_label", None): + textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) + pos_label = nlp_config["pipeline"]["textcat"]["positive_label"] + if pos_label not in textcat_labels: + msg.fail( + f"The textcat's 'positive_label' config setting '{pos_label}' " + f"does not match any label in the training data.", + exits=1, + ) + if len(textcat_labels) != 2: + msg.fail( + f"A textcat 'positive_label' '{pos_label}' was " + f"provided for training data that does not appear to be a " + f"binary classification problem with two labels.", + exits=1, + ) + + if training.get("resume", False): + msg.info("Resuming training") + nlp.resume_training() + else: + msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") + nlp.begin_training( + lambda: corpus.train_examples + ) + + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) + + # Create empty extra lexeme tables so the data from spacy-lookups-data + # isn't loaded if these features are accessed + if omit_extra_lookups: + nlp.vocab.lookups_extra = Lookups() + nlp.vocab.lookups_extra.add_table("lexeme_cluster") + nlp.vocab.lookups_extra.add_table("lexeme_prob") + nlp.vocab.lookups_extra.add_table("lexeme_settings") + + # Load a pretrained tok2vec model - cf. 
CLI command 'pretrain' + if weights_data is not None: + tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None) + if tok2vec_path is None: + msg.fail( + f"To use a pretrained tok2vec model, the config needs to specify which " + f"tok2vec layer to load in the setting [pretraining.tok2vec_model].", + exits=1, + ) + tok2vec = config + for subpath in tok2vec_path.split("."): + tok2vec = tok2vec.get(subpath) + if not tok2vec: + msg.fail( + f"Could not locate the tok2vec model at {tok2vec_path}.", + exits=1, + ) + tok2vec.from_bytes(weights_data) train_batches = create_train_batches(nlp, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) @@ -202,6 +328,7 @@ def train( patience=training.get("patience", 0), max_steps=training.get("max_steps", 0), eval_frequency=training["eval_frequency"], + raw_text=raw_text, ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") @@ -215,7 +342,8 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - nlp.to_disk(output_path) + update_meta(training, nlp, info) + nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) # Clean up the objects to faciliate garbage collection. for eg in batch: @@ -223,6 +351,12 @@ def train( eg.goldparse = None eg.doc_annotation = None eg.token_annotation = None + except Exception as e: + msg.warn( + f"Aborting and saving the final best model. " + f"Encountered exception: {str(e)}", + exits=1, + ) finally: if output_path is not None: final_model_path = output_path / "model-final" @@ -231,24 +365,30 @@ def train( nlp.to_disk(final_model_path) else: nlp.to_disk(final_model_path) - msg.good("Saved model to output directory", final_model_path) + msg.good(f"Saved model to output directory {final_model_path}") def create_train_batches(nlp, corpus, cfg): epochs_todo = cfg.get("max_epochs", 0) while True: - train_examples = list(corpus.train_dataset( - nlp, - noise_level=0.0, - orth_variant_level=cfg["orth_variant_level"], - gold_preproc=cfg["gold_preproc"], - max_length=cfg["max_length"], - ignore_misaligned=True, - )) + train_examples = list( + corpus.train_dataset( + nlp, + noise_level=cfg["noise_level"], + orth_variant_level=cfg["orth_variant_level"], + gold_preproc=cfg["gold_preproc"], + max_length=cfg["max_length"], + ignore_misaligned=True, + ) + ) if len(train_examples) == 0: raise ValueError(Errors.E988) random.shuffle(train_examples) - batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"]) + batches = util.minibatch_by_words( + train_examples, + size=cfg["batch_size"], + discard_oversize=cfg["discard_oversize"], + ) # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop try: first = next(batches) @@ -273,7 +413,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): ) n_words = sum(len(ex.doc) for ex in dev_examples) start_time = timer() - + if optimizer.averages: with nlp.use_params(optimizer.averages): scorer = nlp.evaluate(dev_examples, batch_size=32) @@ -284,7 +424,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): scores = scorer.scores # Calculate a weighted sum based on score_weights for the main score weights = cfg["score_weights"] - weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) + try: + weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) + except KeyError as e: + raise 
KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys()))) + scores["speed"] = wps return weighted_score, scores @@ -292,8 +436,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): def train_while_improving( - nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency, - accumulate_gradient=1, patience=0, max_steps=0 + nlp, + optimizer, + train_data, + evaluate, + *, + dropout, + eval_frequency, + accumulate_gradient=1, + patience=0, + max_steps=0, + raw_text=None, ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -341,11 +494,22 @@ def train_while_improving( losses = {} to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")] + if raw_text: + random.shuffle(raw_text) + raw_batches = util.minibatch( + (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 + ) + for step, batch in enumerate(train_data): dropout = next(dropouts) with nlp.select_pipes(enable=to_enable): for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) + if raw_text: + # If raw text is available, perform 'rehearsal' updates, + # which use unlabelled data to reduce overfitting. + raw_batch = list(next(raw_batches)) + nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) for name, proc in nlp.pipeline: if hasattr(proc, "model"): proc.model.finish_update(optimizer) @@ -386,7 +550,7 @@ def subdivide_batch(batch, accumulate_gradient): if subbatch: yield subbatch start += len(subbatch) - subbatch = batch[start : ] + subbatch = batch[start:] if subbatch: yield subbatch @@ -405,14 +569,34 @@ def setup_printer(training, nlp): msg.row(["-" * width for width in table_widths]) def print_row(info): - losses = [ - "{0:.2f}".format(float(info["losses"].get(pipe_name, 0.0))) - for pipe_name in nlp.pipe_names - ] - scores = [ - "{0:.2f}".format(float(info["other_scores"].get(col, 0.0))) for col in score_cols - ] - data = [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] + try: + losses = [ + "{0:.2f}".format(float(info["losses"][pipe_name])) + for pipe_name in nlp.pipe_names + ] + except KeyError as e: + raise KeyError( + Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) + + try: + scores = [ + "{0:.2f}".format(float(info["other_scores"][col])) + for col in score_cols + ] + except KeyError as e: + raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) + data = ( + [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] + ) msg.row(data, widths=table_widths, aligns=table_aligns) return print_row + + +def update_meta(training, nlp, info): + score_cols = training["scores"] + nlp.meta["performance"] = {} + for metric in score_cols: + nlp.meta["performance"][metric] = info["other_scores"][metric] + for pipe_name in nlp.pipe_names: + nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] diff --git a/spacy/errors.py b/spacy/errors.py index 94a0218a7..d6fdd1b43 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -580,7 +580,14 @@ class Errors(object): "table, which contains {n_rows} vectors.") # TODO: fix numbering after merging develop into master - + E983 = ("Invalid key for '{dict_name}': {key}. 
Available keys: " + "{keys}") + E984 = ("Could not parse the {input} - double check the data is written " + "in the correct format as expected by spaCy.") + E985 = ("The pipeline component '{component}' is already available in the base " + "model. The settings in the component block in the config file are " + "being ignored. If you want to replace this component instead, set " + "'replace' to True in the training configuration.") E986 = ("Could not create any training batches: check your input. " "Perhaps discard_oversize should be set to False ?") E987 = ("The text of an example training instance is either a Doc or " diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1e58f0635..19b135193 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -229,6 +229,10 @@ class GoldCorpus(object): if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)): raise ValueError(Errors.E987.format(type=type(doc))) examples.append(Example.from_dict(ex_dict, doc=doc)) + else: + raise ValueError(Errors.E984.format(input="JSONL format")) + else: + raise ValueError(Errors.E984.format(input="JSONL format")) elif file_name.endswith("msg"): text, ex_dict = srsly.read_msgpack(loc) @@ -550,14 +554,22 @@ def json_to_examples(doc): def read_json_file(loc, docs_filter=None, limit=None): loc = util.ensure_path(loc) if loc.is_dir(): + parsed = False for filename in loc.iterdir(): + parsed = True yield from read_json_file(loc / filename, limit=limit) + if not parsed: + raise ValueError(Errors.E984.format(input="JSON directory")) else: + parsed = False for doc in _json_iterate(loc): if docs_filter is not None and not docs_filter(doc): continue for json_data in json_to_examples(doc): + parsed = True yield json_data + if not parsed: + raise ValueError(Errors.E984.format(input="JSON file")) def _json_iterate(loc): diff --git a/spacy/language.py b/spacy/language.py index 6341dc858..97bdd698c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -319,14 +319,14 @@ class Language(object): # transform the model's config to an actual Model factory_cfg = dict(config) - # check whether we have a proper model config, or load a default one + # check whether we have a proper model config, ignore if the type is wrong if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict): warnings.warn( Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name) ) # refer to the model configuration in the cfg settings for this component - if "model" in factory_cfg: + elif "model" in factory_cfg: self.config[name] = {"model": factory_cfg["model"]} # create all objects in the config @@ -1086,6 +1086,7 @@ class component(object): requires=tuple(), retokenizes=False, default_model=lambda: None, + default_config=None, ): """Decorate a pipeline component. 
@@ -1099,6 +1100,7 @@ class component(object): self.requires = validate_attrs(requires) self.retokenizes = retokenizes self.default_model = default_model + self.default_config = default_config def __call__(self, *args, **kwargs): obj = args[0] @@ -1113,9 +1115,10 @@ class component(object): def factory(nlp, model, **cfg): if model is None: model = self.default_model() - warnings.warn(Warnings.W098.format(name=self.name)) - if model is None: - warnings.warn(Warnings.W097.format(name=self.name)) + if self.default_config: + for key, value in self.default_config.items(): + if key not in cfg: + cfg[key] = value if hasattr(obj, "from_nlp"): return obj.from_nlp(nlp, model, **cfg) elif isinstance(obj, type): diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 8000d1aff..4a360a9e6 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -3,26 +3,31 @@ import numpy from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model -def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96): +def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): + softmax = Softmax(nO=nO, nI=token_vector_width * 2) model = chain( tok2vec, - Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=3, dropout=0.0), + Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0), LayerNorm(token_vector_width * 2), - Softmax(nO=n_tags, nI=token_vector_width * 2), + softmax, ) + model.set_ref("tok2vec", tok2vec) + model.set_ref("output_layer", softmax) return model -def build_cloze_multi_task_model(vocab, tok2vec): - output_size = vocab.vectors.data.shape[1] +def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None): + # nO = vocab.vectors.data.shape[1] output_layer = chain( Maxout( - nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0 + nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0 ), - Linear(nO=output_size, nI=output_size, init_W=zero_init), + Linear(nO=nO, nI=nO, init_W=zero_init), ) model = chain(tok2vec, output_layer) model = build_masked_language_model(vocab, model) + model.set_ref("tok2vec", tok2vec) + model.set_ref("output_layer", output_layer) return model diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 141c66f79..a02e1a5a1 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -31,6 +31,7 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None): model.set_ref("output_layer", linear_layer) model.set_ref("tok2vec", tok2vec) model.set_dim("nO", nO) + model.attrs["multi_label"] = not exclusive_classes return model @@ -44,6 +45,7 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO output_layer = softmax_activation() if exclusive_classes else Logistic() model = model >> with_cpu(output_layer, output_layer.ops) model.set_ref("output_layer", sparse_linear) + model.attrs["multi_label"] = not exclusive_classes return model @@ -110,6 +112,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class if model.has_dim("nO") is not False: model.set_dim("nO", nO) model.set_ref("output_layer", linear_model.get_ref("output_layer")) + model.attrs["multi_label"] = not exclusive_classes return model diff --git a/spacy/pipeline/defaults/multitask_defaults.cfg b/spacy/pipeline/defaults/multitask_defaults.cfg new file mode 100644 index 000000000..d3dbe9b53 --- /dev/null +++ 
b/spacy/pipeline/defaults/multitask_defaults.cfg @@ -0,0 +1,15 @@ +[model] +@architectures = "spacy.MultiTask.v1" +maxout_pieces = 3 +token_vector_width = 96 + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 2 +subword_features = true +dropout = null diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a6edf00d9..75628ce3c 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -648,9 +648,10 @@ class MultitaskObjective(Tagger): side-objective. """ - def __init__(self, vocab, model, target='dep_tag_offset', **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model + target = cfg["target"] # default: 'dep_tag_offset' if target == "dep": self.make_label = self.make_dep elif target == "tag": @@ -668,8 +669,6 @@ class MultitaskObjective(Tagger): else: raise ValueError(Errors.E016) self.cfg = dict(cfg) - # TODO: remove - put in config - self.cfg.setdefault("maxout_pieces", 2) @property def labels(self): @@ -682,7 +681,7 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids, tensors=None): pass - def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): gold_examples = nonproj.preprocess_training_data(get_examples()) # for raw_text, doc_annot in gold_tuples: @@ -808,13 +807,13 @@ class ClozeMultitask(Pipe): self.vocab = vocab self.model = model self.cfg = cfg - self.distance = CosineDistance(ignore_zeros=True, normalize=False) + self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config def set_annotations(self, docs, dep_ids, tensors=None): pass def begin_training(self, get_examples=lambda: [], pipeline=None, - tok2vec=None, sgd=None, **kwargs): + sgd=None, **kwargs): link_vectors_to_models(self.vocab) self.model.initialize() X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) @@ -951,13 +950,13 @@ class TextCategorizer(Pipe): losses[self.name] += (gradient**2).sum() def _examples_to_truth(self, examples): - golds = [ex.gold for ex in examples] - truths = numpy.zeros((len(golds), len(self.labels)), dtype="f") - not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f") - for i, gold in enumerate(golds): + gold_cats = [ex.doc_annotation.cats for ex in examples] + truths = numpy.zeros((len(gold_cats), len(self.labels)), dtype="f") + not_missing = numpy.ones((len(gold_cats), len(self.labels)), dtype="f") + for i, gold_cat in enumerate(gold_cats): for j, label in enumerate(self.labels): - if label in gold.cats: - truths[i, j] = gold.cats[label] + if label in gold_cat: + truths[i, j] = gold_cat[label] else: not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) @@ -1026,28 +1025,27 @@ cdef class DependencyParser(Parser): output.append(merge_subtokens) return tuple(output) - def add_multitask_objective(self, target): - if target == "cloze": - cloze = ClozeMultitask(self.vocab) - self._multitasks.append(cloze) - else: - labeller = MultitaskObjective(self.vocab, target=target) - self._multitasks.append(labeller) + def add_multitask_objective(self, mt_component): + self._multitasks.append(mt_component) def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? 
@@ -1026,28 +1025,27 @@ cdef class DependencyParser(Parser):
             output.append(merge_subtokens)
         return tuple(output)

-    def add_multitask_objective(self, target):
-        if target == "cloze":
-            cloze = ClozeMultitask(self.vocab)
-            self._multitasks.append(cloze)
-        else:
-            labeller = MultitaskObjective(self.vocab, target=target)
-            self._multitasks.append(labeller)
+    def add_multitask_objective(self, mt_component):
+        self._multitasks.append(mt_component)

     def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+        # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
         for labeller in self._multitasks:
-            tok2vec = self.model.get_ref("tok2vec")
-            labeller.begin_training(get_examples, pipeline=pipeline,
-                                    tok2vec=tok2vec, sgd=sgd)
+            labeller.model.set_dim("nO", len(self.labels))
+            if labeller.model.has_ref("output_layer"):
+                labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
+            labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)

     def __reduce__(self):
-        return (DependencyParser, (self.vocab, self.model), self.moves)
+        return (DependencyParser, (self.vocab, self.model), (self.moves, self.cfg))

     def __getstate__(self):
-        return self.moves
+        return (self.moves, self.cfg)

-    def __setstate__(self, moves):
+    def __setstate__(self, state):
+        moves, config = state
         self.moves = moves
+        self.cfg = config

     @property
     def labels(self):
@@ -1073,28 +1071,27 @@ cdef class EntityRecognizer(Parser):
     requires = []
     TransitionSystem = BiluoPushDown

-    def add_multitask_objective(self, target):
-        if target == "cloze":
-            cloze = ClozeMultitask(self.vocab)
-            self._multitasks.append(cloze)
-        else:
-            labeller = MultitaskObjective(self.vocab, target=target)
-            self._multitasks.append(labeller)
+    def add_multitask_objective(self, mt_component):
+        self._multitasks.append(mt_component)

     def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+        # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
         for labeller in self._multitasks:
-            tok2vec = self.model.get_ref("tok2vec")
-            labeller.begin_training(get_examples, pipeline=pipeline,
-                                    tok2vec=tok2vec)
+            labeller.model.set_dim("nO", len(self.labels))
+            if labeller.model.has_ref("output_layer"):
+                labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
+            labeller.begin_training(get_examples, pipeline=pipeline)

     def __reduce__(self):
-        return (EntityRecognizer, (self.vocab, self.model), self.moves)
+        return (EntityRecognizer, (self.vocab, self.model), (self.moves, self.cfg))

     def __getstate__(self):
-        return self.moves
+        return self.moves, self.cfg

-    def __setstate__(self, moves):
+    def __setstate__(self, state):
+        moves, config = state
         self.moves = moves
+        self.cfg = config

     @property
     def labels(self):
@@ -1565,15 +1562,23 @@ Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, mod
 Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg)


 def parser_factory(nlp, model, **cfg):
+    default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
     if model is None:
         model = default_parser()
         warnings.warn(Warnings.W098.format(name="parser"))
+    for key, value in default_config.items():
+        if key not in cfg:
+            cfg[key] = value
     return DependencyParser.from_nlp(nlp, model, **cfg)


 def ner_factory(nlp, model, **cfg):
+    default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
     if model is None:
         model = default_ner()
         warnings.warn(Warnings.W098.format(name="ner"))
+    for key, value in default_config.items():
+        if key not in cfg:
+            cfg[key] = value
     return EntityRecognizer.from_nlp(nlp, model, **cfg)


 __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
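Both factories (and the `component` decorator hunk at the top of this section) use the same merge rule: explicit `cfg` keys win, and the defaults only fill in what the caller left out. A self-contained sketch of that rule, with illustrative values:

```python
# Plain-Python illustration of the merge order used by parser_factory/ner_factory
# above; the dicts here are examples, not the library's actual call site.
default_config = {"learn_tokens": False, "min_action_freq": 30,
                  "beam_width": 1, "beam_update_prob": 1.0}
cfg = {"min_action_freq": 1}  # caller override
for key, value in default_config.items():
    if key not in cfg:
        cfg[key] = value
assert cfg["min_action_freq"] == 1   # explicit value kept
assert cfg["learn_tokens"] is False  # default filled in
```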
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 5882fa266..de30a55f0 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -172,7 +172,7 @@ class Tok2VecListener(Model):
     def verify_inputs(self, inputs):
         if self._batch_id is None and self._outputs is None:
-            raise ValueError
+            raise ValueError("The Tok2Vec listener did not receive valid input.")
         else:
             batch_id = self.get_batch_id(inputs)
             if batch_id != self._batch_id:
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 7e2466be7..288da23aa 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -88,24 +88,20 @@ class Scorer(object):
         self.ner = PRFScore()
         self.ner_per_ents = dict()
         self.eval_punct = eval_punct
-        self.textcat = None
-        self.textcat_per_cat = dict()
+        self.textcat = PRFScore()
+        self.textcat_f_per_cat = dict()
+        self.textcat_auc_per_cat = dict()
         self.textcat_positive_label = None
         self.textcat_multilabel = False
         if pipeline:
-            for name, model in pipeline:
+            for name, component in pipeline:
                 if name == "textcat":
-                    self.textcat_positive_label = model.cfg.get("positive_label", None)
-                    if self.textcat_positive_label:
-                        self.textcat = PRFScore()
-                    if not model.cfg.get("exclusive_classes", False):
-                        self.textcat_multilabel = True
-                        for label in model.cfg.get("labels", []):
-                            self.textcat_per_cat[label] = ROCAUCScore()
-                    else:
-                        for label in model.cfg.get("labels", []):
-                            self.textcat_per_cat[label] = PRFScore()
+                    self.textcat_multilabel = component.model.attrs["multi_label"]
+                    self.textcat_positive_label = component.cfg.get("positive_label", None)
+                    for label in component.cfg.get("labels", []):
+                        self.textcat_auc_per_cat[label] = ROCAUCScore()
+                        self.textcat_f_per_cat[label] = PRFScore()

     @property
     def tags_acc(self):
@@ -207,46 +203,52 @@ class Scorer(object):
         }

     @property
-    def textcat_score(self):
-        """RETURNS (float): f-score on positive label for binary exclusive,
-            macro-averaged f-score for 3+ exclusive,
-            macro-averaged AUC ROC score for multilabel (-1 if undefined)
+    def textcat_f(self):
+        """RETURNS (float): f-score on positive label for binary classification,
+            macro-averaged f-score for multilabel classification
         """
         if not self.textcat_multilabel:
-            # binary multiclass
             if self.textcat_positive_label:
+                # binary classification
                 return self.textcat.fscore * 100
-            # other multiclass
-            return (
-                sum([score.fscore for label, score in self.textcat_per_cat.items()])
-                / (len(self.textcat_per_cat) + 1e-100)
-                * 100
-            )
-        # multilabel
+        # multi-class and/or multi-label
+        return (
+            sum([score.fscore for label, score in self.textcat_f_per_cat.items()])
+            / (len(self.textcat_f_per_cat) + 1e-100)
+            * 100
+        )
+
+    @property
+    def textcat_auc(self):
+        """RETURNS (float): macro-averaged AUC ROC score for multilabel classification (-1 if undefined)
+        """
         return max(
-            sum([score.score for label, score in self.textcat_per_cat.items()])
-            / (len(self.textcat_per_cat) + 1e-100),
+            sum([score.score for label, score in self.textcat_auc_per_cat.items()])
+            / (len(self.textcat_auc_per_cat) + 1e-100),
             -1,
         )

     @property
-    def textcats_per_cat(self):
-        """RETURNS (dict): Scores per textcat label.
+    def textcats_auc_per_cat(self):
+        """RETURNS (dict): AUC ROC Scores per textcat label.
         """
-        if not self.textcat_multilabel:
-            return {
-                k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
-                for k, v in self.textcat_per_cat.items()
-            }
         return {
             k: {"roc_auc_score": max(v.score, -1)}
-            for k, v in self.textcat_per_cat.items()
+            for k, v in self.textcat_auc_per_cat.items()
+        }
+
+    @property
+    def textcats_f_per_cat(self):
+        """RETURNS (dict): F-scores per textcat label.
+        """
+        return {
+            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
+            for k, v in self.textcat_f_per_cat.items()
         }

     @property
     def scores(self):
-        """RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`,
-            `ents_r`, `ents_f`, `tags_acc`, `token_acc`, and `textcat_score`.
+        """RETURNS (dict): All scores mapped by key.
         """
         return {
             "uas": self.uas,
@@ -264,8 +266,10 @@ class Scorer(object):
             "sent_r": self.sent_r,
             "sent_f": self.sent_f,
             "token_acc": self.token_acc,
-            "textcat_score": self.textcat_score,
-            "textcats_per_cat": self.textcats_per_cat,
+            "textcat_f": self.textcat_f,
+            "textcat_auc": self.textcat_auc,
+            "textcats_f_per_cat": self.textcats_f_per_cat,
+            "textcats_auc_per_cat": self.textcats_auc_per_cat,
         }

     def score(self, example, verbose=False, punct_labels=("p", "punct")):
@@ -408,7 +412,7 @@ class Scorer(object):
             )
         if (
             len(gold.cats) > 0
-            and set(self.textcat_per_cat) == set(gold.cats)
+            and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats)
             and set(gold.cats) == set(doc.cats)
         ):
             goldcat = max(gold.cats, key=gold.cats.get)
@@ -418,17 +422,21 @@
                     set([self.textcat_positive_label]) & set([candcat]),
                     set([self.textcat_positive_label]) & set([goldcat]),
                 )
-            for label in self.textcat_per_cat:
-                if self.textcat_multilabel:
-                    self.textcat_per_cat[label].score_set(
+            for label in set(gold.cats):
+                self.textcat_auc_per_cat[label].score_set(
                     doc.cats[label], gold.cats[label]
-                    )
-                else:
-                    self.textcat_per_cat[label].score_set(
+                )
+                self.textcat_f_per_cat[label].score_set(
                     set([label]) & set([candcat]), set([label]) & set([goldcat])
-                    )
-        elif len(self.textcat_per_cat) > 0:
-            model_labels = set(self.textcat_per_cat)
+                )
+        elif len(self.textcat_f_per_cat) > 0:
+            model_labels = set(self.textcat_f_per_cat)
+            eval_labels = set(gold.cats)
+            raise ValueError(
+                Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
+            )
+        elif len(self.textcat_auc_per_cat) > 0:
+            model_labels = set(self.textcat_auc_per_cat)
             eval_labels = set(gold.cats)
             raise ValueError(
                 Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index fcaff444e..7bd9562e2 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -63,15 +63,14 @@ cdef class Parser:
         # defined by EntityRecognizer as a BiluoPushDown
         moves = self.TransitionSystem(self.vocab.strings)
         self.moves = moves
-        cfg.setdefault('min_action_freq', 30)
-        cfg.setdefault('learn_tokens', False)
-        cfg.setdefault('beam_width', 1)
-        cfg.setdefault('beam_update_prob', 1.0)  # or 0.5 (both defaults were previously used)
         self.model = model
         if self.moves.n_moves != 0:
             self.set_output(self.moves.n_moves)
         self.cfg = cfg
         self._multitasks = []
+        for multitask in cfg.get("multitasks", []):
+            self.add_multitask_objective(multitask)
+
         self._rehearsal_model = None

     @classmethod
@@ -79,13 +78,15 @@ cdef class Parser:
         return cls(nlp.vocab, model, **cfg)

     def __reduce__(self):
-        return (Parser, (self.vocab, self.model), self.moves)
+        return (Parser, (self.vocab, self.model), (self.moves, self.cfg))

     def __getstate__(self):
-        return self.moves
+        return (self.moves, self.cfg)

-    def __setstate__(self, moves):
+    def __setstate__(self, state):
+        moves, config = state
         self.moves = moves
+        self.cfg = config

     @property
     def move_names(self):
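Because `__reduce__`/`__getstate__` now return `(moves, cfg)`, the component config survives pickling. A sketch of the round trip for the parser, mirroring the `test_pickle_ner` test added further down (the `342` value is illustrative only):

```python
# Hedged sketch: relies on create_pipe("parser") falling back to the default
# model (it emits warning W098) and on the new (moves, cfg) pickle state.
import pickle
from spacy.lang.en import English

nlp = English()
parser = nlp.create_pipe("parser", config={"min_action_freq": 342})
parser2 = pickle.loads(pickle.dumps(parser))
assert parser2.cfg["min_action_freq"] == 342  # cfg now survives the round trip
```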
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index c92fc1ff9..879334056 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -9,7 +9,8 @@ from spacy.pipeline.defaults import default_ner
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    ner = EntityRecognizer(en_vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(en_vocab, default_ner(), **config)
     ner.begin_training([])
     ner(doc)
     assert len(list(doc.ents)) == 0
@@ -25,7 +26,8 @@
 def test_ents_reset(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    ner = EntityRecognizer(en_vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(en_vocab, default_ner(), **config)
     ner.begin_training([])
     ner(doc)
     assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index ee1bba886..f9663ba32 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -17,7 +17,8 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    parser = DependencyParser(vocab, default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(vocab, default_parser(), **config)
     return parser


@@ -57,12 +58,13 @@ def test_add_label(parser):

 def test_add_label_deserializes_correctly():
-    ner1 = EntityRecognizer(Vocab(), default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
     ner1.begin_training([])
-    ner2 = EntityRecognizer(Vocab(), default_ner())
+    ner2 = EntityRecognizer(Vocab(), default_ner(), **config)
     # the second model needs to be resized before we can call from_bytes
     ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)
diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index 30b4a6f6d..5d265261f 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -138,7 +138,8 @@ def test_get_oracle_actions():
         deps.append(dep)
         ents.append(ent)
     doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
-    parser = DependencyParser(doc.vocab, default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(doc.vocab, default_parser(), **config)
     parser.moves.add_action(0, "")
     parser.moves.add_action(1, "")
     parser.moves.add_action(1, "")
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 8e41a16c0..b0a8109dc 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -138,7 +138,8 @@ def test_accept_blocked_token():
     # 1. test normal behaviour
     nlp1 = English()
    doc1 = nlp1("I live in New York")
-    ner1 = EntityRecognizer(doc1.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
     assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
     assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -156,7 +157,8 @@
     # 2. test blocking behaviour
     nlp2 = English()
     doc2 = nlp2("I live in New York")
-    ner2 = EntityRecognizer(doc2.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)

     # set "New York" to a blocked entity
     doc2.ents = [(0, 3, 5)]
@@ -213,7 +215,8 @@ def test_overwrite_token():
     assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]

     # Check that a new ner can overwrite O
-    ner2 = EntityRecognizer(doc.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
     ner2.moves.add_action(5, "")
     ner2.add_label("GPE")
     state = ner2.moves.init_batch([doc])[0]
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index b648e9a00..7f3e981ea 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -28,7 +28,8 @@ def tok2vec():

 @pytest.fixture
 def parser(vocab, arc_eager):
-    return Parser(vocab, model=default_parser(), moves=arc_eager)
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    return Parser(vocab, model=default_parser(), moves=arc_eager, **config)


 @pytest.fixture
diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py
index db9eb5e6f..fa5d59f9e 100644
--- a/spacy/tests/parser/test_nn_beam.py
+++ b/spacy/tests/parser/test_nn_beam.py
@@ -94,7 +94,8 @@ def test_beam_advance_too_few_scores(beam, scores):

 def test_beam_parse():
     nlp = Language()
-    nlp.add_pipe(DependencyParser(nlp.vocab, default_parser()), name="parser")
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
     nlp.parser.add_label("nsubj")
     nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
     doc = nlp.make_doc("Australia is a country")
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index dc13fcdf1..ccf7d3ba3 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -16,7 +16,8 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    parser = DependencyParser(vocab, default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(vocab, default_parser(), **config)
     parser.cfg["token_vector_width"] = 4
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 5a76697bc..177b6bb3d 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -270,7 +270,8 @@ def test_issue1963(en_tokenizer):

 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
-    ner = EntityRecognizer(Vocab(), default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(Vocab(), default_ner(), **config)
     example = Example(doc=None)
     example.set_token_annotation(
         ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index 9ff118a1f..6df437b3c 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -196,7 +196,8 @@ def test_issue3345():
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
     ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
-    ner = EntityRecognizer(doc.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(doc.vocab, default_ner(), **config)
     # Add the OUT action. I wouldn't have thought this would be necessary...
     ner.moves.add_action(5, "")
     ner.add_label("GPE")
diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py
index 3d8e80847..15632bdf8 100644
--- a/spacy/tests/regression/test_issue3830.py
+++ b/spacy/tests/regression/test_issue3830.py
@@ -6,7 +6,8 @@ from spacy.pipeline.defaults import default_parser

 def test_issue3830_no_subtok():
     """Test that the parser doesn't have subtok label if not learn_tokens"""
-    parser = DependencyParser(Vocab(), default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(Vocab(), default_parser(), **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
     parser.begin_training(lambda: [])
@@ -15,7 +16,8 @@

 def test_issue3830_with_subtok():
     """Test that the parser does have subtok label if learn_tokens=True."""
-    parser = DependencyParser(Vocab(), default_parser(), learn_tokens=True)
+    config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(Vocab(), default_parser(), **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
     parser.begin_training(lambda: [])
diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py
index 30081543b..4978aba44 100644
--- a/spacy/tests/regression/test_issue4042.py
+++ b/spacy/tests/regression/test_issue4042.py
@@ -74,6 +74,7 @@ def test_issue4042_bug2():
         output_dir.mkdir()
         ner1.to_disk(output_dir)

-        ner2 = EntityRecognizer(vocab, default_ner())
+        config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+        ner2 = EntityRecognizer(vocab, default_ner(), **config)
         ner2.from_disk(output_dir)
         assert len(ner2.labels) == 2
diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py
index ba4d2deab..946316d85 100644
--- a/spacy/tests/regression/test_issue4313.py
+++ b/spacy/tests/regression/test_issue4313.py
@@ -12,7 +12,8 @@ def test_issue4313():
     beam_width = 16
     beam_density = 0.0001
     nlp = English()
-    ner = EntityRecognizer(nlp.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
     ner.add_label("SOME_LABEL")
     ner.begin_training([])
     nlp.add_pipe(ner)
diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py
index 967db5d67..cdc3c09ca 100644
--- a/spacy/tests/regression/test_issue4725.py
+++ b/spacy/tests/regression/test_issue4725.py
@@ -1,12 +1,30 @@
-import pytest
+import pickle

 import numpy

 from spacy.lang.en import English
 from spacy.vocab import Vocab
+from spacy.tests.util import make_tempdir
+
+
+def test_pickle_ner():
+    """ Ensure the pickling of the NER goes well"""
+    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    nlp = English(vocab=vocab)
+    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
+    with make_tempdir() as tmp_path:
+        with (tmp_path / "ner.pkl").open("wb") as file_:
+            pickle.dump(ner, file_)
+            assert ner.cfg["min_action_freq"] == 342
+
+        with (tmp_path / "ner.pkl").open("rb") as file_:
+            ner2 = pickle.load(file_)
+            assert ner2.cfg["min_action_freq"] == 342
+

 def test_issue4725():
     # ensures that this runs correctly and doesn't hang or crash because of the global vectors
+    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
     vocab = Vocab(vectors_name="test_vocab_add_vector")
     data = numpy.ndarray((5, 3), dtype="f")
     data[0] = 1.0
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 595a35a9f..9c4e1f61e 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -12,7 +12,8 @@ test_parsers = [DependencyParser, EntityRecognizer]

 @pytest.fixture
 def parser(en_vocab):
-    parser = DependencyParser(en_vocab, default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(en_vocab, default_parser(), **config)
     parser.add_label("nsubj")
     return parser
diff --git a/spacy/util.py b/spacy/util.py
index bc6c98a82..d2d87bef9 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -186,7 +186,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
     return nlp.from_disk(model_path, exclude=disable)


-def load_model_from_config(nlp_config):
+def load_model_from_config(nlp_config, replace=False):
     if "name" in nlp_config:
         nlp = load_model(**nlp_config)
     elif "lang" in nlp_config:
@@ -197,8 +197,15 @@ def load_model_from_config(nlp_config):
     if "pipeline" in nlp_config:
         for name, component_cfg in nlp_config["pipeline"].items():
             factory = component_cfg.pop("factory")
-            component = nlp.create_pipe(factory, config=component_cfg)
-            nlp.add_pipe(component, name=name)
+            if name in nlp.pipe_names:
+                if replace:
+                    component = nlp.create_pipe(factory, config=component_cfg)
+                    nlp.replace_pipe(name, component)
+                else:
+                    raise ValueError(Errors.E985.format(component=name))
+            else:
+                component = nlp.create_pipe(factory, config=component_cfg)
+                nlp.add_pipe(component, name=name)
     return nlp
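The new `replace` flag only matters when the config names a pipe that the loaded base model already contains. A hedged sketch of the two call forms (the `nlp_config` dict is assumed to exist; it is not constructed here):

```python
# Sketch only: nlp_config is assumed to be a parsed config dict whose
# [nlp.pipeline] section names a component the base model already provides.
from spacy.util import load_model_from_config

nlp = load_model_from_config(nlp_config)                 # duplicate name -> ValueError (E985)
nlp = load_model_from_config(nlp_config, replace=True)   # duplicate name -> replace_pipe
```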
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index b1824573c..180665929 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -46,17 +46,19 @@ Update the evaluation scores from a single [`Doc`](/api/doc) /

 ## Properties

-| Name | Type | Description |
-| ----------------------------------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `token_acc` | float | Tokenization accuracy. |
-| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
-| `uas` | float | Unlabelled dependency score. |
-| `las` | float | Labelled dependency score. |
-| `ents_p` | float | Named entity accuracy (precision). |
-| `ents_r` | float | Named entity accuracy (recall). |
-| `ents_f` | float | Named entity accuracy (F-score). |
-| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
-| `textcat_score` 2.2 | float | F-score on positive label for binary exclusive, macro-averaged F-score for 3+ exclusive, macro-averaged AUC ROC score for multilabel (`-1` if undefined). |
-| `textcats_per_cat` 2.2 | dict | Scores per textcat label, keyed by label. |
-| `las_per_type` 2.2.3 | dict | Labelled dependency scores, keyed by label. |
-| `scores` | dict | All scores, keyed by type. |
+| Name | Type | Description |
+| --------------------------------------------------- | ----- | ------------------------------------------------------------------------------------- |
+| `token_acc` | float | Tokenization accuracy. |
+| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
+| `uas` | float | Unlabelled dependency score. |
+| `las` | float | Labelled dependency score. |
+| `ents_p` | float | Named entity accuracy (precision). |
+| `ents_r` | float | Named entity accuracy (recall). |
+| `ents_f` | float | Named entity accuracy (F-score). |
+| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
+| `textcat_f` 3.0 | float | F-score on positive label for binary classification, macro-averaged F-score otherwise. |
+| `textcat_auc` | float | Macro-averaged AUC ROC score for multilabel classification (`-1` if undefined). |
+| `textcats_f_per_cat` 3.0 | dict | F-scores per textcat label, keyed by label. |
+| `textcats_auc_per_cat` 3.0 | dict | ROC AUC scores per textcat label, keyed by label. |
+| `las_per_type` 2.2.3 | dict | Labelled dependency scores, keyed by label. |
+| `scores` | dict | All scores, keyed by type. |
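For orientation, the renamed keys can be checked on a bare `Scorer` without running an evaluation:

```python
# Minimal sanity check of the renamed score keys (no pipeline required).
from spacy.scorer import Scorer

scores = Scorer().scores
assert {"textcat_f", "textcat_auc", "textcats_f_per_cat", "textcats_auc_per_cat"} <= set(scores)
assert "textcat_score" not in scores and "textcats_per_cat" not in scores
```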