From 75a202ce6506177d5de97b47bfd96fd3c7909503 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Sun, 18 Oct 2020 14:50:41 +0200
Subject: [PATCH] TextCat updates and fixes (#6263)

* small fix in example imports
* throw error when train_corpus or dev_corpus is not a string
* small fix in custom logger example
* limit macro_auc to labels with 2 annotations
* fix typo
* also create parents of output_dir if need be
* update documentation of textcat scores
* refactor TextCatEnsemble
* fix tests for new AUC definition
* bump to 3.0.0a42
* update docs
* rename to spacy.TextCatEnsemble.v2
* spacy.TextCatEnsemble.v1 in legacy
* cleanup
* small fix
* update to 3.0.0rc2
* fix import that got lost in merge
* cursed IDE
* fix two typos
---
 spacy/about.py                                |  2 +-
 spacy/cli/init_pipeline.py                    |  2 +-
 spacy/cli/templates/quickstart_training.jinja | 36 +++++++-----
 spacy/cli/train.py                            |  2 +-
 spacy/errors.py                               |  6 +-
 spacy/ml/models/textcat.py                    | 57 ++++++++++++++-----
 spacy/ml/models/tok2vec.py                    |  2 +-
 spacy/pipeline/textcat.py                     | 35 +++++++++---
 spacy/scorer.py                               | 30 ++++++----
 spacy/tests/pipeline/test_pipe_factories.py   | 14 ++---
 spacy/tests/pipeline/test_textcat.py          |  7 +--
 spacy/tests/test_models.py                    | 27 +++------
 spacy/tests/test_scorer.py                    |  6 +-
 spacy/tests/training/test_readers.py          |  2 +-
 spacy/training/initialize.py                  |  4 ++
 website/docs/api/architectures.md             | 56 +++++++++++++-----
 website/docs/api/scorer.md                    | 26 ++++++---
 website/docs/usage/layers-architectures.md    | 35 ++++++++----
 website/docs/usage/processing-pipelines.md    |  9 +--
 website/docs/usage/training.md                |  4 +-
 20 files changed, 235 insertions(+), 127 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index bf1d53a7b..24a3ead22 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0rc1"
+__version__ = "3.0.0rc2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 1c0233539..f45097205 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -100,7 +100,7 @@ def init_labels_cli(
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     if not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index d92de9c15..1194438de 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -136,15 +136,19 @@ factory = "textcat"
 
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
 
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"
@@ -271,15 +275,19 @@ factory = "textcat"
 
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
 
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 0b27f63dc..fe1e82eb2 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -44,7 +44,7 @@ def train_cli(
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
     if output_path is not None and not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
         msg.good(f"Created output directory: {output_path}")
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
diff --git a/spacy/errors.py b/spacy/errors.py
index 5fab0bab1..2898fbcaa 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -398,8 +398,8 @@ class Errors:
     E163 = ("cumsum was found to be unstable: its last element does not "
             "correspond to sum")
     E164 = ("x is neither increasing nor decreasing: {x}.")
-    E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
-            "that case.")
+    E165 = ("Only one class present in the gold labels: {label}. "
+            "ROC AUC score is not defined in that case.")
     E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
@@ -456,6 +456,8 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E897 = ("Field '{field}' should be a dot-notation string referring to the "
+            "relevant section in the config, but found type {type} instead.")
     E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
             "is not set or None. If you've implemented a custom component, make "
             "sure to store the component model as `self.model` in your "
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index ec8998e2d..d4aed2839 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -1,4 +1,6 @@
-from typing import Optional
+from typing import Optional, List
+
+from thinc.types import Floats2d
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
@@ -10,12 +12,13 @@ from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
+from ...tokens import Doc
 
 
 @registry.architectures.register("spacy.TextCatCNN.v1")
 def build_simple_cnn_text_classifier(
     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
-) -> Model:
+) -> Model[List[Doc], Floats2d]:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the
@@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier(
     is applied instead, so that outputs are in the range [0, 1].
     """
     with Model.define_operators({">>": chain}):
+        cnn = tok2vec >> list2ragged() >> reduce_mean()
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
+            model = cnn >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
             linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = (
-                tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
-            )
+            model = cnn >> linear_layer >> Logistic()
             model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)
@@ -45,8 +47,7 @@ def build_bow_text_classifier(
     ngram_size: int,
     no_output_layer: bool,
     nO: Optional[int] = None,
-) -> Model:
-    # Don't document this yet, I'm not sure it's right.
+) -> Model[List[Doc], Floats2d]:
     with Model.define_operators({">>": chain}):
         sparse_linear = SparseLinear(nO)
         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@@ -59,6 +60,39 @@ def build_bow_text_classifier(
     return model
 
 
+@registry.architectures.register("spacy.TextCatEnsemble.v2")
+def build_text_classifier(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    linear_model: Model[List[Doc], Floats2d],
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    exclusive_classes = not linear_model.attrs["multi_label"]
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.get_dim("nO")
+        cnn_model = (
+            tok2vec
+            >> list2ragged()
+            >> ParametricAttention(width)  # TODO: benchmark performance difference of this layer
+            >> reduce_sum()
+            >> residual(Maxout(nO=width, nI=width))
+            >> Linear(nO=nO, nI=width)
+            >> Dropout(0.0)
+        )
+
+        nO_double = nO * 2 if nO else None
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO, nI=nO_double)
+        else:
+            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
+        model = (linear_model | cnn_model) >> output_layer
+        model.set_ref("tok2vec", tok2vec)
+    if model.has_dim("nO") is not False:
+        model.set_dim("nO", nO)
+    model.set_ref("output_layer", linear_model.get_ref("output_layer"))
+    model.attrs["multi_label"] = not exclusive_classes
+    return model
+
+
+# TODO: move to legacy
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
 def build_text_classifier(
     width: int,
@@ -158,11 +192,8 @@ def build_text_classifier(
 
 @registry.architectures.register("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
-    width: int,
-    pretrained_vectors: Optional[bool],
-    dropout: Optional[float],
-    nO: Optional[int] = None,
-) -> Model:
+    width: int, dropout: Optional[float], nO: Optional[int] = None
+) -> Model[List[Doc], Floats2d]:
     # Don't document this yet, I'm not sure it's right.
     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
     with Model.define_operators({">>": chain, "**": clone}):
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 95e200927..8755d0d0d 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -106,7 +106,7 @@ def MultiHashEmbed(
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
-    through a feed-forward subnetwork to build a mixed representations. The
+    through a feed-forward subnetwork to build a mixed representation. The
     features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 5ebe0e104..0781a000c 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -16,15 +16,30 @@ from ..vocab import Vocab
 
 default_model_config = """
 [model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
+@architectures = "spacy.TextCatEnsemble.v2"
+
+[model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
 width = 64
-conv_depth = 2
-embed_size = 2000
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${model.tok2vec.embed.width}
 window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
 ngram_size = 1
-dropout = null
+no_output_layer = false
 """
 DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
@@ -60,9 +75,11 @@ subword_features = true
     default_score_weights={
         "cats_score": 1.0,
         "cats_score_desc": None,
-        "cats_p": None,
-        "cats_r": None,
-        "cats_f": None,
+        "cats_micro_p": None,
+        "cats_micro_r": None,
+        "cats_micro_f": None,
+        "cats_macro_p": None,
+        "cats_macro_r": None,
         "cats_macro_f": None,
         "cats_macro_auc": None,
         "cats_f_per_type": None,
diff --git a/spacy/scorer.py b/spacy/scorer.py
index d1065f3a9..273bda898 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -59,7 +59,9 @@ class PRFScore:
 
 
 class ROCAUCScore:
-    """An AUC ROC score."""
+    """An AUC ROC score. This is only defined for binary classification.
+    Use the method is_binary before calculating the score, otherwise it
+    may throw an error."""
 
     def __init__(self) -> None:
         self.golds = []
@@ -71,16 +73,16 @@ class ROCAUCScore:
         self.cands.append(cand)
         self.golds.append(gold)
 
+    def is_binary(self):
+        return len(np.unique(self.golds)) == 2
+
     @property
     def score(self):
+        if not self.is_binary():
+            raise ValueError(Errors.E165.format(label=set(self.golds)))
         if len(self.golds) == self.saved_score_at_len:
             return self.saved_score
-        try:
-            self.saved_score = _roc_auc_score(self.golds, self.cands)
-        # catch ValueError: Only one class present in y_true.
-        # ROC AUC score is not defined in that case.
-        except ValueError:
-            self.saved_score = -float("inf")
+        self.saved_score = _roc_auc_score(self.golds, self.cands)
         self.saved_score_at_len = len(self.golds)
         return self.saved_score
@@ -362,9 +364,13 @@ class Scorer:
         for all:
             attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
             attr_score_desc (text description of the overall score),
+            attr_micro_p,
+            attr_micro_r,
             attr_micro_f,
+            attr_macro_p,
+            attr_macro_r,
             attr_macro_f,
-            attr_auc,
+            attr_macro_auc,
             attr_f_per_type,
             attr_auc_per_type
@@ -431,7 +437,9 @@ class Scorer:
             macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
             macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
             macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
-            macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
+            # Limit macro_auc to those labels with gold annotations,
+            # but still divide by all cats to avoid artificial boosting of datasets with missing labels
+            macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
         results = {
             f"{attr}_score": None,
             f"{attr}_score_desc": None,
@@ -443,7 +451,7 @@ class Scorer:
             f"{attr}_macro_f": macro_f,
             f"{attr}_macro_auc": macro_auc,
             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+            f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
             positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
@@ -726,7 +734,7 @@ def _roc_auc_score(y_true, y_score):
     `_
     """
     if len(np.unique(y_true)) != 2:
-        raise ValueError(Errors.E165)
+        raise ValueError(Errors.E165.format(label=np.unique(y_true)))
     fpr, tpr, _ = _roc_curve(y_true, y_score)
     return _auc(fpr, tpr)
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index cac394913..6f07c0220 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -2,6 +2,7 @@ import pytest
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
 from spacy.util import registry, SimpleFrozenDict, combine_score_weights
 from thinc.api import Model, Linear, ConfigValidationError
@@ -156,15 +157,10 @@ def test_pipe_class_component_model():
     name = "test_class_component_model"
     default_config = {
         "model": {
-            "@architectures": "spacy.TextCatEnsemble.v1",
-            "exclusive_classes": False,
-            "pretrained_vectors": None,
-            "width": 64,
-            "embed_size": 2000,
-            "window_size": 1,
-            "conv_depth": 2,
-            "ngram_size": 1,
-            "dropout": None,
+            "@architectures": "spacy.TextCatEnsemble.v2",
+            "tok2vec": DEFAULT_TOK2VEC_MODEL,
+            "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
+                             "no_output_layer": False},
         },
         "value1": 10,
     }
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 91348b1b3..06d512a32 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -140,7 +140,7 @@ def test_overfitting_IO():
     nlp = English()
     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
     # Set exclusive labels
-    config = {"model": {"exclusive_classes": True}}
+    config = {"model": {"linear_model": {"exclusive_classes": True}}}
     textcat = nlp.add_pipe("textcat", config=config)
     train_examples = []
     for text, annotations in TRAIN_DATA:
@@ -192,9 +192,8 @@
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
     ],
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index e8884e6b2..200d7dcfd 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate
 from numpy.testing import assert_array_equal
 import numpy
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
-from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
+from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier
 from spacy.ml.staticvectors import StaticVectors
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES
 
 
-def get_textcat_kwargs():
+def get_textcat_bow_kwargs():
     return {
-        "width": 64,
-        "embed_size": 2000,
-        "pretrained_vectors": None,
-        "exclusive_classes": False,
+        "exclusive_classes": True,
         "ngram_size": 1,
-        "window_size": 1,
-        "conv_depth": 2,
-        "dropout": None,
-        "nO": 7,
+        "no_output_layer": False,
+        "nO": 34,
     }
 
 
 def get_textcat_cnn_kwargs():
-    return {
-        "tok2vec": test_tok2vec(),
-        "exclusive_classes": False,
-        "nO": 13,
-    }
+    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
 
 
 def get_all_params(model):
@@ -105,7 +96,7 @@ def test_multi_hash_embed():
     "seed,model_func,kwargs",
     [
         (0, build_Tok2Vec_model, get_tok2vec_kwargs()),
-        (0, build_text_classifier, get_textcat_kwargs()),
+        (0, build_bow_text_classifier, get_textcat_bow_kwargs()),
         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
     ],
 )
@@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs):
     "seed,model_func,kwargs,get_X",
     [
         (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
-        (0, build_text_classifier, get_textcat_kwargs(), get_docs),
+        (0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
     ],
 )
@@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X):
     "seed,dropout,model_func,kwargs,get_X",
     [
         (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
-        (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs),
+        (0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
         (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
     ],
 )
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 4c1b09849..2682cd0ea 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -334,7 +334,8 @@ def test_roc_auc_score():
     score = ROCAUCScore()
     score.score_set(0.25, 0)
     score.score_set(0.75, 0)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score
 
     y_true = [1, 1]
     y_score = [0.25, 0.75]
@@ -344,4 +345,5 @@ def test_roc_auc_score():
     score = ROCAUCScore()
     score.score_set(0.25, 1)
     score.score_set(0.75, 1)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index 9d82ca50a..ff2559d2a 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -51,7 +51,7 @@ def test_readers():
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
     scores = nlp.evaluate(list(dev_corpus(nlp)))
-    assert scores["cats_score"]
+    assert scores["cats_score"] == 0.0
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 7c84caf95..3d79eb78f 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
+    if not isinstance(T["train_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"])))
+    if not isinstance(T["dev_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"])))
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     optimizer = T["optimizer"]
     # Components that shouldn't be updated during training
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 3157c261a..fe2223017 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representations. The features used
+a feed-forward subnetwork to build a mixed representation. The features used
 can be configured with the `attrs` argument. The suggested attributes are
 `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
 some subword information, without construction a fully character-based
@@ -516,26 +516,54 @@ several different built-in architectures.
 It is recommended to experiment with different architectures and settings to
 determine what works best on your specific data and challenge.
 
-### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
+### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TextCatEnsemble.v1"
-> exclusive_classes = false
-> pretrained_vectors = null
-> width = 64
-> embed_size = 2000
-> conv_depth = 2
-> window_size = 1
-> ngram_size = 1
-> dropout = null
+> @architectures = "spacy.TextCatEnsemble.v2"
 > nO = null
+>
+> [model.linear_model]
+> @architectures = "spacy.TextCatBOW.v1"
+> exclusive_classes = true
+> ngram_size = 1
+> no_output_layer = false
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v1"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v1"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v1"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
> ```
 
-Stacked ensemble of a bag-of-words model and a neural network model. The neural
-network has an internal CNN Tok2Vec layer and uses attention.
+Stacked ensemble of a linear bag-of-words model and a neural network model. The
+neural network is built upon a Tok2Vec layer and uses attention. Whether or not
+this model should cater for multi-label classification is taken from the linear
+model, where it is stored in `model.attrs["multi_label"]`.
+
+| Name           | Description                                                                                                                                                                                      |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~                                                                                                                                    |
+| `tok2vec`      | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                       |
+| `nO`           | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~   |
+| **CREATES**    | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                 |
+
+
+
+The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as an argument.
 
 | Name                 | Description |
 | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~   |
 | **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                 |
 
+
+
 ### spacy.TextCatCNN.v1 {#TextCatCNN}
 
 > #### Example Config
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index 0dbc0de33..388d92801 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -174,15 +174,25 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
 ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
 
 Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
-containing scores for each label like `Doc.cats`. The reported overall score
-depends on the scorer settings:
+containing scores for each label like `Doc.cats`. The returned dictionary
+contains the following scores:
 
-1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
-   `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
-   score), `{attr}_f_per_type`, `{attr}_auc_per_type`
-2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
-3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
-4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
+- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across
+  each label is weighted equally
+- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values
+  across evaluations per label
+- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of
+  scores, keyed by label
+- A final `{attr}_score` and corresponding `{attr}_score_desc` (text
+  description)
+
+The reported `{attr}_score` depends on the classification properties:
+
+- **binary exclusive with positive label:** `{attr}_score` is set to the F-score
+  of the positive label
+- **3+ exclusive classes**, macro-averaged F-score:
+  `{attr}_score = {attr}_macro_f`
+- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc`
 
 > #### Example
 >
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index d7b2593e7..aa62a77d4 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -130,16 +130,31 @@ factory = "textcat"
 labels = []
 
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
-dropout = 0
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.textcat.model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 64
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[components.textcat.model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${components.textcat.model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
 ```
 
 spaCy has two additional built-in `textcat` architectures, and you can easily
@@ -687,7 +702,7 @@
 Before the model can be used, it needs to be
 [initialized](/usage/training#initialization). This function receives a callback
 to access the full **training data set**, or a representative sample. This data
 set can be used to deduce all **relevant labels**. Alternatively, a list of
-labels can be provided to `initialize`, or you can call 
+labels can be provided to `initialize`, or you can call
The number of labels defines the output dimensionality of the network, and will be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index a0cf36909..ef44009ae 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1244,15 +1244,10 @@ labels = [] # This function is created and then passed to the "textcat" component as # the argument "model" [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" +@architectures = "spacy.TextCatBOW.v1" exclusive_classes = false -pretrained_vectors = null -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 ngram_size = 1 -dropout = null +no_output_layer = false [components.other_textcat] factory = "textcat" diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 5a42d2172..274ea5989 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -717,7 +717,7 @@ tabular results to a file: ```python ### functions.py import sys -from typing import IO, Tuple, Callable, Dict, Any +from typing import IO, Tuple, Callable, Dict, Any, Optional import spacy from spacy import Language from pathlib import Path @@ -729,7 +729,7 @@ def custom_logger(log_path): stdout: IO=sys.stdout, stderr: IO=sys.stderr ) -> Tuple[Callable, Callable]: - stdout.write(f"Logging to {log_path}\n") + stdout.write(f"Logging to {log_path}\\n") log_file = Path(log_path).open("w", encoding="utf8") log_file.write("step\\t") log_file.write("score\\t")