diff --git a/pyproject.toml b/pyproject.toml index 14a2d7690..0ceda4454 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ requires = [ "murmurhash>=0.28.0,<1.1.0", "thinc>=8.0.0rc0,<8.1.0", "blis>=0.4.0,<0.8.0", - "pytokenizations", "pathy" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 36f0d1e92..3a777f163 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,8 +14,7 @@ pathy numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.5.0,<2.0.0 -pytokenizations +pydantic>=1.5.0,<1.7.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index adf0c0e20..95ada08ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,8 +51,8 @@ install_requires = tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 - pydantic>=1.5.0,<2.0.0 - pytokenizations + pydantic>=1.5.0,<1.7.0 + jinja2 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.py b/setup.py index 604d65745..160d2ed1c 100755 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ MOD_NAMES = [ "spacy.pipeline._parser_internals.stateclass", "spacy.pipeline._parser_internals.transition_system", "spacy.tokenizer", + "spacy.training.align", "spacy.training.gold_io", "spacy.tokens.doc", "spacy.tokens.span", diff --git a/spacy/about.py b/spacy/about.py index bf1d53a7b..24a3ead22 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc1" +__version__ = "3.0.0rc2" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 566820283..a0ea9fbc9 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -93,27 +93,42 @@ def evaluate( "SPEED": "speed", } results = {} + data = {} for metric, key in metrics.items(): if key in scores: if key == "cats_score": metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" - if key == "speed": - results[metric] = f"{scores[key]:.0f}" + if isinstance(scores[key], (int, float)): + if key == "speed": + results[metric] = f"{scores[key]:.0f}" + else: + results[metric] = f"{scores[key]*100:.2f}" else: - results[metric] = f"{scores[key]*100:.2f}" - data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} + results[metric] = "-" + data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] msg.table(results, title="Results") + if "morph_per_feat" in scores: + if scores["morph_per_feat"]: + print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat") + data["morph_per_feat"] = scores["morph_per_feat"] + if "dep_las_per_type" in scores: + if scores["dep_las_per_type"]: + print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type") + data["dep_las_per_type"] = scores["dep_las_per_type"] if "ents_per_type" in scores: if scores["ents_per_type"]: - print_ents_per_type(msg, scores["ents_per_type"]) + print_prf_per_type(msg, scores["ents_per_type"], "NER", "type") + data["ents_per_type"] = scores["ents_per_type"] if "cats_f_per_type" in scores: if scores["cats_f_per_type"]: - print_textcats_f_per_cat(msg, scores["cats_f_per_type"]) + print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label") + data["cats_f_per_type"] = scores["cats_f_per_type"] if "cats_auc_per_type" in scores: if scores["cats_auc_per_type"]: 
print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) + data["cats_auc_per_type"] = scores["cats_auc_per_type"] if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] @@ -157,7 +172,7 @@ def render_parses( file_.write(html) -def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: +def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None: data = [ (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") for k, v in scores.items() @@ -166,20 +181,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> No data, header=("", "P", "R", "F"), aligns=("l", "r", "r", "r"), - title="NER (per type)", - ) - - -def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: - data = [ - (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") - for k, v in scores.items() - ] - msg.table( - data, - header=("", "P", "R", "F"), - aligns=("l", "r", "r", "r"), - title="Textcat F (per label)", + title=f"{name} (per {type})", ) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 1c0233539..da474795e 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -39,7 +39,7 @@ def init_vectors_cli( nlp.to_disk(output_dir) msg.good( "Saved nlp object with vectors to output directory. You can now use the " - "path to it in your config as the 'vectors' setting in [initialize.vocab].", + "path to it in your config as the 'vectors' setting in [initialize].", output_dir.resolve(), ) @@ -100,7 +100,7 @@ def init_labels_cli( extract the labels.""" util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) if not output_path.exists(): - output_path.mkdir() + output_path.mkdir(parents=True) overrides = parse_config_overrides(ctx.args) import_code(code_path) setup_gpu(use_gpu) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index d92de9c15..1194438de 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -136,15 +136,19 @@ factory = "textcat" {% if optimize == "accuracy" %} [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 -ngram_size = 1 +@architectures = "spacy.TextCatEnsemble.v2" nO = null +[components.textcat.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false + {% else -%} [components.textcat.model] @architectures = "spacy.TextCatBOW.v1" @@ -271,15 +275,19 @@ factory = "textcat" {% if optimize == "accuracy" %} [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 -ngram_size = 1 +@architectures = "spacy.TextCatEnsemble.v2" nO = null +[components.textcat.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} + +[components.textcat.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false + {% else -%} [components.textcat.model] @architectures = "spacy.TextCatBOW.v1" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 
0b27f63dc..fe1e82eb2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -44,7 +44,7 @@ def train_cli( if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) if output_path is not None and not output_path.exists(): - output_path.mkdir() + output_path.mkdir(parents=True) msg.good(f"Created output directory: {output_path}") overrides = parse_config_overrides(ctx.args) import_code(code_path) diff --git a/spacy/errors.py b/spacy/errors.py index 5fab0bab1..f4fd3731f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -398,8 +398,8 @@ class Errors: E163 = ("cumsum was found to be unstable: its last element does not " "correspond to sum") E164 = ("x is neither increasing nor decreasing: {x}.") - E165 = ("Only one class present in y_true. ROC AUC score is not defined in " - "that case.") + E165 = ("Only one class present in the gold labels: {label}. " + "ROC AUC score is not defined in that case.") E166 = ("Can only merge DocBins with the same value for '{param}'.\n" "Current DocBin: {current}\nOther DocBin: {other}") E169 = ("Can't find module: {module}") @@ -456,6 +456,8 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E897 = ("Field '{field}' should be a dot-notation string referring to the " + "relevant section in the config, but found type {type} instead.") E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute " "is not set or None. If you've implemented a custom component, make " "sure to store the component model as `self.model` in your " @@ -562,7 +564,10 @@ class Errors: "a string value from {expected} but got: '{arg}'") E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " "a list, but got: {arg_type}") - E949 = ("Can only create an alignment when the texts are the same.") + E949 = ("Unable to align tokens for the predicted and reference docs. It " + "is only possible to align the docs when both texts are the same " + "except for whitespace and capitalization. The predicted tokens " + "start with: {x}. The reference tokens start with: {y}.") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. 
{id2}") E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 067b2167c..02f7c9318 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -286,10 +286,10 @@ cdef class DependencyMatcher: self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees) for matched_tree in matched_trees: matched_key_trees.append((key, matched_tree)) - for i, (match_id, nodes) in enumerate(matched_key_trees): - on_match = self._callbacks.get(match_id) - if on_match is not None: - on_match(self, doc, i, matched_key_trees) + for i, (match_id, nodes) in enumerate(matched_key_trees): + on_match = self._callbacks.get(match_id) + if on_match is not None: + on_match(self, doc, i, matched_key_trees) return matched_key_trees def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees): diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index ec8998e2d..d4aed2839 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,4 +1,6 @@ -from typing import Optional +from typing import Optional, List + +from thinc.types import Floats2d from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum @@ -10,12 +12,13 @@ from ...util import registry from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors from ..featureextractor import FeatureExtractor +from ...tokens import Doc @registry.architectures.register("spacy.TextCatCNN.v1") def build_simple_cnn_text_classifier( tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None -) -> Model: +) -> Model[List[Doc], Floats2d]: """ Build a simple CNN text classifier, given a token-to-vector model as inputs. If exclusive_classes=True, a softmax non-linearity is applied, so that the @@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier( is applied instead, so that outputs are in the range [0, 1]. """ with Model.define_operators({">>": chain}): + cnn = tok2vec >> list2ragged() >> reduce_mean() if exclusive_classes: output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer + model = cnn >> output_layer model.set_ref("output_layer", output_layer) else: linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = ( - tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() - ) + model = cnn >> linear_layer >> Logistic() model.set_ref("output_layer", linear_layer) model.set_ref("tok2vec", tok2vec) model.set_dim("nO", nO) @@ -45,8 +47,7 @@ def build_bow_text_classifier( ngram_size: int, no_output_layer: bool, nO: Optional[int] = None, -) -> Model: - # Don't document this yet, I'm not sure it's right. 
+) -> Model[List[Doc], Floats2d]: with Model.define_operators({">>": chain}): sparse_linear = SparseLinear(nO) model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear @@ -59,6 +60,39 @@ def build_bow_text_classifier( return model +@registry.architectures.register("spacy.TextCatEnsemble.v2") +def build_text_classifier( + tok2vec: Model[List[Doc], List[Floats2d]], + linear_model: Model[List[Doc], Floats2d], + nO: Optional[int] = None, +) -> Model[List[Doc], Floats2d]: + exclusive_classes = not linear_model.attrs["multi_label"] + with Model.define_operators({">>": chain, "|": concatenate}): + width = tok2vec.get_dim("nO") + cnn_model = ( + tok2vec + >> list2ragged() + >> ParametricAttention(width) # TODO: benchmark performance difference of this layer + >> reduce_sum() + >> residual(Maxout(nO=width, nI=width)) + >> Linear(nO=nO, nI=width) + >> Dropout(0.0) + ) + + nO_double = nO * 2 if nO else None + if exclusive_classes: + output_layer = Softmax(nO=nO, nI=nO_double) + else: + output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() + model = (linear_model | cnn_model) >> output_layer + model.set_ref("tok2vec", tok2vec) + if model.has_dim("nO") is not False: + model.set_dim("nO", nO) + model.set_ref("output_layer", linear_model.get_ref("output_layer")) + model.attrs["multi_label"] = not exclusive_classes + return model + +# TODO: move to legacy @registry.architectures.register("spacy.TextCatEnsemble.v1") def build_text_classifier( width: int, @@ -158,11 +192,8 @@ def build_text_classifier( @registry.architectures.register("spacy.TextCatLowData.v1") def build_text_classifier_lowdata( - width: int, - pretrained_vectors: Optional[bool], - dropout: Optional[float], - nO: Optional[int] = None, -) -> Model: + width: int, dropout: Optional[float], nO: Optional[int] = None +) -> Model[List[Doc], Floats2d]: # Don't document this yet, I'm not sure it's right. # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" with Model.define_operators({">>": chain, "**": clone}): diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 95e200927..8755d0d0d 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -106,7 +106,7 @@ def MultiHashEmbed( ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it - through a feed-forward subnetwork to build a mixed representations. + through a feed-forward subnetwork to build a mixed representation. The features used can be configured with the 'attrs' argument. The suggested attributes are NORM, PREFIX, SUFFIX and SHAPE. 
This lets the model take into diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index e17d3be98..68e26c4be 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -226,6 +226,9 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/tagger#score """ + def morph_key_getter(token, attr): + return getattr(token, attr).key + validate_examples(examples, "AttributeRuler.score") results = {} attrs = set() @@ -237,7 +240,8 @@ class AttributeRuler(Pipe): elif attr == POS: results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) elif attr == MORPH: - results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) + results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs)) elif attr == LEMMA: results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) return results diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index bdef332cc..a9dcd705e 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -155,13 +155,16 @@ cdef class DependencyParser(Parser): DOCS: https://nightly.spacy.io/api/dependencyparser#score """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + validate_examples(examples, "DependencyParser.score") def dep_getter(token, attr): dep = getattr(token, attr) dep = token.vocab.strings.as_string(dep).lower() return dep results = {} - results.update(Scorer.score_spans(examples, "sents", **kwargs)) + results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) kwargs.setdefault("getter", dep_getter) kwargs.setdefault("ignore_labels", ("p", "punct")) results.update(Scorer.score_deps(examples, "dep", **kwargs)) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 382ca338d..2a3b8dd00 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -10,7 +10,7 @@ from ..errors import Errors from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher -from ..scorer import Scorer +from ..scorer import get_ner_prf from ..training import validate_examples @@ -340,7 +340,7 @@ class EntityRuler(Pipe): def score(self, examples, **kwargs): validate_examples(examples, "EntityRuler.score") - return Scorer.score_spans(examples, "ents", **kwargs) + return get_ner_prf(examples) def from_bytes( self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index ac111f28b..a03c7daf0 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -251,10 +251,13 @@ class Morphologizer(Tagger): DOCS: https://nightly.spacy.io/api/morphologizer#score """ + def morph_key_getter(token, attr): + return getattr(token, attr).key + validate_examples(examples, "Morphologizer.score") results = {} results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) results.update(Scorer.score_token_attr_per_feat(examples, - "morph", **kwargs)) + "morph", getter=morph_key_getter, **kwargs)) return results diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 6482d6125..0f93b43ac 
100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser): DOCS: https://nightly.spacy.io/api/entityrecognizer#score """ validate_examples(examples, "EntityRecognizer.score") - score_per_type = get_ner_prf(examples) - totals = PRFScore() - for prf in score_per_type.values(): - totals += prf - return { - "ents_p": totals.precision, - "ents_r": totals.recall, - "ents_f": totals.fscore, - "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, - } + return get_ner_prf(examples) diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 7656b330c..6e8b1c324 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -155,8 +155,11 @@ class Sentencizer(Pipe): DOCS: https://nightly.spacy.io/api/sentencizer#score """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + validate_examples(examples, "Sentencizer.score") - results = Scorer.score_spans(examples, "sents", **kwargs) + results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) del results["sents_per_type"] return results diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 15a21902a..ad777ea58 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -160,7 +160,10 @@ class SentenceRecognizer(Tagger): RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. DOCS: https://nightly.spacy.io/api/sentencerecognizer#score """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + validate_examples(examples, "SentenceRecognizer.score") - results = Scorer.score_spans(examples, "sents", **kwargs) + results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) del results["sents_per_type"] return results diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 5ebe0e104..0781a000c 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -16,15 +16,30 @@ from ..vocab import Vocab default_model_config = """ [model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -pretrained_vectors = null +@architectures = "spacy.TextCatEnsemble.v2" + +[model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" width = 64 -conv_depth = 2 -embed_size = 2000 +rows = [2000, 2000, 1000, 1000, 1000, 1000] +attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +include_static_vectors = false + +[model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${model.tok2vec.embed.width} window_size = 1 +maxout_pieces = 3 +depth = 2 + +[model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false ngram_size = 1 -dropout = null +no_output_layer = false """ DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"] @@ -60,9 +75,11 @@ subword_features = true default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, - "cats_p": None, - "cats_r": None, - "cats_f": None, + "cats_micro_p": None, + "cats_micro_r": None, + "cats_micro_f": None, + "cats_macro_p": None, + "cats_macro_r": None, "cats_macro_f": None, "cats_macro_auc": None, "cats_f_per_type": None, diff --git a/spacy/scorer.py b/spacy/scorer.py index d1065f3a9..fe64c23ad 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,9 +1,9 @@ -from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING +from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING 
import numpy as np from collections import defaultdict from .training import Example -from .tokens import Token, Doc, Span +from .tokens import Token, Doc, Span, MorphAnalysis from .errors import Errors from .util import get_lang_class, SimpleFrozenList from .morphology import Morphology @@ -13,7 +13,8 @@ if TYPE_CHECKING: from .language import Language # noqa: F401 -DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"] +DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat") +MISSING_VALUES = frozenset([None, 0, ""]) class PRFScore: @@ -24,6 +25,9 @@ class PRFScore: self.fp = 0 self.fn = 0 + def __len__(self) -> int: + return self.tp + self.fp + self.fn + def __iadd__(self, other): self.tp += other.tp self.fp += other.fp @@ -59,7 +63,9 @@ class PRFScore: class ROCAUCScore: - """An AUC ROC score.""" + """An AUC ROC score. This is only defined for binary classification. + Use the method is_binary before calculating the score, otherwise it + may throw an error.""" def __init__(self) -> None: self.golds = [] @@ -71,16 +77,16 @@ class ROCAUCScore: self.cands.append(cand) self.golds.append(gold) + def is_binary(self): + return len(np.unique(self.golds)) == 2 + @property def score(self): + if not self.is_binary(): + raise ValueError(Errors.E165.format(label=set(self.golds))) if len(self.golds) == self.saved_score_at_len: return self.saved_score - try: - self.saved_score = _roc_auc_score(self.golds, self.cands) - # catch ValueError: Only one class present in y_true. - # ROC AUC score is not defined in that case. - except ValueError: - self.saved_score = -float("inf") + self.saved_score = _roc_auc_score(self.golds, self.cands) self.saved_score_at_len = len(self.golds) return self.saved_score @@ -92,7 +98,7 @@ class Scorer: self, nlp: Optional["Language"] = None, default_lang: str = "xx", - default_pipeline=DEFAULT_PIPELINE, + default_pipeline: Iterable[str] = DEFAULT_PIPELINE, **cfg, ) -> None: """Initialize the Scorer. @@ -124,13 +130,13 @@ class Scorer: return scores @staticmethod - def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]: + def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]: """Returns accuracy and PRF scores for tokenization. * token_acc: # correct tokens / # gold tokens * token_p/r/f: PRF for token character spans examples (Iterable[Example]): Examples to score - RETURNS (Dict[str, float]): A dictionary containing the scores + RETURNS (Dict[str, Any]): A dictionary containing the scores token_acc/p/r/f. 
DOCS: https://nightly.spacy.io/api/scorer#score_tokenization @@ -140,6 +146,8 @@ class Scorer: for example in examples: gold_doc = example.reference pred_doc = example.predicted + if gold_doc.has_unknown_spaces: + continue align = example.alignment gold_spans = set() pred_spans = set() @@ -156,12 +164,20 @@ class Scorer: else: acc_score.tp += 1 prf_score.score_set(pred_spans, gold_spans) - return { - "token_acc": acc_score.fscore, - "token_p": prf_score.precision, - "token_r": prf_score.recall, - "token_f": prf_score.fscore, - } + if len(acc_score) > 0: + return { + "token_acc": acc_score.fscore, + "token_p": prf_score.precision, + "token_r": prf_score.recall, + "token_f": prf_score.fscore, + } + else: + return { + "token_acc": None, + "token_p": None, + "token_r": None, + "token_f": None + } @staticmethod def score_token_attr( @@ -169,8 +185,9 @@ class Scorer: attr: str, *, getter: Callable[[Token, str], Any] = getattr, + missing_values: Set[Any] = MISSING_VALUES, **cfg, - ) -> Dict[str, float]: + ) -> Dict[str, Any]: """Returns an accuracy score for a token-level attribute. examples (Iterable[Example]): Examples to score @@ -178,7 +195,7 @@ class Scorer: getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. - RETURNS (Dict[str, float]): A dictionary containing the accuracy score + RETURNS (Dict[str, Any]): A dictionary containing the accuracy score under the key attr_acc. DOCS: https://nightly.spacy.io/api/scorer#score_token_attr @@ -189,17 +206,27 @@ class Scorer: pred_doc = example.predicted align = example.alignment gold_tags = set() + missing_indices = set() for gold_i, token in enumerate(gold_doc): - gold_tags.add((gold_i, getter(token, attr))) + value = getter(token, attr) + if value not in missing_values: + gold_tags.add((gold_i, getter(token, attr))) + else: + missing_indices.add(gold_i) pred_tags = set() for token in pred_doc: if token.orth_.isspace(): continue if align.x2y.lengths[token.i] == 1: gold_i = align.x2y[token.i].dataXd[0, 0] - pred_tags.add((gold_i, getter(token, attr))) + if gold_i not in missing_indices: + pred_tags.add((gold_i, getter(token, attr))) tag_score.score_set(pred_tags, gold_tags) - return {f"{attr}_acc": tag_score.fscore} + score_key = f"{attr}_acc" + if len(tag_score) == 0: + return {score_key: None} + else: + return {score_key: tag_score.fscore} @staticmethod def score_token_attr_per_feat( @@ -207,8 +234,9 @@ class Scorer: attr: str, *, getter: Callable[[Token, str], Any] = getattr, + missing_values: Set[Any] = MISSING_VALUES, **cfg, - ): + ) -> Dict[str, Any]: """Return PRF scores per feat for a token attribute in UFEATS format. examples (Iterable[Example]): Examples to score @@ -216,7 +244,7 @@ class Scorer: getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. - RETURNS (dict): A dictionary containing the per-feat PRF scores unders + RETURNS (dict): A dictionary containing the per-feat PRF scores under the key attr_per_feat. 
""" per_feat = {} @@ -225,9 +253,11 @@ class Scorer: gold_doc = example.reference align = example.alignment gold_per_feat = {} + missing_indices = set() for gold_i, token in enumerate(gold_doc): - morph = str(getter(token, attr)) - if morph: + value = getter(token, attr) + morph = gold_doc.vocab.strings[value] + if value not in missing_values and morph != Morphology.EMPTY_MORPH: for feat in morph.split(Morphology.FEATURE_SEP): field, values = feat.split(Morphology.FIELD_SEP) if field not in per_feat: @@ -235,27 +265,35 @@ class Scorer: if field not in gold_per_feat: gold_per_feat[field] = set() gold_per_feat[field].add((gold_i, feat)) + else: + missing_indices.add(gold_i) pred_per_feat = {} for token in pred_doc: if token.orth_.isspace(): continue if align.x2y.lengths[token.i] == 1: gold_i = align.x2y[token.i].dataXd[0, 0] - morph = str(getter(token, attr)) - if morph: - for feat in morph.split("|"): - field, values = feat.split("=") - if field not in per_feat: - per_feat[field] = PRFScore() - if field not in pred_per_feat: - pred_per_feat[field] = set() - pred_per_feat[field].add((gold_i, feat)) + if gold_i not in missing_indices: + value = getter(token, attr) + morph = gold_doc.vocab.strings[value] + if value not in missing_values and morph != Morphology.EMPTY_MORPH: + for feat in morph.split(Morphology.FEATURE_SEP): + field, values = feat.split(Morphology.FIELD_SEP) + if field not in per_feat: + per_feat[field] = PRFScore() + if field not in pred_per_feat: + pred_per_feat[field] = set() + pred_per_feat[field].add((gold_i, feat)) for field in per_feat: per_feat[field].score_set( pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) - result = {k: v.to_dict() for k, v in per_feat.items()} - return {f"{attr}_per_feat": result} + score_key = f"{attr}_per_feat" + if any([len(v) for v in per_feat.values()]): + result = {k: v.to_dict() for k, v in per_feat.items()} + return {score_key: result} + else: + return {score_key: None} @staticmethod def score_spans( @@ -263,6 +301,7 @@ class Scorer: attr: str, *, getter: Callable[[Doc, str], Iterable[Span]] = getattr, + has_annotation: Optional[Callable[[Doc], bool]] = None, **cfg, ) -> Dict[str, Any]: """Returns PRF scores for labeled spans. @@ -282,18 +321,10 @@ class Scorer: for example in examples: pred_doc = example.predicted gold_doc = example.reference - # TODO - # This is a temporary hack to work around the problem that the scorer - # fails if you have examples that are not fully annotated for all - # the tasks in your pipeline. For instance, you might have a corpus - # of NER annotations that does not set sentence boundaries, but the - # pipeline includes a parser or senter, and then the score_weights - # are used to evaluate that component. When the scorer attempts - # to read the sentences from the gold document, it fails. 
- try: - list(getter(gold_doc, attr)) - except ValueError: - continue + # Option to handle docs without sents + if has_annotation is not None: + if not has_annotation(gold_doc): + continue # Find all labels in gold and doc labels = set( [k.label_ for k in getter(gold_doc, attr)] @@ -321,13 +352,21 @@ class Scorer: v.score_set(pred_per_type[k], gold_per_type[k]) # Score for all labels score.score_set(pred_spans, gold_spans) - results = { - f"{attr}_p": score.precision, - f"{attr}_r": score.recall, - f"{attr}_f": score.fscore, - f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, - } - return results + if len(score) > 0: + return { + f"{attr}_p": score.precision, + f"{attr}_r": score.recall, + f"{attr}_f": score.fscore, + f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, + } + else: + return { + f"{attr}_p": None, + f"{attr}_r": None, + f"{attr}_f": None, + f"{attr}_per_type": None, + } + @staticmethod def score_cats( @@ -362,9 +401,13 @@ class Scorer: for all: attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc), attr_score_desc (text description of the overall score), + attr_micro_p, + attr_micro_r, attr_micro_f, + attr_macro_p, + attr_macro_r, attr_macro_f, - attr_auc, + attr_macro_auc, attr_f_per_type, attr_auc_per_type @@ -384,9 +427,6 @@ class Scorer: pred_cats = getter(example.predicted, attr) gold_cats = getter(example.reference, attr) - # I think the AUC metric is applicable regardless of whether we're - # doing multi-label classification? Unsure. If not, move this into - # the elif pred_cats and gold_cats block below. for label in labels: pred_score = pred_cats.get(label, 0.0) gold_score = gold_cats.get(label, 0.0) @@ -431,7 +471,9 @@ class Scorer: macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats - macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats + # Limit macro_auc to those labels with gold annotations, + # but still divide by all cats to avoid artificial boosting of datasets with missing labels + macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats results = { f"{attr}_score": None, f"{attr}_score_desc": None, @@ -443,7 +485,7 @@ class Scorer: f"{attr}_macro_f": macro_f, f"{attr}_macro_auc": macro_auc, f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, - f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, + f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()}, } if len(labels) == 2 and not multi_label and positive_label: positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"] @@ -534,6 +576,7 @@ class Scorer: head_attr: str = "head", head_getter: Callable[[Token, str], Token] = getattr, ignore_labels: Iterable[str] = SimpleFrozenList(), + missing_values: Set[Any] = MISSING_VALUES, **cfg, ) -> Dict[str, Any]: """Returns the UAS, LAS, and LAS per type scores for dependency @@ -558,6 +601,7 @@ class Scorer: unlabelled = PRFScore() labelled = PRFScore() labelled_per_dep = dict() + missing_indices = set() for example in examples: gold_doc = example.reference pred_doc = example.predicted @@ -567,13 +611,16 @@ class Scorer: for gold_i, token in enumerate(gold_doc): dep = getter(token, attr) head = head_getter(token, head_attr) - if dep not in ignore_labels: - gold_deps.add((gold_i, head.i, dep)) - if dep not in 
labelled_per_dep: - labelled_per_dep[dep] = PRFScore() - if dep not in gold_deps_per_dep: - gold_deps_per_dep[dep] = set() - gold_deps_per_dep[dep].add((gold_i, head.i, dep)) + if dep not in missing_values: + if dep not in ignore_labels: + gold_deps.add((gold_i, head.i, dep)) + if dep not in labelled_per_dep: + labelled_per_dep[dep] = PRFScore() + if dep not in gold_deps_per_dep: + gold_deps_per_dep[dep] = set() + gold_deps_per_dep[dep].add((gold_i, head.i, dep)) + else: + missing_indices.add(gold_i) pred_deps = set() pred_deps_per_dep = {} for token in pred_doc: @@ -583,25 +630,26 @@ class Scorer: gold_i = None else: gold_i = align.x2y[token.i].dataXd[0, 0] - dep = getter(token, attr) - head = head_getter(token, head_attr) - if dep not in ignore_labels and token.orth_.strip(): - if align.x2y.lengths[head.i] == 1: - gold_head = align.x2y[head.i].dataXd[0, 0] - else: - gold_head = None - # None is indistinct, so we can't just add it to the set - # Multiple (None, None) deps are possible - if gold_i is None or gold_head is None: - unlabelled.fp += 1 - labelled.fp += 1 - else: - pred_deps.add((gold_i, gold_head, dep)) - if dep not in labelled_per_dep: - labelled_per_dep[dep] = PRFScore() - if dep not in pred_deps_per_dep: - pred_deps_per_dep[dep] = set() - pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) + if gold_i not in missing_indices: + dep = getter(token, attr) + head = head_getter(token, head_attr) + if dep not in ignore_labels and token.orth_.strip(): + if align.x2y.lengths[head.i] == 1: + gold_head = align.x2y[head.i].dataXd[0, 0] + else: + gold_head = None + # None is indistinct, so we can't just add it to the set + # Multiple (None, None) deps are possible + if gold_i is None or gold_head is None: + unlabelled.fp += 1 + labelled.fp += 1 + else: + pred_deps.add((gold_i, gold_head, dep)) + if dep not in labelled_per_dep: + labelled_per_dep[dep] = PRFScore() + if dep not in pred_deps_per_dep: + pred_deps_per_dep[dep] = set() + pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) labelled.score_set(pred_deps, gold_deps) for dep in labelled_per_dep: labelled_per_dep[dep].score_set( @@ -610,29 +658,34 @@ class Scorer: unlabelled.score_set( set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps) ) - return { - f"{attr}_uas": unlabelled.fscore, - f"{attr}_las": labelled.fscore, - f"{attr}_las_per_type": { - k: v.to_dict() for k, v in labelled_per_dep.items() - }, - } + if len(unlabelled) > 0: + return { + f"{attr}_uas": unlabelled.fscore, + f"{attr}_las": labelled.fscore, + f"{attr}_las_per_type": { + k: v.to_dict() for k, v in labelled_per_dep.items() + }, + } + else: + return { + f"{attr}_uas": None, + f"{attr}_las": None, + f"{attr}_las_per_type": None, + } -def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: - """Compute per-entity PRFScore objects for a sequence of examples. The - results are returned as a dictionary keyed by the entity type. You can - add the PRFScore objects to get micro-averaged total. +def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: + """Compute micro-PRF and per-entity PRF scores for a sequence of examples. 
""" - scores = defaultdict(PRFScore) + score_per_type = defaultdict(PRFScore) for eg in examples: if not eg.y.has_annotation("ENT_IOB"): continue golds = {(e.label_, e.start, e.end) for e in eg.y.ents} align_x2y = eg.alignment.x2y for pred_ent in eg.x.ents: - if pred_ent.label_ not in scores: - scores[pred_ent.label_] = PRFScore() + if pred_ent.label_ not in score_per_type: + score_per_type[pred_ent.label_] = PRFScore() indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel() if len(indices): g_span = eg.y[indices[0] : indices[-1] + 1] @@ -642,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: if all(token.ent_iob != 0 for token in g_span): key = (pred_ent.label_, indices[0], indices[-1] + 1) if key in golds: - scores[pred_ent.label_].tp += 1 + score_per_type[pred_ent.label_].tp += 1 golds.remove(key) else: - scores[pred_ent.label_].fp += 1 + score_per_type[pred_ent.label_].fp += 1 for label, start, end in golds: - scores[label].fn += 1 - return scores + score_per_type[label].fn += 1 + totals = PRFScore() + for prf in score_per_type.values(): + totals += prf + if len(totals) > 0: + return { + "ents_p": totals.precision, + "ents_r": totals.recall, + "ents_f": totals.fscore, + "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, + } + else: + return { + "ents_p": None, + "ents_r": None, + "ents_f": None, + "ents_per_type": None, + } ############################################################################# @@ -726,7 +795,7 @@ def _roc_auc_score(y_true, y_score): `_ """ if len(np.unique(y_true)) != 2: - raise ValueError(Errors.E165) + raise ValueError(Errors.E165.format(label=np.unique(y_true))) fpr, tpr, _ = _roc_curve(y_true, y_score) return _auc(fpr, tpr) diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index e18a8f6d8..481187348 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -218,11 +218,16 @@ def test_dependency_matcher_callback(en_vocab, doc): pattern = [ {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}}, ] + nomatch_pattern = [ + {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}}, + ] matcher = DependencyMatcher(en_vocab) mock = Mock() matcher.add("pattern", [pattern], on_match=mock) + matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock) matches = matcher(doc) + assert len(matches) == 1 mock.assert_called_once_with(matcher, doc, 0, matches) # check that matches with and without callback are the same (#4590) diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 6c66469cc..02726172b 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts): scores = nlp.evaluate(dev_examples) # "cat" is the only correct lemma assert scores["lemma_acc"] == pytest.approx(0.2) - # the empty morphs are correct - assert scores["morph_acc"] == pytest.approx(0.6) + # no morphs are set + assert scores["morph_acc"] == None def test_attributeruler_rule_order(nlp): diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index cac394913..6f07c0220 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -2,6 +2,7 @@ import pytest from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German +from 
spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.tokens import Doc from spacy.util import registry, SimpleFrozenDict, combine_score_weights from thinc.api import Model, Linear, ConfigValidationError @@ -156,15 +157,10 @@ def test_pipe_class_component_model(): name = "test_class_component_model" default_config = { "model": { - "@architectures": "spacy.TextCatEnsemble.v1", - "exclusive_classes": False, - "pretrained_vectors": None, - "width": 64, - "embed_size": 2000, - "window_size": 1, - "conv_depth": 2, - "ngram_size": 1, - "dropout": None, + "@architectures": "spacy.TextCatEnsemble.v2", + "tok2vec": DEFAULT_TOK2VEC_MODEL, + "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, + "no_output_layer": False}, }, "value1": 10, } diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 91348b1b3..06d512a32 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -140,7 +140,7 @@ def test_overfitting_IO(): nlp = English() nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"} # Set exclusive labels - config = {"model": {"exclusive_classes": True}} + config = {"model": {"linear_model": {"exclusive_classes": True}}} textcat = nlp.add_pipe("textcat", config=config) train_examples = [] for text, annotations in TRAIN_DATA: @@ -192,9 +192,8 @@ def test_overfitting_IO(): {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, - {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None}, - {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None}, - {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None}, + {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}, + {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}, ], diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index e8884e6b2..200d7dcfd 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate from numpy.testing import assert_array_equal import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder -from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier +from spacy.ml.models 
import build_bow_text_classifier, build_simple_cnn_text_classifier from spacy.ml.staticvectors import StaticVectors from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES -def get_textcat_kwargs(): +def get_textcat_bow_kwargs(): return { - "width": 64, - "embed_size": 2000, - "pretrained_vectors": None, - "exclusive_classes": False, + "exclusive_classes": True, "ngram_size": 1, - "window_size": 1, - "conv_depth": 2, - "dropout": None, - "nO": 7, + "no_output_layer": False, + "nO": 34, } def get_textcat_cnn_kwargs(): - return { - "tok2vec": test_tok2vec(), - "exclusive_classes": False, - "nO": 13, - } + return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} def get_all_params(model): @@ -105,7 +96,7 @@ def test_multi_hash_embed(): "seed,model_func,kwargs", [ (0, build_Tok2Vec_model, get_tok2vec_kwargs()), - (0, build_text_classifier, get_textcat_kwargs()), + (0, build_bow_text_classifier, get_textcat_bow_kwargs()), (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()), ], ) @@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs): "seed,model_func,kwargs,get_X", [ (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), - (0, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), ], ) @@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X): "seed,dropout,model_func,kwargs,get_X", [ (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), - (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), ], ) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 4c1b09849..56b276f0b 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -277,6 +277,62 @@ def test_tag_score(tagged_doc): assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) +def test_partial_annotation(en_tokenizer): + pred_doc = en_tokenizer("a b c d e") + pred_doc[0].tag_ = "A" + pred_doc[0].pos_ = "X" + pred_doc[0].set_morph("Feat=Val") + pred_doc[0].dep_ = "dep" + + # unannotated reference + ref_doc = en_tokenizer("a b c d e") + ref_doc.has_unknown_spaces = True + example = Example(pred_doc, ref_doc) + scorer = Scorer() + scores = scorer.score([example]) + for key in scores: + # cats doesn't have an unset state + if key.startswith("cats"): + continue + assert scores[key] == None + + # partially annotated reference, not overlapping with predicted annotation + ref_doc = en_tokenizer("a b c d e") + ref_doc.has_unknown_spaces = True + ref_doc[1].tag_ = "A" + ref_doc[1].pos_ = "X" + ref_doc[1].set_morph("Feat=Val") + ref_doc[1].dep_ = "dep" + example = Example(pred_doc, ref_doc) + scorer = Scorer() + scores = scorer.score([example]) + assert scores["token_acc"] == None + assert scores["tag_acc"] == 0.0 + assert scores["pos_acc"] == 0.0 + assert scores["morph_acc"] == 0.0 + assert scores["dep_uas"] == 1.0 + assert scores["dep_las"] == 0.0 + assert scores["sents_f"] == None + + # partially annotated reference, overlapping with predicted annotation + ref_doc = en_tokenizer("a b c d e") + ref_doc.has_unknown_spaces = True + ref_doc[0].tag_ = "A" + ref_doc[0].pos_ = "X" + ref_doc[1].set_morph("Feat=Val") + ref_doc[1].dep_ = "dep" + example = 
Example(pred_doc, ref_doc) + scorer = Scorer() + scores = scorer.score([example]) + assert scores["token_acc"] == None + assert scores["tag_acc"] == 1.0 + assert scores["pos_acc"] == 1.0 + assert scores["morph_acc"] == 0.0 + assert scores["dep_uas"] == 1.0 + assert scores["dep_las"] == 0.0 + assert scores["sents_f"] == None + + def test_roc_auc_score(): # Binary classification, toy tests from scikit-learn test suite y_true = [0, 1] @@ -334,7 +390,8 @@ def test_roc_auc_score(): score = ROCAUCScore() score.score_set(0.25, 0) score.score_set(0.75, 0) - assert score.score == -float("inf") + with pytest.raises(ValueError): + s = score.score y_true = [1, 1] y_score = [0.25, 0.75] @@ -344,4 +401,5 @@ def test_roc_auc_score(): score = ROCAUCScore() score.score_set(0.25, 1) score.score_set(0.75, 1) - assert score.score == -float("inf") + with pytest.raises(ValueError): + s = score.score diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 9d82ca50a..ff2559d2a 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -51,7 +51,7 @@ def test_readers(): for example in train_corpus(nlp): nlp.update([example], sgd=optimizer) scores = nlp.evaluate(list(dev_corpus(nlp))) - assert scores["cats_score"] + assert scores["cats_score"] == 0.0 # ensure the pipeline runs doc = nlp("Quick test") assert doc.cats diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 07e1aef01..ba485ab45 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -2,6 +2,7 @@ import numpy from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment from spacy.training import biluo_tags_to_spans, iob_to_biluo from spacy.training import Corpus, docs_to_json, Example +from spacy.training.align import get_alignments from spacy.training.converters import json_to_docs from spacy.lang.en import English from spacy.tokens import Doc, DocBin @@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc): assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] -@pytest.mark.skip("Outdated") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ - (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), + (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])), ( ["a", "b", '"', "c"], ['ab"', "c"], - (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), + ([[0], [0], [0], [1]], [[0, 1, 2], [3]]), ), - (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})), + (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])), ( ["ab", "c", "d"], ["a", "b", "cd"], - (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}), + ([[0, 1], [2], [2]], [[0], [0], [1, 2]]), ), ( ["a", "b", "cd"], ["a", "b", "c", "d"], - (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}), + ([[0], [1], [2, 3]], [[0], [1], [2], [2]]), ), - ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})), + ([" ", "a"], ["a"], ([[], [0]], [[1]])), ], ) def test_align(tokens_a, tokens_b, expected): # noqa - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa - assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_a, tokens_b) + assert (a2b, b2a) == expected # noqa # check symmetry - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa - assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_b, tokens_a) # noqa + 
assert (b2a, a2b) == expected # noqa def test_goldparse_startswith_space(en_tokenizer): @@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer): assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] +def test_goldparse_endswith_space(en_tokenizer): + text = "a\n" + doc = en_tokenizer(text) + gold_words = ["a"] + entities = ["U-DATE"] + deps = ["ROOT"] + heads = [0] + example = Example.from_dict( + doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} + ) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["U-DATE", "O"] + assert example.get_aligned("DEP", as_string=True) == ["ROOT", None] + + def test_gold_constructor(): """Test that the Example constructor works fine""" nlp = English() @@ -676,6 +691,87 @@ def test_alignment_different_texts(): Alignment.from_strings(other_tokens, spacy_tokens) +def test_alignment_spaces(en_vocab): + # single leading whitespace + other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] + assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6] + + # multiple leading whitespace tokens + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] + assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7] + + # both with leading whitespace, not identical + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7] + + # same leading whitespace, different tokenization + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7] + + # only one with trailing whitespace + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5] + + # different trailing whitespace + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", 
"podcasts.", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6] + + # same trailing whitespace, different tokenization + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7] + + # differing whitespace is allowed + other_tokens = ["a", " \n ", "b", "c"] + spacy_tokens = ["a", "b", " ", "c"] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.dataXd) == [0, 1, 3] + assert list(align.y2x.dataXd) == [0, 2, 3] + + # other differences in whitespace are allowed + other_tokens = [" ", "a"] + spacy_tokens = [" ", "a", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + + other_tokens = ["a", " "] + spacy_tokens = ["a", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + + def test_retokenized_docs(doc): a = doc.to_array(["TAG"]) doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index abc82030d..c824b2752 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -399,14 +399,13 @@ cdef class Doc: return True cdef int i cdef int range_start = 0 + if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: + attr = SENT_START attr = intify_attr(attr) # adjust attributes if attr == HEAD: # HEAD does not have an unset state, so rely on DEP attr = DEP - elif attr == self.vocab.strings["IS_SENT_START"]: - # as in Matcher, allow IS_SENT_START as an alias of SENT_START - attr = SENT_START # special cases for sentence boundaries if attr == SENT_START: if "sents" in self.user_hooks: diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 86341dd9a..5111b80dc 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,6 +1,6 @@ from .corpus import Corpus # noqa: F401 from .example import Example, validate_examples, validate_get_examples # noqa: F401 -from .align import Alignment # noqa: F401 +from .alignment import Alignment # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401 diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx new file mode 100644 index 000000000..b9d89f789 --- /dev/null +++ b/spacy/training/align.pyx @@ -0,0 +1,66 @@ +from typing import List, Tuple +from itertools import chain +import re + +from ..errors import Errors + + +def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]: + # Create character-to-token mappings + char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A)))) + char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B)))) + str_a = "".join(A).lower() + str_b = "".join(B).lower() + cdef int len_str_a = len(str_a) + cdef int len_str_b = len(str_b) + # Check that the two 
texts only differ in whitespace and capitalization + if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \ + len_str_a != len(char_to_token_a) or \ + len_str_b != len(char_to_token_b): + raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10]))) + cdef int char_idx_a = 0 + cdef int char_idx_b = 0 + cdef int token_idx_a = 0 + cdef int token_idx_b = 0 + cdef int prev_token_idx_a = -1 + cdef int prev_token_idx_b = -1 + a2b = [] + b2a = [] + while char_idx_a < len_str_a and char_idx_b < len_str_b: + # Find the current token position from the character position + token_idx_a = char_to_token_a[char_idx_a] + token_idx_b = char_to_token_b[char_idx_b] + # Add a set for the next token if a token boundary has been crossed + if prev_token_idx_a != token_idx_a: + a2b.append(set()) + if prev_token_idx_b != token_idx_b: + b2a.append(set()) + # Process the alignment at the current position + if A[token_idx_a] == B[token_idx_b]: + # Current tokens are identical + a2b[-1].add(token_idx_b) + b2a[-1].add(token_idx_a) + char_idx_a += len(A[token_idx_a]) + char_idx_b += len(B[token_idx_b]) + elif str_a[char_idx_a] == str_b[char_idx_b]: + # Current chars are identical + a2b[-1].add(token_idx_b) + b2a[-1].add(token_idx_a) + char_idx_a += 1 + char_idx_b += 1 + elif str_a[char_idx_a].isspace(): + # Skip unaligned whitespace char in A + char_idx_a += 1 + elif str_b[char_idx_b].isspace(): + # Skip unaligned whitespace char in B + char_idx_b += 1 + else: + # This should never happen + raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10]))) + prev_token_idx_a = token_idx_a + prev_token_idx_b = token_idx_b + # Process unaligned trailing whitespace + a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:]))) + b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:]))) + # Return values as sorted lists per token position + return [sorted(x) for x in a2b], [sorted(x) for x in b2a] diff --git a/spacy/training/align.py b/spacy/training/alignment.py similarity index 75% rename from spacy/training/align.py rename to spacy/training/alignment.py index e8f17a667..3e3b60ca6 100644 --- a/spacy/training/align.py +++ b/spacy/training/alignment.py @@ -2,9 +2,8 @@ from typing import List import numpy from thinc.types import Ragged from dataclasses import dataclass -import tokenizations -from ..errors import Errors +from .align import get_alignments @dataclass @@ -20,9 +19,7 @@ class Alignment: @classmethod def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": - if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower(): - raise ValueError(Errors.E949) - x2y, y2x = tokenizations.get_alignments(A, B) + x2y, y2x = get_alignments(A, B) return Alignment.from_indices(x2y=x2y, y2x=y2x) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index a8da49c61..6a556b5e7 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc from ..tokens.span cimport Span from ..tokens.span import Span from ..attrs import IDS -from .align import Alignment +from .alignment import Alignment from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 7c84caf95..3d79eb78f 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": # Resolve all 
training-relevant sections using the filled nlp config T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"]] + if not isinstance(T["train_corpus"], str): + raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"]))) + if not isinstance(T["dev_corpus"], str): + raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"]))) train_corpus, dev_corpus = resolve_dot_names(config, dot_names) optimizer = T["optimizer"] # Components that shouldn't be updated during training diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index b91fb07a8..e5c41c70b 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model from ..ml.models.multi_task import build_cloze_characters_multi_task_model from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain from ..errors import Errors -from ..util import registry, load_model_from_config, dot_to_object +from ..util import registry, load_model_from_config, resolve_dot_names def pretrain( @@ -38,7 +38,7 @@ def pretrain( _config = nlp.config.interpolate() T = registry.resolve(_config["training"], schema=ConfigSchemaTraining) P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) - corpus = dot_to_object(T, P["corpus"]) + corpus = resolve_dot_names(_config, [P["corpus"]])[0] batcher = P["batcher"] model = create_pretraining_model(nlp, P) optimizer = P["optimizer"] diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 3157c261a..fe2223017 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline. Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it through -a feed-forward subnetwork to build a mixed representations. The features used +a feed-forward subnetwork to build a mixed representation. The features used can be configured with the `attrs` argument. The suggested attributes are `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some subword information, without construction a fully character-based @@ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with different architectures and settings to determine what works best on your specific data and challenge. 
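
As a quick reference for the alignment rework above (the new Cython `get_alignments` in `spacy/training/align.pyx` plus the renamed `spacy/training/alignment.py` wrapper), here is a minimal usage sketch. The token lists and expected values are copied from the updated tests earlier in this patch; the only assumption is that `Alignment` is imported via `spacy.training`, as re-exported in `spacy/training/__init__.py`.

```python
from spacy.training import Alignment

# Tokenizations may differ in segmentation, casing and whitespace, as long as
# the underlying texts otherwise match.
other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "]
align = Alignment.from_strings(other_tokens, spacy_tokens)

# x2y maps each "other" token to the spaCy tokens it overlaps (a Ragged array):
# "i listened to" covers three spaCy tokens, both trailing spaces map to the
# single trailing space token.
assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6]

# y2x is the reverse mapping, from spaCy tokens back to the other tokenization.
assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2]
assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7]
```
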
-### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} +### spacy.TextCatEnsemble.v2 {#TextCatEnsemble} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.TextCatEnsemble.v1" -> exclusive_classes = false -> pretrained_vectors = null -> width = 64 -> embed_size = 2000 -> conv_depth = 2 -> window_size = 1 -> ngram_size = 1 -> dropout = null +> @architectures = "spacy.TextCatEnsemble.v2" > nO = null +> +> [model.linear_model] +> @architectures = "spacy.TextCatBOW.v1" +> exclusive_classes = true +> ngram_size = 1 +> no_output_layer = false +> +> [model.tok2vec] +> @architectures = "spacy.Tok2Vec.v1" +> +> [model.tok2vec.embed] +> @architectures = "spacy.MultiHashEmbed.v1" +> width = 64 +> rows = [2000, 2000, 1000, 1000, 1000, 1000] +> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +> include_static_vectors = false +> +> [model.tok2vec.encode] +> @architectures = "spacy.MaxoutWindowEncoder.v1" +> width = ${model.tok2vec.embed.width} +> window_size = 1 +> maxout_pieces = 3 +> depth = 2 > ``` -Stacked ensemble of a bag-of-words model and a neural network model. The neural -network has an internal CNN Tok2Vec layer and uses attention. +Stacked ensemble of a linear bag-of-words model and a neural network model. The +neural network is built upon a Tok2Vec layer and uses attention. The setting for +whether or not this model should cater for multi-label classification, is taken +from the linear model, where it is stored in `model.attrs["multi_label"]`. + +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~ | +| `tok2vec` | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + + + +The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as argument. | Name | Description | | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + + ### spacy.TextCatCNN.v1 {#TextCatCNN} > #### Example Config diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index d511dc889..16bbc2700 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -683,6 +683,7 @@ The L2 norm of the document's vector representation. | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. 
~~Dict[str, Callable]~~ | +| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 0dbc0de33..fb48d68cc 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -68,6 +68,8 @@ Scores the tokenization: - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token character spans +Docs with `has_unknown_spaces` are skipped during scoring. + > #### Example > > ```python @@ -81,7 +83,8 @@ Scores the tokenization: ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"} -Scores a single token attribute. +Scores a single token attribute. Tokens with missing values in the reference doc +are skipped during scoring. > #### Example > @@ -90,20 +93,22 @@ Scores a single token attribute. > print(scores["pos_acc"]) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ | ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"} Scores a single token attribute per feature for a token attribute in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) -format. +format. Tokens with missing values in the reference doc are skipped during +scoring. > #### Example > @@ -112,13 +117,14 @@ format. > print(scores["morph_per_feat"]) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. 
If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ | ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} @@ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans. > print(scores["ents_f"]) > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ | -| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ | +| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~str~~ | +| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} -Calculate the UAS, LAS, and LAS per type scores for dependency parses. +Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens +with missing values for the `attr` (typically `dep`) are skipped during scoring. > #### Example > @@ -160,29 +168,40 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses. 
> print(scores["dep_uas"], scores["dep_las"]) > ``` -| Name | Description | -| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| `head_attr` | The attribute containing the head token. ~~str~~ | -| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ | -| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ | -| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `head_attr` | The attribute containing the head token. ~~str~~ | +| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~ | +| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"} Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict -containing scores for each label like `Doc.cats`. The reported overall score -depends on the scorer settings: +containing scores for each label like `Doc.cats`. The returned dictionary +contains the following scores: -1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` / - `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall - score), `{attr}_f_per_type`, `{attr}_auc_per_type` -2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f` -3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`; -4. 
**multilabel**, macro-averaged AUC: `{attr}_macro_auc` +- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across + each label is weighted equally +- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values + across evaluations per label +- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of + scores, keyed by label +- A final `{attr}_score` and corresponding `{attr}_score_desc` (text + description) + +The reported `{attr}_score` depends on the classification properties: + +- **binary exclusive with positive label:** `{attr}_score` is set to the F-score + of the positive label +- **3+ exclusive classes**, macro-averaged F-score: + `{attr}_score = {attr}_macro_f` +- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc` > #### Example > diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index 2a8733f41..f05fedd7d 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -115,7 +115,7 @@ print(french_fries, "<->", burgers, french_fries.similarity(burgers)) Computing similarity scores can be helpful in many situations, but it's also important to maintain **realistic expectations** about what information it can -provide. Words can be related to each over in many ways, so a single +provide. Words can be related to each other in many ways, so a single "similarity" score will always be a **mix of different signals**, and vectors trained on different data can produce very different results that may not be useful for your purpose. Here are some important considerations to keep in mind: diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index d7b2593e7..aa62a77d4 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -130,16 +130,31 @@ factory = "textcat" labels = [] [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" -exclusive_classes = false -pretrained_vectors = null -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 -ngram_size = 1 -dropout = 0 +@architectures = "spacy.TextCatEnsemble.v2" nO = null + +[components.textcat.model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[components.textcat.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 64 +rows = [2000, 2000, 1000, 1000, 1000, 1000] +attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] +include_static_vectors = false + +[components.textcat.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${components.textcat.model.tok2vec.embed.width} +window_size = 1 +maxout_pieces = 3 +depth = 2 + +[components.textcat.model.linear_model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false ``` spaCy has two additional built-in `textcat` architectures, and you can easily @@ -687,7 +702,7 @@ Before the model can be used, it needs to be [initialized](/usage/training#initialization). This function receives a callback to access the full **training data set**, or a representative sample. This data set can be used to deduce all **relevant labels**. Alternatively, a list of -labels can be provided to `initialize`, or you can call +labels can be provided to `initialize`, or you can call `RelationExtractor.add_label` directly. 
The number of labels defines the output dimensionality of the network, and will be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index a0cf36909..ef44009ae 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1244,15 +1244,10 @@ labels = [] # This function is created and then passed to the "textcat" component as # the argument "model" [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" +@architectures = "spacy.TextCatBOW.v1" exclusive_classes = false -pretrained_vectors = null -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 ngram_size = 1 -dropout = null +no_output_layer = false [components.other_textcat] factory = "textcat" diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 131bd8c94..44d0fd388 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1142,7 +1142,7 @@ pattern = [ { "LEFT_ID": "anchor_founded", "REL_OP": ">", - "RIGHT_ID": "subject", + "RIGHT_ID": "founded_subject", "RIGHT_ATTRS": {"DEP": "nsubj"}, } # ... @@ -1212,7 +1212,7 @@ pattern = [ { "LEFT_ID": "anchor_founded", "REL_OP": ">", - "RIGHT_ID": "subject", + "RIGHT_ID": "founded_subject", "RIGHT_ATTRS": {"DEP": "nsubj"}, }, { diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 5a42d2172..274ea5989 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -717,7 +717,7 @@ tabular results to a file: ```python ### functions.py import sys -from typing import IO, Tuple, Callable, Dict, Any +from typing import IO, Tuple, Callable, Dict, Any, Optional import spacy from spacy import Language from pathlib import Path @@ -729,7 +729,7 @@ def custom_logger(log_path): stdout: IO=sys.stdout, stderr: IO=sys.stderr ) -> Tuple[Callable, Callable]: - stdout.write(f"Logging to {log_path}\n") + stdout.write(f"Logging to {log_path}\\n") log_file = Path(log_path).open("w", encoding="utf8") log_file.write("step\\t") log_file.write("score\\t") diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index fe4765285..b25b28a6d 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -433,14 +433,14 @@ The following methods, attributes and commands are new in spaCy v3.0. | Name | Description | | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | -| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | +| [`Token.morph`](/api/token#attributes) | Access a token's morphological analysis. | | [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. | | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. 
| | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). | | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. | | [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. | | [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. | -| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. | +| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. | | [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. | | [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. | | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. | @@ -1032,9 +1032,9 @@ change your names and imports: Thanks to everyone who's been contributing to the spaCy ecosystem by developing and maintaining one of the many awesome [plugins and extensions](/universe). We've tried to make it as easy as possible for you to upgrade your packages for -spaCy v3.0. The most common use case for plugins is providing pipeline components -and extension attributes. When migrating your plugin, double-check the -following: +spaCy v3.0. The most common use case for plugins is providing pipeline +components and extension attributes. When migrating your plugin, double-check +the following: - Use the [`@Language.factory`](/api/language#factory) decorator to register your component and assign it a name. This allows users to refer to your diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 73b2d072d..cc73e7e67 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -257,7 +257,7 @@ output_path.open("w", encoding="utf-8").write(svg) Since each visualization is generated as a separate SVG, exporting `.svg` files only works if you're rendering **one single doc** at a time. (This makes sense – after all, each visualization should be a standalone graphic.) So instead of -rendering all `Doc`s at one, loop over them and export them separately. +rendering all `Doc`s at once, loop over them and export them separately. diff --git a/website/src/templates/models.js b/website/src/templates/models.js index b9658dacd..17140b072 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -120,7 +120,7 @@ function formatAccuracy(data) { ? null : { label, - value: value.toFixed(2), + value: (value * 100).toFixed(2), help: MODEL_META[label], } })
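
To close with the visualizer guidance corrected above (export one `Doc` per SVG and loop over the docs rather than rendering them all at once), a short sketch follows. The pipeline name `en_core_web_sm` and the output filenames are placeholders for illustration, not part of the patch.

```python
from pathlib import Path
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # placeholder: any installed pipeline
texts = ["This is a sentence.", "This is another one."]

# Each visualization is a standalone SVG, so render and write the docs one at
# a time instead of passing the whole list to displacy.render().
for i, doc in enumerate(nlp.pipe(texts)):
    svg = displacy.render(doc, style="dep")
    output_path = Path(f"sentence_{i}.svg")
    output_path.open("w", encoding="utf-8").write(svg)
```
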