Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-06 05:10:21 +03:00)

Compare commits: master...v4.0.0.dev (67 commits)
Commits (SHA1):
6348a7a4b4
b052b1b47f
a183db3cef
5e297aa20e
c2f3e699ca
2c2e66e145
fc2723925b
6ff5eb256c
b2fd9490e3
a231bf65af
b510fbd0aa
326b541312
6852adc8b7
20b63943f5
d30ba9b7b8
2f08deea2a
207565a788
f9308aae13
ca75190a3d
f5aabaf7d6
d60997febb
6b9af38eeb
60379cec65
8267aa1b65
799d226676
04fea09ffd
e79910d57e
d0fc871a1c
68b8fa2df2
cae4589f5a
a4bd890f32
0e2b7fb28b
103b24fb25
446a3ecf34
c6704f368c
d4922f25fc
e3027c65b8
5157e4e823
efdbb722c5
60c050e82b
977b847cce
4a615cacd2
698b8b495f
98a916e01a
4bce8fa755
2a558a7cdc
1eb7ce5ef7
740c33fe58
8dd1fa9896
c44d243f25
bb0e178878
1a5be63715
d757dec5c4
551e73ccfc
5d54c0e32a
e581eeac34
b2d05f9f66
1ff683a50b
ba18d2913d
851a7ca4fa
1605ef7319
7f3842f54d
2f05c6824c
10b7223021
5586fd9311
0e71bd973f
75f7c15187
@@ -5,7 +5,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=8.1.0,<8.2.0",
    "thinc>=9.0.0.dev2,<9.1.0",
    "numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"

@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0
thinc>=9.0.0.dev2,<9.1.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
setup.cfg (12)

@@ -32,14 +32,6 @@ project_urls =
zip_safe = false
include_package_data = true
python_requires = >=3.6
setup_requires =
    cython>=0.25,<3.0
    numpy>=1.15.0
    # We also need our Cython packages here to compile against
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
    thinc>=8.1.0,<8.2.0
install_requires =
    # Our libraries
    spacy-legacy>=3.0.11,<3.1.0

@@ -47,7 +39,7 @@ install_requires =
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    thinc>=8.1.0,<8.2.0
    thinc>=9.0.0.dev2,<9.1.0
    wasabi>=0.9.1,<1.2.0
    srsly>=2.4.3,<3.0.0
    catalogue>=2.0.6,<2.1.0

@@ -120,7 +112,7 @@ ja =
    sudachipy>=0.5.2,!=0.6.1
    sudachidict_core>=20211220
ko =
    natto-py>=0.9.0
    mecab-ko>=1.0.0
th =
    pythainlp>=2.0
setup.py (11)

@@ -33,13 +33,10 @@ MOD_NAMES = [
    "spacy.kb.candidate",
    "spacy.kb.kb",
    "spacy.kb.kb_in_memory",
    "spacy.ml.parser_model",
    "spacy.ml.tb_framework",
    "spacy.morphology",
    "spacy.pipeline.dep_parser",
    "spacy.pipeline._edit_tree_internals.edit_trees",
    "spacy.pipeline.morphologizer",
    "spacy.pipeline.multitask",
    "spacy.pipeline.ner",
    "spacy.pipeline.pipe",
    "spacy.pipeline.trainable_pipe",
    "spacy.pipeline.sentencizer",

@@ -47,12 +44,15 @@ MOD_NAMES = [
    "spacy.pipeline.tagger",
    "spacy.pipeline.transition_parser",
    "spacy.pipeline._parser_internals.arc_eager",
    "spacy.pipeline._parser_internals.batch",
    "spacy.pipeline._parser_internals.ner",
    "spacy.pipeline._parser_internals.nonproj",
    "spacy.pipeline._parser_internals.search",
    "spacy.pipeline._parser_internals._state",
    "spacy.pipeline._parser_internals.stateclass",
    "spacy.pipeline._parser_internals.transition_system",
    "spacy.pipeline._parser_internals._beam_utils",
    "spacy.pipeline._parser_internals._parser_utils",
    "spacy.tokenizer",
    "spacy.training.align",
    "spacy.training.gold_io",

@@ -62,12 +62,13 @@ MOD_NAMES = [
    "spacy.tokens.span_group",
    "spacy.tokens.graph",
    "spacy.tokens.morphanalysis",
    "spacy.tokens._retokenize",
    "spacy.tokens.retokenizer",
    "spacy.matcher.matcher",
    "spacy.matcher.phrasematcher",
    "spacy.matcher.dependencymatcher",
    "spacy.symbols",
    "spacy.vectors",
    "spacy.tests.parser._search",
]
COMPILE_OPTIONS = {
    "msvc": ["/Ox", "/EHsc"],
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.5.0"
__version__ = "4.0.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
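The hunk above bumps the package version string to the v4 prerelease. A minimal check, assuming this dev branch is installed in the environment:

import spacy
from spacy import about

# The string edited above is what the installed package reports at runtime.
print(about.__version__)   # expected: "4.0.0.dev0" on this branch
print(spacy.__version__)   # same value, re-exported at the top level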
spacy/attrs.pxd (129)

@@ -1,98 +1,49 @@
# Reserve 64 values for flag features
from . cimport symbols

cdef enum attr_id_t:
    NULL_ATTR
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
    IS_LOWER
    IS_PUNCT
    IS_SPACE
    IS_TITLE
    IS_UPPER
    LIKE_URL
    LIKE_NUM
    LIKE_EMAIL
    IS_STOP
    IS_OOV_DEPRECATED
    IS_BRACKET
    IS_QUOTE
    IS_LEFT_PUNCT
    IS_RIGHT_PUNCT
    IS_CURRENCY
    NULL_ATTR = 0
    IS_ALPHA = symbols.IS_ALPHA
    IS_ASCII = symbols.IS_ASCII
    IS_DIGIT = symbols.IS_DIGIT
    IS_LOWER = symbols.IS_LOWER
    IS_PUNCT = symbols.IS_PUNCT
    IS_SPACE = symbols.IS_SPACE
    IS_TITLE = symbols.IS_TITLE
    IS_UPPER = symbols.IS_UPPER
    LIKE_URL = symbols.LIKE_URL
    LIKE_NUM = symbols.LIKE_NUM
    LIKE_EMAIL = symbols.LIKE_EMAIL
    IS_STOP = symbols.IS_STOP
    IS_BRACKET = symbols.IS_BRACKET
    IS_QUOTE = symbols.IS_QUOTE
    IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT
    IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT
    IS_CURRENCY = symbols.IS_CURRENCY

    FLAG19 = 19
    FLAG20
    FLAG21
    FLAG22
    FLAG23
    FLAG24
    FLAG25
    FLAG26
    FLAG27
    FLAG28
    FLAG29
    FLAG30
    FLAG31
    FLAG32
    FLAG33
    FLAG34
    FLAG35
    FLAG36
    FLAG37
    FLAG38
    FLAG39
    FLAG40
    FLAG41
    FLAG42
    FLAG43
    FLAG44
    FLAG45
    FLAG46
    FLAG47
    FLAG48
    FLAG49
    FLAG50
    FLAG51
    FLAG52
    FLAG53
    FLAG54
    FLAG55
    FLAG56
    FLAG57
    FLAG58
    FLAG59
    FLAG60
    FLAG61
    FLAG62
    FLAG63
    ID = symbols.ID
    ORTH = symbols.ORTH
    LOWER = symbols.LOWER
    NORM = symbols.NORM
    SHAPE = symbols.SHAPE
    PREFIX = symbols.PREFIX
    SUFFIX = symbols.SUFFIX

    ID
    ORTH
    LOWER
    NORM
    SHAPE
    PREFIX
    SUFFIX
    LENGTH = symbols.LENGTH
    CLUSTER = symbols.CLUSTER
    LEMMA = symbols.LEMMA
    POS = symbols.POS
    TAG = symbols.TAG
    DEP = symbols.DEP
    ENT_IOB = symbols.ENT_IOB
    ENT_TYPE = symbols.ENT_TYPE
    HEAD = symbols.HEAD
    SENT_START = symbols.SENT_START
    SPACY = symbols.SPACY
    PROB = symbols.PROB

    LENGTH
    CLUSTER
    LEMMA
    POS
    TAG
    DEP
    ENT_IOB
    ENT_TYPE
    HEAD
    SENT_START
    SPACY
    PROB

    LANG
    LANG = symbols.LANG
    ENT_KB_ID = symbols.ENT_KB_ID
    MORPH
    MORPH = symbols.MORPH
    ENT_ID = symbols.ENT_ID

    IDX
    SENT_END
    IDX = symbols.IDX
spacy/attrs.pyx (120)

@@ -16,57 +16,11 @@ IDS = {
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
    "IS_BRACKET": IS_BRACKET,
    "IS_QUOTE": IS_QUOTE,
    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
    "IS_CURRENCY": IS_CURRENCY,
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
    "FLAG21": FLAG21,
    "FLAG22": FLAG22,
    "FLAG23": FLAG23,
    "FLAG24": FLAG24,
    "FLAG25": FLAG25,
    "FLAG26": FLAG26,
    "FLAG27": FLAG27,
    "FLAG28": FLAG28,
    "FLAG29": FLAG29,
    "FLAG30": FLAG30,
    "FLAG31": FLAG31,
    "FLAG32": FLAG32,
    "FLAG33": FLAG33,
    "FLAG34": FLAG34,
    "FLAG35": FLAG35,
    "FLAG36": FLAG36,
    "FLAG37": FLAG37,
    "FLAG38": FLAG38,
    "FLAG39": FLAG39,
    "FLAG40": FLAG40,
    "FLAG41": FLAG41,
    "FLAG42": FLAG42,
    "FLAG43": FLAG43,
    "FLAG44": FLAG44,
    "FLAG45": FLAG45,
    "FLAG46": FLAG46,
    "FLAG47": FLAG47,
    "FLAG48": FLAG48,
    "FLAG49": FLAG49,
    "FLAG50": FLAG50,
    "FLAG51": FLAG51,
    "FLAG52": FLAG52,
    "FLAG53": FLAG53,
    "FLAG54": FLAG54,
    "FLAG55": FLAG55,
    "FLAG56": FLAG56,
    "FLAG57": FLAG57,
    "FLAG58": FLAG58,
    "FLAG59": FLAG59,
    "FLAG60": FLAG60,
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
@@ -92,12 +46,11 @@ IDS = {
}


# ATTR IDs, in order of the symbol
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
NAMES = {v: k for k, v in IDS.items()}
locals().update(IDS)


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
def intify_attrs(stringy_attrs, strings_map=None):
    """
    Normalize a dictionary of attributes, converting them to ints.
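The hunk above replaces the position-sorted NAMES list with a dict keyed by attribute ID. A minimal sketch of the behavioural difference, using made-up IDs rather than the real symbol values:

# Minimal sketch with hypothetical IDs (the real values now come from spacy.symbols):
# as a dict, NAMES supports reverse lookup even when the IDs are sparse.
IDS = {"ORTH": 65, "LEMMA": 73}           # hypothetical, non-contiguous IDs
NAMES = {v: k for k, v in IDS.items()}    # mirrors the new attrs.pyx line
assert NAMES[73] == "LEMMA"               # lookup by ID, no dense list required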
@@ -109,75 +62,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    converted to ints.
    """
    inty_attrs = {}
    if _do_deprecated:
        if "F" in stringy_attrs:
            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
        if "L" in stringy_attrs:
            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
        if "pos" in stringy_attrs:
            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
        if "morph" in stringy_attrs:
            morphs = stringy_attrs.pop("morph")
        if "number" in stringy_attrs:
            stringy_attrs.pop("number")
        if "tenspect" in stringy_attrs:
            stringy_attrs.pop("tenspect")
        morph_keys = [
            "PunctType",
            "PunctSide",
            "Other",
            "Degree",
            "AdvType",
            "Number",
            "VerbForm",
            "PronType",
            "Aspect",
            "Tense",
            "PartType",
            "Poss",
            "Hyph",
            "ConjType",
            "NumType",
            "Foreign",
            "VerbType",
            "NounType",
            "Gender",
            "Mood",
            "Negative",
            "Tense",
            "Voice",
            "Abbr",
            "Derivation",
            "Echo",
            "Foreign",
            "NameType",
            "NounType",
            "NumForm",
            "NumValue",
            "PartType",
            "Polite",
            "StyleVariant",
            "PronType",
            "AdjType",
            "Person",
            "Variant",
            "AdpType",
            "Reflex",
            "Negative",
            "Mood",
            "Aspect",
            "Case",
            "Polarity",
            "PrepCase",
            "Animacy",  # U20
        ]
        for key in morph_keys:
            if key in stringy_attrs:
                stringy_attrs.pop(key)
            elif key.lower() in stringy_attrs:
                stringy_attrs.pop(key.lower())
            elif key.upper() in stringy_attrs:
                stringy_attrs.pop(key.upper())
    for name, value in stringy_attrs.items():
        int_key = intify_attr(name)
        if int_key is not None:
@@ -8,7 +8,6 @@ from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS


@app.command(

@@ -61,12 +60,6 @@ def download(
        version = components[-1]
    else:
        model_name = model
        if model in OLD_MODEL_SHORTCUTS:
            msg.warn(
                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
            )
            model_name = OLD_MODEL_SHORTCUTS[model]
    compatibility = get_compatibility()
    version = get_version(model_name, compatibility)
@@ -87,12 +87,11 @@ grad_factor = 1.0
factory = "parser"

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = false
nO = null

[components.parser.model.tok2vec]

@@ -108,12 +107,11 @@ grad_factor = 1.0
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]

@@ -314,12 +312,11 @@ width = ${components.tok2vec.model.encode.width}
factory = "parser"

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null

[components.parser.model.tok2vec]

@@ -332,12 +329,11 @@ width = ${components.tok2vec.model.encode.width}
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@@ -131,13 +131,6 @@ class Warnings(metaclass=ErrorsWithCodes):
            "and make it independent. For example, `replace_listeners = "
            "[\"model.tok2vec\"]` See the documentation for details: "
            "https://spacy.io/usage/training#config-components-listeners")
    W088 = ("The pipeline component {name} implements a `begin_training` "
            "method, which won't be called by spaCy. As of v3.0, `begin_training` "
            "has been renamed to `initialize`, so you likely want to rename the "
            "component method. See the documentation for details: "
            "https://spacy.io/api/language#initialize")
    W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
            "to `nlp.initialize`.")
    W090 = ("Could not locate any {format} files in path '{path}'.")
    W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
    W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")

@@ -216,6 +209,8 @@ class Warnings(metaclass=ErrorsWithCodes):
            "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
    W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")

    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")


class Errors(metaclass=ErrorsWithCodes):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

@@ -251,9 +246,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "https://spacy.io/usage/models")
    E011 = ("Unknown operator: '{op}'. Options: {opts}")
    E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
    E016 = ("MultitaskObjective target should be function or one of: dep, "
            "tag, ent, dep_tag_offset, ent_tag.")
    E017 = ("Can only add unicode or bytes. Got type: {value_type}")
    E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
    E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
            "refers to an issue with the `Vocab` or `StringStore`.")
    E019 = ("Can't create transition with unknown action ID: {action}. Action "

@@ -466,13 +459,13 @@ class Errors(metaclass=ErrorsWithCodes):
            "same, but found '{nlp}' and '{vocab}' respectively.")
    E152 = ("The attribute {attr} is not supported for token patterns. "
            "Please use the option `validate=True` with the Matcher, PhraseMatcher, "
            "EntityRuler or AttributeRuler for more details.")
            "SpanRuler or AttributeRuler for more details.")
    E153 = ("The value type {vtype} is not supported for token patterns. "
            "Please use the option validate=True with Matcher, PhraseMatcher, "
            "EntityRuler or AttributeRuler for more details.")
            "SpanRuler or AttributeRuler for more details.")
    E154 = ("One of the attributes or values is not supported for token "
            "patterns. Please use the option `validate=True` with the Matcher, "
            "PhraseMatcher, or EntityRuler for more details.")
            "PhraseMatcher, or SpanRuler for more details.")
    E155 = ("The pipeline needs to include a {pipe} in order to use "
            "Matcher or PhraseMatcher with the attribute {attr}. "
            "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "

@@ -496,7 +489,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "Current DocBin: {current}\nOther DocBin: {other}")
    E169 = ("Can't find module: {module}")
    E170 = ("Cannot apply transition {name}: invalid for the current state.")
    E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
    E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
            "callable or None, but got: {arg_type}")
    E175 = ("Can't remove rule for unknown match pattern ID: {key}")
    E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")

@@ -733,13 +726,6 @@ class Errors(metaclass=ErrorsWithCodes):
            "method in component '{name}'. If you want to use this "
            "method, make sure it's overwritten on the subclass.")
    E940 = ("Found NaN values in scores.")
    E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
            "model from a shortcut, which is obsolete as of spaCy v3.0. To "
            "load the model, use its full name instead:\n\n"
            "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
            "models, see the models directory: https://spacy.io/models. If you "
            "want to create a blank model, use spacy.blank: "
            "nlp = spacy.blank(\"{name}\")")
    E942 = ("Executing `after_{name}` callback failed. Expected the function to "
            "return an initialized nlp object but got: {value}. Maybe "
            "you forgot to return the modified object in your function?")

@@ -753,7 +739,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "loaded nlp object, but got: {source}")
    E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
            "a string value from {expected} but got: '{arg}'")
    E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
    E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
            "a list, but got: {arg_type}")
    E949 = ("Unable to align tokens for the predicted and reference docs. It "
            "is only possible to align the docs when both texts are the same "

@@ -927,8 +913,6 @@ class Errors(metaclass=ErrorsWithCodes):
    E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
             "Non-UD tags should use the `tag` property.")
    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
             "exist.")
    E1024 = ("A pattern with {attr_type} '{label}' is not present in "
             "'{component}' patterns.")
    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "

@@ -969,14 +953,14 @@ class Errors(metaclass=ErrorsWithCodes):
    E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port)` "
             "or use `auto_switch_port=True` to pick an available port automatically.")


# Deprecated model shortcuts, only used in errors and warnings
OLD_MODEL_SHORTCUTS = {
    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
}
    # v4 error strings
    E4000 = ("Expected a Doc as input, but got: '{type}'")
    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
             "but got '{received_type}'")
    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
    E4003 = ("Training examples for distillation must have the exact same tokens in the "
             "reference and predicted docs.")
    E4004 = ("Backprop is not supported when is_train is not set.")


# fmt: on
@@ -18,34 +18,23 @@ DEFAULT_CONFIG = """

[nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer"
mecab_args = ""
"""


@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer():
def create_tokenizer(mecab_args: str):
    def korean_tokenizer_factory(nlp):
        return KoreanTokenizer(nlp.vocab)
        return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)

    return korean_tokenizer_factory


class KoreanTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
    def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
        self.vocab = vocab
        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
        self._mecab_tokenizer = None

    @property
    def mecab_tokenizer(self):
        # This is a property so that initializing a pipeline with blank:ko is
        # possible without actually requiring mecab-ko, e.g. to run
        # `spacy init vectors ko` for a pipeline that will have a different
        # tokenizer in the end. The languages need to match for the vectors
        # to be imported and there's no way to pass a custom config to
        # `init vectors`.
        if self._mecab_tokenizer is None:
            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
        return self._mecab_tokenizer
        mecab = try_mecab_import()
        self.mecab_tokenizer = mecab.Tagger(mecab_args)

    def __reduce__(self):
        return KoreanTokenizer, (self.vocab,)
@@ -68,13 +57,15 @@ class KoreanTokenizer(DummyTokenizer):
    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
        for line in self.mecab_tokenizer.parse(text).split("\n"):
            if line == "EOS":
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            surface, _, expr = line.partition("\t")
            features = expr.split("/")[0].split(",")
            tag = features[0]
            lemma = "*"
            if len(features) >= 8:
                lemma = features[7]
            if lemma == "*":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}
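The new branch parses mecab-ko's plain-text output instead of natto-py node objects. A hedged illustration of the parsing logic above, using a made-up output line in the "surface<TAB>features" shape the code expects (the real field layout is defined by mecab-ko-dic):

# Illustrative only: the line contents are invented, the logic mirrors detailed_tokens.
line = "spaCy\tSL,*,*,*,*,*,*,*"
surface, _, expr = line.partition("\t")
features = expr.split("/")[0].split(",")
tag = features[0]                                   # "SL"
lemma = features[7] if len(features) >= 8 else "*"
if lemma == "*":
    lemma = surface                                 # fall back to the surface form
print({"surface": surface, "lemma": lemma, "tag": tag})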
@@ -97,20 +88,94 @@ class Korean(Language):
    Defaults = KoreanDefaults


def try_mecab_import() -> None:
def try_mecab_import():
    try:
        from natto import MeCab
        import mecab_ko as MeCab

        return MeCab
    except ImportError:
        raise ImportError(
            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
            "the python package `mecab-ko`: pip install mecab-ko"
        ) from None


@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
def create_natto_tokenizer():
    def korean_natto_tokenizer_factory(nlp):
        return KoreanNattoTokenizer(nlp.vocab)

    return korean_natto_tokenizer_factory


class KoreanNattoTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
        self.vocab = vocab
        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
        self._mecab_tokenizer = None

    @property
    def mecab_tokenizer(self):
        # This is a property so that initializing a pipeline with blank:ko is
        # possible without actually requiring mecab-ko, e.g. to run
        # `spacy init vectors ko` for a pipeline that will have a different
        # tokenizer in the end. The languages need to match for the vectors
        # to be imported and there's no way to pass a custom config to
        # `init vectors`.
        if self._mecab_tokenizer is None:
            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
        return self._mecab_tokenizer

    def __reduce__(self):
        return KoreanNattoTokenizer, (self.vocab,)

    def __call__(self, text: str) -> Doc:
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
            if token.tag_ in TAG_MAP:
                token.pos = TAG_MAP[token.tag_][POS]
            else:
                token.pos = X
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            if lemma == "*" or lemma == "":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}

    def score(self, examples):
        validate_examples(examples, "KoreanTokenizer.score")
        return Scorer.score_tokenization(examples)

    def _try_mecab_import(self):
        try:
            from natto import MeCab

            return MeCab
        except ImportError:
            raise ImportError(
                'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
                "and [natto-py](https://github.com/buruzaemon/natto-py)"
            ) from None


def check_spaces(text, tokens):
    prev_end = -1
    start = 0
@@ -17,10 +17,6 @@ URL_PATTERN = (
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
@@ -1239,15 +1239,6 @@ class Language:
                    sgd(key, W, dW)  # type: ignore[call-arg, misc]
        return losses

    def begin_training(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
        *,
        sgd: Optional[Optimizer] = None,
    ) -> Optimizer:
        warnings.warn(Warnings.W089, DeprecationWarning)
        return self.initialize(get_examples, sgd=sgd)

    def initialize(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
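With the deprecated begin_training shim removed, initialization goes through initialize only. A minimal sketch, assuming a blank pipeline where get_examples can be omitted:

import spacy

nlp = spacy.blank("en")
optimizer = nlp.initialize()   # replaces the removed nlp.begin_training()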
@@ -5,7 +5,6 @@ from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG

from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab
@@ -20,7 +20,6 @@ class Lexeme:
    def vector_norm(self) -> float: ...
    vector: Floats1d
    rank: int
    sentiment: float
    @property
    def orth_(self) -> str: ...
    @property
@@ -173,19 +173,6 @@ cdef class Lexeme:
        def __set__(self, value):
            self.c.id = value

    property sentiment:
        """RETURNS (float): A scalar value indicating the positivity or
        negativity of the lexeme."""
        def __get__(self):
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
            return sentiment_table.get(self.c.orth, 0.0)

        def __set__(self, float x):
            if "lexeme_sentiment" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_sentiment")
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
            sentiment_table[self.c.orth] = x

    @property
    def orth_(self):
        """RETURNS (str): The original verbatim text of the lexeme
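The removed property only read and wrote a "lexeme_sentiment" lookups table, so equivalent behaviour can still be reproduced by using that table directly. A minimal sketch of that workaround:

import spacy

nlp = spacy.blank("en")
# The removed Lexeme.sentiment property was a thin wrapper around this table.
nlp.vocab.lookups.add_table("lexeme_sentiment")
table = nlp.vocab.lookups.get_table("lexeme_sentiment")
table[nlp.vocab.strings["good"]] = 0.9
print(table.get(nlp.vocab.strings["good"], 0.0))  # 0.9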
@@ -165,9 +165,9 @@ cdef class DependencyMatcher:
        on_match (callable): Optional callback executed on match.
        """
        if on_match is not None and not hasattr(on_match, "__call__"):
            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
        if patterns is None or not isinstance(patterns, List):  # old API
            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
            raise ValueError(Errors.E171.format(name="DependencyMatcher", arg_type=type(on_match)))
        if patterns is None or not isinstance(patterns, List):
            raise ValueError(Errors.E948.format(name="DependencyMatcher", arg_type=type(patterns)))
        for pattern in patterns:
            if len(pattern) == 0:
                raise ValueError(Errors.E012.format(key=key))
@@ -23,7 +23,7 @@ from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from .levenshtein import levenshtein_compare
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
from ..strings import get_string_id
from ..strings cimport get_string_id
from ..attrs import IDS
from ..util import registry

@@ -115,9 +115,9 @@ cdef class Matcher:
        """
        errors = {}
        if on_match is not None and not hasattr(on_match, "__call__"):
            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
        if patterns is None or not isinstance(patterns, List):  # old API
            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
            raise ValueError(Errors.E171.format(name="Matcher", arg_type=type(on_match)))
        if patterns is None or not isinstance(patterns, List):
            raise ValueError(Errors.E948.format(name="Matcher", arg_type=type(patterns)))
        if greedy is not None and greedy not in ["FIRST", "LONGEST"]:
            raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy))
        for i, pattern in enumerate(patterns):

@@ -265,6 +265,10 @@ cdef class Matcher:
        # non-overlapping ones this `match` can be either (start, end) or
        # (start, end, alignments) depending on `with_alignments=` option.
        for key, *match in matches:
            # Adjust span matches to doc offsets
            if isinstance(doclike, Span):
                match[0] += doclike.start
                match[1] += doclike.start
            span_filter = self._filter.get(key)
            if span_filter is not None:
                pairs = pairs_by_id.get(key, [])

@@ -295,9 +299,6 @@ cdef class Matcher:
        if as_spans:
            final_results = []
            for key, start, end, *_ in final_matches:
                if isinstance(doclike, Span):
                    start += doclike.start
                    end += doclike.start
                final_results.append(Span(doc, start, end, label=key))
        elif with_alignments:
            # convert alignments List[Dict[str, int]] --> List[int]
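The last two hunks move the doc-offset adjustment from the as_spans branch to the point where matches are collected, so matches produced on a Span are reported in document coordinates in every output form. A minimal sketch of the expected behaviour after this change:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
doc = nlp("one two three four")
matcher = Matcher(nlp.vocab)
matcher.add("THREE", [[{"LOWER": "three"}]])

span = doc[2:]                      # "three four"
match_id, start, end = matcher(span)[0]
print(start, end)                   # 2 3 -> offsets into the parent Doc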
@@ -20,6 +20,15 @@ class PhraseMatcher:
            Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
        ] = ...,
    ) -> None: ...
    def _add_from_arrays(
        self,
        key: str,
        specs: List[List[int]],
        *,
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
        ] = ...,
    ) -> None: ...
    def remove(self, key: str) -> None: ...
    @overload
    def __call__(
@@ -1,4 +1,6 @@
# cython: infer_types=True, profile=True
from typing import List
from collections import defaultdict
from libc.stdint cimport uintptr_t
from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter

@@ -39,7 +41,7 @@ cdef class PhraseMatcher:
        """
        self.vocab = vocab
        self._callbacks = {}
        self._docs = {}
        self._docs = defaultdict(set)
        self._validate = validate

        self.mem = Pool()

@@ -155,66 +157,24 @@ cdef class PhraseMatcher:
        del self._callbacks[key]
        del self._docs[key]

    def add(self, key, docs, *_docs, on_match=None):
        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
        key, an on_match callback, and one or more patterns.

        Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the
        second argument, with the on_match callback as an optional keyword
        argument.
    def _add_from_arrays(self, key, specs, *, on_match=None):
        """Add a preprocessed list of specs, with an optional callback.

        key (str): The match ID.
        docs (list): List of `Doc` objects representing match patterns.
        specs (List[List[int]]): A list of lists of hashes to match.
        on_match (callable): Callback executed on match.
        *_docs (Doc): For backwards compatibility: list of patterns to add
            as variable arguments. Will be ignored if a list of patterns is
            provided as the second argument.

        DOCS: https://spacy.io/api/phrasematcher#add
        """
        if docs is None or hasattr(docs, "__call__"):  # old API
            on_match = docs
            docs = _docs

        _ = self.vocab[key]
        self._callbacks[key] = on_match
        self._docs.setdefault(key, set())

        cdef MapStruct* current_node
        cdef MapStruct* internal_node
        cdef void* result

        if isinstance(docs, Doc):
            raise ValueError(Errors.E179.format(key=key))
        for doc in docs:
            if len(doc) == 0:
                continue
            if isinstance(doc, Doc):
                attrs = (TAG, POS, MORPH, LEMMA, DEP)
                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
                for attr in attrs:
                    if self.attr == attr and not has_annotation[attr]:
                        if attr == TAG:
                            pipe = "tagger"
                        elif attr in (POS, MORPH):
                            pipe = "morphologizer or tagger+attribute_ruler"
                        elif attr == LEMMA:
                            pipe = "lemmatizer"
                        elif attr == DEP:
                            pipe = "parser"
                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
                        raise ValueError(error_msg)
                if self._validate and any(has_annotation.values()) \
                        and self.attr not in attrs:
                    string_attr = self.vocab.strings[self.attr]
                    warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
                keyword = self._convert_to_array(doc)
            else:
                keyword = doc
            self._docs[key].add(tuple(keyword))
        self._callbacks[key] = on_match
        for spec in specs:
            self._docs[key].add(tuple(spec))

            current_node = self.c_map
            for token in keyword:
            for token in spec:
                if token == self._terminal_hash:
                    warnings.warn(Warnings.W021)
                    break
@@ -233,6 +193,57 @@ cdef class PhraseMatcher:
                result = internal_node
            map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)


    def add(self, key, docs, *, on_match=None):
        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
        key, a list of one or more patterns, and (optionally) an on_match callback.

        key (str): The match ID.
        docs (list): List of `Doc` objects representing match patterns.
        on_match (callable): Callback executed on match.

        If any of the input Docs are invalid, no internal state will be updated.

        DOCS: https://spacy.io/api/phrasematcher#add
        """
        if isinstance(docs, Doc):
            raise ValueError(Errors.E179.format(key=key))
        if docs is None or not isinstance(docs, List):
            raise ValueError(Errors.E948.format(name="PhraseMatcher", arg_type=type(docs)))
        if on_match is not None and not hasattr(on_match, "__call__"):
            raise ValueError(Errors.E171.format(name="PhraseMatcher", arg_type=type(on_match)))

        _ = self.vocab[key]
        specs = []

        for doc in docs:
            if len(doc) == 0:
                continue
            if not isinstance(doc, Doc):
                raise ValueError(Errors.E4000.format(type=type(doc)))

            attrs = (TAG, POS, MORPH, LEMMA, DEP)
            has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
            for attr in attrs:
                if self.attr == attr and not has_annotation[attr]:
                    if attr == TAG:
                        pipe = "tagger"
                    elif attr in (POS, MORPH):
                        pipe = "morphologizer or tagger+attribute_ruler"
                    elif attr == LEMMA:
                        pipe = "lemmatizer"
                    elif attr == DEP:
                        pipe = "parser"
                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
                    raise ValueError(error_msg)
            if self._validate and any(has_annotation.values()) \
                    and self.attr not in attrs:
                string_attr = self.vocab.strings[self.attr]
                warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
            specs.append(self._convert_to_array(doc))

        self._add_from_arrays(key, specs, on_match=on_match)

    def __call__(self, object doclike, *, as_spans=False):
        """Find all sequences matching the supplied patterns on the `Doc`.

@@ -345,7 +356,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
    matcher = PhraseMatcher(vocab, attr=attr)
    for key, specs in docs.items():
        callback = callbacks.get(key, None)
        matcher.add(key, specs, on_match=callback)
        matcher._add_from_arrays(key, specs, on_match=callback)
    return matcher
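After this refactor, PhraseMatcher.add only accepts a list of Doc patterns with on_match as a keyword argument; pattern validation happens up front and the converted arrays are handed to _add_from_arrays (which unpickling also uses). A minimal sketch of the stricter public API:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp.make_doc("Barack Obama")])      # list of Docs: OK
# matcher.add("OBAMA", nlp.make_doc("Barack Obama"))      # bare Doc: raises E179
# matcher.add("OBAMA", print, [nlp.make_doc("x")])        # old positional callback: raises E948
doc = nlp("Barack Obama visited Berlin")
print(matcher(doc))                                       # [(match_id, 0, 2)]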
@@ -1,164 +0,0 @@
from thinc.api import Model, normal_init

from ..util import registry


@registry.layers("spacy.PrecomputableAffine.v1")
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
    model = Model(
        "precomputable_affine",
        forward,
        init=init,
        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
        params={"W": None, "b": None, "pad": None},
        attrs={"dropout_rate": dropout},
    )
    return model


def forward(model, X, is_train):
    nF = model.get_dim("nF")
    nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.get_param("W")
    # Preallocate array for layer output, including padding.
    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))

    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
    # change its shape to (nF, nO, nP) without breaking existing models. So
    # we'll squeeze the first dimension here.
    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)

    def backward(dY_ids):
        # This backprop is particularly tricky, because we get back a different
        # thing from what we put out. We put out an array of shape:
        # (nB, nF, nO, nP), and get back:
        # (nB, nO, nP) and ids (nB, nF)
        # The ids tell us the values of nF, so we would have:
        #
        # dYf = zeros((nB, nF, nO, nP))
        # for b in range(nB):
        #     for f in range(nF):
        #         dYf[b, ids[b, f]] += dY[b]
        #
        # However, we avoid building that array for efficiency -- and just pass
        # in the indices.
        dY, ids = dY_ids
        assert dY.ndim == 3
        assert dY.shape[1] == nO, dY.shape
        assert dY.shape[2] == nP, dY.shape
        # nB = dY.shape[0]
        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
        Xf = X[ids]
        Xf = Xf.reshape((Xf.shape[0], nF * nI))

        model.inc_grad("b", dY.sum(axis=0))
        dY = dY.reshape((dY.shape[0], nO * nP))

        Wopfi = W.transpose((1, 2, 0, 3))
        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)

        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
        # (o, p, f, i) --> (f, o, p, i)
        dWopfi = dWopfi.transpose((2, 0, 1, 3))
        model.inc_grad("W", dWopfi)
        return dXf.reshape((dXf.shape[0], nF, nI))

    return Yf, backward


def _backprop_precomputable_affine_padding(model, dY, ids):
    nB = dY.shape[0]
    nF = model.get_dim("nF")
    nP = model.get_dim("nP")
    nO = model.get_dim("nO")
    # Backprop the "padding", used as a filler for missing values.
    # Values that are missing are set to -1, and each state vector could
    # have multiple missing values. The padding has different values for
    # different missing features. The gradient of the padding vector is:
    #
    # for b in range(nB):
    #     for f in range(nF):
    #         if ids[b, f] < 0:
    #             d_pad[f] += dY[b]
    #
    # Which can be rewritten as:
    #
    # (ids < 0).T @ dY
    mask = model.ops.asarray(ids < 0, dtype="f")
    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
    return d_pad.reshape((1, nF, nO, nP))


def init(model, X=None, Y=None):
    """This is like the 'layer sequential unit variance', but instead
    of taking the actual inputs, we randomly generate whitened data.

    Why's this all so complicated? We have a huge number of inputs,
    and the maxout unit makes guessing the dynamics tricky. Instead
    we set the maxout weights to values that empirically result in
    whitened outputs given whitened inputs.
    """
    if model.has_param("W") and model.get_param("W").any():
        return

    nF = model.get_dim("nF")
    nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.ops.alloc4f(nF, nO, nP, nI)
    b = model.ops.alloc2f(nO, nP)
    pad = model.ops.alloc4f(1, nF, nO, nP)

    ops = model.ops
    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
    pad = normal_init(ops, pad.shape, mean=1.0)
    model.set_param("W", W)
    model.set_param("b", b)
    model.set_param("pad", pad)

    ids = ops.alloc((5000, nF), dtype="f")
    ids += ops.xp.random.uniform(0, 1000, ids.shape)
    ids = ops.asarray(ids, dtype="i")
    tokvecs = ops.alloc((5000, nI), dtype="f")
    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
        tokvecs.shape
    )

    def predict(ids, tokvecs):
        # nS ids. nW tokvecs. Exclude the padding array.
        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
        # need nS vectors
        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
        vectors = vectors.reshape((vectors.shape[0], nO, nP))
        vectors += b
        vectors = model.ops.asarray(vectors)
        if nP >= 2:
            return model.ops.maxout(vectors)[0]
        else:
            return vectors * (vectors >= 0)

    tol_var = 0.01
    tol_mean = 0.01
    t_max = 10
    W = model.get_param("W").copy()
    b = model.get_param("b").copy()
    for t_i in range(t_max):
        acts1 = predict(ids, tokvecs)
        var = model.ops.xp.var(acts1)
        mean = model.ops.xp.mean(acts1)
        if abs(var - 1.0) >= tol_var:
            W /= model.ops.xp.sqrt(var)
            model.set_param("W", W)
        elif abs(mean) >= tol_mean:
            b -= mean
            model.set_param("b", b)
        else:
            break
@@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
    "update",
    "rehearse",
    "get_loss",
    "get_teacher_student_loss",
    "initialize",
    "begin_update",
    "finish_update",
@@ -1,17 +1,20 @@
from typing import Optional, List, cast
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from typing import Optional, List, Tuple, Any
from thinc.types import Floats2d
from thinc.api import Model
import warnings

from ...errors import Errors
from ...errors import Errors, Warnings
from ...compat import Literal
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
from ...tokens import Doc
from ...tokens.doc import Doc

TransitionSystem = Any  # TODO
State = Any  # TODO


@registry.architectures("spacy.TransitionBasedParser.v2")
def build_tb_parser_model(
@registry.architectures.register("spacy.TransitionBasedParser.v2")
def transition_parser_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,

@@ -19,6 +22,46 @@ def build_tb_parser_model(
    maxout_pieces: int,
    use_upper: bool,
    nO: Optional[int] = None,
) -> Model:
    if not use_upper:
        warnings.warn(Warnings.W400)

    return build_tb_parser_model(
        tok2vec,
        state_type,
        extra_state_tokens,
        hidden_width,
        maxout_pieces,
        nO=nO,
    )


@registry.architectures.register("spacy.TransitionBasedParser.v3")
def transition_parser_v3(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,
    hidden_width: int,
    maxout_pieces: int,
    nO: Optional[int] = None,
) -> Model:
    return build_tb_parser_model(
        tok2vec,
        state_type,
        extra_state_tokens,
        hidden_width,
        maxout_pieces,
        nO=nO,
    )


def build_tb_parser_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,
    hidden_width: int,
    maxout_pieces: int,
    nO: Optional[int] = None,
) -> Model:
    """
    Build a transition-based parser model. Can apply to NER or dependency-parsing.
@@ -51,14 +94,7 @@ def build_tb_parser_model(
        feature sets (for the NER) or 13 (for the parser).
    hidden_width (int): The width of the hidden layer.
    maxout_pieces (int): How many pieces to use in the state prediction layer.
        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
        is replaced with a ReLu non-linearity if use_upper=True, and no
        non-linearity if use_upper=False.
    use_upper (bool): Whether to use an additional hidden layer after the state
        vector in order to predict the action scores. It is recommended to set
        this to False for large pretrained models such as transformers, and True
        for smaller networks. The upper layer is computed on CPU, which becomes
        a bottleneck on larger GPU-based models, where it's also less necessary.
        Recommended values are 1, 2 or 3.
    nO (int or None): The number of actions the model will predict between.
        Usually inferred from data at the beginning of training, or loaded from
        disk.
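The hunks above keep spacy.TransitionBasedParser.v2 registered as a thin wrapper that warns (W400 when use_upper=False is passed) and delegates to the same builder as the new v3 entry, so existing configs keep resolving. A small sketch, assuming this dev branch is installed; the exact registry lookup call is an assumption about the catalogue-based registry API:

import spacy

# Both architecture names resolve to registered builders; per the diff, v2
# ignores use_upper and forwards its remaining arguments, exactly like v3.
v2 = spacy.registry.architectures.get("spacy.TransitionBasedParser.v2")
v3 = spacy.registry.architectures.get("spacy.TransitionBasedParser.v3")
print(v2.__name__, v3.__name__)   # transition_parser_v2 transition_parser_v3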
@@ -69,106 +105,11 @@ def build_tb_parser_model(
        nr_feature_tokens = 6 if extra_state_tokens else 3
    else:
        raise ValueError(Errors.E917.format(value=state_type))
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    tok2vec = chain(
        tok2vec,
        list2array(),
        Linear(hidden_width, t2v_width),
    return TransitionModel(
        tok2vec=tok2vec,
        state_tokens=nr_feature_tokens,
        hidden_width=hidden_width,
        maxout_pieces=maxout_pieces,
        nO=nO,
        unseen_classes=set(),
    )
    tok2vec.set_dim("nO", hidden_width)
    lower = _define_lower(
        nO=hidden_width if use_upper else nO,
        nF=nr_feature_tokens,
        nI=tok2vec.get_dim("nO"),
        nP=maxout_pieces,
    )
    upper = None
    if use_upper:
        with use_ops("cpu"):
            # Initialize weights at zero, as it's a classification layer.
            upper = _define_upper(nO=nO, nI=None)
    return TransitionModel(tok2vec, lower, upper, resize_output)


def _define_upper(nO, nI):
    return Linear(nO=nO, nI=nI, init_W=zero_init)


def _define_lower(nO, nF, nI, nP):
    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)


def resize_output(model, new_nO):
    if model.attrs["has_upper"]:
        return _resize_upper(model, new_nO)
    return _resize_lower(model, new_nO)


def _resize_upper(model, new_nO):
    upper = model.get_ref("upper")
    if upper.has_dim("nO") is None:
        upper.set_dim("nO", new_nO)
        return model
    elif new_nO == upper.get_dim("nO"):
        return model

    smaller = upper
    nI = smaller.maybe_get_dim("nI")
    with use_ops("cpu"):
        larger = _define_upper(nO=new_nO, nI=nI)
    # it could be that the model is not initialized yet, then skip this bit
    if smaller.has_param("W"):
        larger_W = larger.ops.alloc2f(new_nO, nI)
        larger_b = larger.ops.alloc1f(new_nO)
        smaller_W = smaller.get_param("W")
        smaller_b = smaller.get_param("b")
        # Weights are stored in (nr_out, nr_in) format, so we're basically
        # just adding rows here.
        if smaller.has_dim("nO"):
            old_nO = smaller.get_dim("nO")
            larger_W[:old_nO] = smaller_W
            larger_b[:old_nO] = smaller_b
            for i in range(old_nO, new_nO):
                model.attrs["unseen_classes"].add(i)

        larger.set_param("W", larger_W)
        larger.set_param("b", larger_b)
    model._layers[-1] = larger
    model.set_ref("upper", larger)
    return model


def _resize_lower(model, new_nO):
    lower = model.get_ref("lower")
    if lower.has_dim("nO") is None:
        lower.set_dim("nO", new_nO)
        return model

    smaller = lower
    nI = smaller.maybe_get_dim("nI")
    nF = smaller.maybe_get_dim("nF")
    nP = smaller.maybe_get_dim("nP")
    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
    # it could be that the model is not initialized yet, then skip this bit
    if smaller.has_param("W"):
        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
        larger_b = larger.ops.alloc2f(new_nO, nP)
        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
        smaller_W = smaller.get_param("W")
        smaller_b = smaller.get_param("b")
        smaller_pad = smaller.get_param("pad")
        # Copy the old weights and padding into the new layer
        if smaller.has_dim("nO"):
            old_nO = smaller.get_dim("nO")
            larger_W[:, 0:old_nO, :, :] = smaller_W
            larger_pad[:, :, 0:old_nO, :] = smaller_pad
            larger_b[0:old_nO, :] = smaller_b
            for i in range(old_nO, new_nO):
                model.attrs["unseen_classes"].add(i)

        larger.set_param("W", larger_W)
        larger.set_param("b", larger_b)
        larger.set_param("pad", larger_pad)
    model._layers[1] = larger
    model.set_ref("lower", larger)
    return model
@@ -7,7 +7,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from ...tokens import Doc
from ...util import registry
from ...errors import Errors
from ...ml import _character_embed
from ...ml import character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener

@@ -226,7 +226,7 @@ def CharacterEmbed(
    if feature is None:
        raise ValueError(Errors.E911.format(feat=feature))
    char_embed = chain(
        _character_embed.CharacterEmbed(nM=nM, nC=nC),
        character_embed.CharacterEmbed(nM=nM, nC=nC),
        cast(Model[List[Floats2d], Ragged], list2ragged()),
    )
    feature_extractor: Model[List[Doc], Ragged] = chain(
@@ -1,49 +0,0 @@
from libc.string cimport memset, memcpy
from thinc.backends.cblas cimport CBlas
from ..typedefs cimport weight_t, hash_t
from ..pipeline._parser_internals._state cimport StateC


cdef struct SizesC:
    int states
    int classes
    int hiddens
    int pieces
    int feats
    int embed_width


cdef struct WeightsC:
    const float* feat_weights
    const float* feat_bias
    const float* hidden_bias
    const float* hidden_weights
    const float* seen_classes


cdef struct ActivationsC:
    int* token_ids
    float* unmaxed
    float* scores
    float* hiddens
    int* is_valid
    int _curr_size
    int _max_size


cdef WeightsC get_c_weights(model) except *

cdef SizesC get_c_sizes(model, int batch_size) except *

cdef ActivationsC alloc_activations(SizesC n) nogil

cdef void free_activations(const ActivationsC* A) nogil

cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
        const WeightsC* W, SizesC n) nogil

cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil

cdef void cpu_log_loss(float* d_scores,
        const float* costs, const int* is_valid, const float* scores, int O) nogil

spacy/ml/parser_model.pyx (deleted)
@@ -1,492 +0,0 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
cimport numpy as np
from libc.math cimport exp
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from thinc.backends.linalg cimport Vec, VecVec
from thinc.backends.cblas cimport saxpy, sgemm

import numpy
import numpy.random
from thinc.api import Model, CupyOps, NumpyOps, get_ops

from .. import util
from ..errors import Errors
from ..typedefs cimport weight_t, class_t, hash_t
from ..pipeline._parser_internals.stateclass cimport StateClass


cdef WeightsC get_c_weights(model) except *:
    cdef WeightsC output
    cdef precompute_hiddens state2vec = model.state2vec
    output.feat_weights = state2vec.get_feat_weights()
    output.feat_bias = <const float*>state2vec.bias.data
    cdef np.ndarray vec2scores_W
    cdef np.ndarray vec2scores_b
    if model.vec2scores is None:
        output.hidden_weights = NULL
        output.hidden_bias = NULL
    else:
        vec2scores_W = model.vec2scores.get_param("W")
        vec2scores_b = model.vec2scores.get_param("b")
        output.hidden_weights = <const float*>vec2scores_W.data
        output.hidden_bias = <const float*>vec2scores_b.data
    cdef np.ndarray class_mask = model._class_mask
    output.seen_classes = <const float*>class_mask.data
    return output


cdef SizesC get_c_sizes(model, int batch_size) except *:
    cdef SizesC output
    output.states = batch_size
    if model.vec2scores is None:
        output.classes = model.state2vec.get_dim("nO")
    else:
        output.classes = model.vec2scores.get_dim("nO")
    output.hiddens = model.state2vec.get_dim("nO")
    output.pieces = model.state2vec.get_dim("nP")
    output.feats = model.state2vec.get_dim("nF")
    output.embed_width = model.tokvecs.shape[1]
    return output


cdef ActivationsC alloc_activations(SizesC n) nogil:
    cdef ActivationsC A
    memset(&A, 0, sizeof(A))
    resize_activations(&A, n)
    return A


cdef void free_activations(const ActivationsC* A) nogil:
    free(A.token_ids)
    free(A.scores)
    free(A.unmaxed)
    free(A.hiddens)
    free(A.is_valid)


cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
        return
    if A._max_size == 0:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
        A._max_size = n.states
    else:
        A.token_ids = <int*>realloc(A.token_ids,
            n.states * n.feats * sizeof(A.token_ids[0]))
        A.scores = <float*>realloc(A.scores,
            n.states * n.classes * sizeof(A.scores[0]))
        A.unmaxed = <float*>realloc(A.unmaxed,
            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
        A.hiddens = <float*>realloc(A.hiddens,
            n.states * n.hiddens * sizeof(A.hiddens[0]))
        A.is_valid = <int*>realloc(A.is_valid,
            n.states * n.classes * sizeof(A.is_valid[0]))
        A._max_size = n.states
    A._curr_size = n.states


cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
        const WeightsC* W, SizesC n) nogil:
    cdef double one = 1.0
    resize_activations(A, n)
    for i in range(n.states):
        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
    sum_state_features(cblas, A.unmaxed,
        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
    for i in range(n.states):
        VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
            W.feat_bias, 1., n.hiddens * n.pieces)
        for j in range(n.hiddens):
            index = i * n.hiddens * n.pieces + j * n.pieces
            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
    memset(A.scores, 0, n.states * n.classes * sizeof(float))
    if W.hidden_weights == NULL:
        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
    else:
        # Compute hidden-to-output
        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
            1.0, <const float *>A.hiddens, n.hiddens,
            <const float *>W.hidden_weights, n.hiddens,
            0.0, A.scores, n.classes)
        # Add bias
        for i in range(n.states):
            VecVec.add_i(&A.scores[i*n.classes],
                W.hidden_bias, 1., n.classes)
    # Set unseen classes to minimum value
    i = 0
    min_ = A.scores[0]
    for i in range(1, n.states * n.classes):
        if A.scores[i] < min_:
            min_ = A.scores[i]
    for i in range(n.states):
        for j in range(n.classes):
            if not W.seen_classes[j]:
                A.scores[i*n.classes+j] = min_


cdef void sum_state_features(CBlas cblas, float* output,
        const float* cached, const int* token_ids, int B, int F, int O) nogil:
    cdef int idx, b, f, i
    cdef const float* feature
    padding = cached
    cached += F * O
    cdef int id_stride = F*O
    cdef float one = 1.
    for b in range(B):
        for f in range(F):
            if token_ids[f] < 0:
                feature = &padding[f*O]
            else:
                idx = token_ids[f] * id_stride + f*O
                feature = &cached[idx]
            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
        token_ids += F


cdef void cpu_log_loss(float* d_scores,
        const float* costs, const int* is_valid, const float* scores,
        int O) nogil:
    """Do multi-label log loss"""
    cdef double max_, gmax, Z, gZ
    best = arg_max_if_gold(scores, costs, is_valid, O)
    guess = Vec.arg_max(scores, O)
    if best == -1 or guess == -1:
        # These shouldn't happen, but if they do, we want to make sure we don't
        # cause an OOB access.
        return
    Z = 1e-10
    gZ = 1e-10
    max_ = scores[guess]
    gmax = scores[best]
    for i in range(O):
        Z += exp(scores[i] - max_)
        if costs[i] <= costs[best]:
            gZ += exp(scores[i] - gmax)
    for i in range(O):
        if costs[i] <= costs[best]:
            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
        else:
            d_scores[i] = exp(scores[i]-max_) / Z


cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
        const int* is_valid, int n) nogil:
    # Find minimum cost
    cdef float cost = 1
    for i in range(n):
        if is_valid[i] and costs[i] < cost:
            cost = costs[i]
    # Now find best-scoring with that cost
    cdef int best = -1
    for i in range(n):
        if costs[i] <= cost and is_valid[i]:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best


cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
    cdef int best = -1
    for i in range(n):
        if is_valid[i] >= 1:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best
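
# Illustrative numpy check (not part of the diff) of the gradient that
# cpu_log_loss computes above: a softmax over all classes minus a softmax
# restricted to the zero-cost ("gold") classes, ignoring the 1e-10 epsilons.
import numpy as np

scores = np.array([2.0, 1.0, 0.5])
costs = np.array([0.0, 1.0, 0.0])   # classes 0 and 2 are zero-cost ("gold")
p = np.exp(scores - scores.max())
p /= p.sum()
gold = costs <= costs.min()
g = np.where(gold, np.exp(scores - scores[gold].max()), 0.0)
g /= g.sum()
d_scores = p - g                    # non-gold classes keep the plain softmax term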


class ParserStepModel(Model):
    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
            dropout=0.1):
        Model.__init__(self, name="parser_step_model", forward=step_forward)
        self.attrs["has_upper"] = has_upper
        self.attrs["dropout_rate"] = dropout
        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
        if layers[1].get_dim("nP") >= 2:
            activation = "maxout"
        elif has_upper:
            activation = None
        else:
            activation = "relu"
        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
                                            activation=activation, train=train)
        if has_upper:
            self.vec2scores = layers[-1]
        else:
            self.vec2scores = None
        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
        self.backprops = []
        self._class_mask = numpy.zeros((self.nO,), dtype='f')
        self._class_mask.fill(1)
        if unseen_classes is not None:
            for class_ in unseen_classes:
                self._class_mask[class_] = 0.

    def clear_memory(self):
        del self.tokvecs
        del self.bp_tokvecs
        del self.state2vec
        del self.backprops
        del self._class_mask

    @property
    def nO(self):
        if self.attrs["has_upper"]:
            return self.vec2scores.get_dim("nO")
        else:
            return self.state2vec.get_dim("nO")

    def class_is_unseen(self, class_):
        return self._class_mask[class_]

    def mark_class_unseen(self, class_):
        self._class_mask[class_] = 0

    def mark_class_seen(self, class_):
        self._class_mask[class_] = 1

    def get_token_ids(self, states):
        cdef StateClass state
        states = [state for state in states if not state.is_final()]
        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
                                          dtype='i', order='C')
        ids.fill(-1)
        c_ids = <int*>ids.data
        for state in states:
            state.c.set_context_tokens(c_ids, ids.shape[1])
            c_ids += ids.shape[1]
        return ids

    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
        if isinstance(self.state2vec.ops, CupyOps) \
        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
            # Move token_ids and d_vector to GPU, asynchronously
            self.backprops.append((
                util.get_async(self.cuda_stream, token_ids),
                util.get_async(self.cuda_stream, d_vector),
                get_d_tokvecs
            ))
        else:
            self.backprops.append((token_ids, d_vector, get_d_tokvecs))

    def finish_steps(self, golds):
        # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
        # Tells CUDA to block, so our async copies complete.
        if self.cuda_stream is not None:
            self.cuda_stream.synchronize()
        for ids, d_vector, bp_vector in self.backprops:
            d_state_features = bp_vector((d_vector, ids))
            ids = ids.flatten()
            d_state_features = d_state_features.reshape(
                (ids.size, d_state_features.shape[2]))
            self.ops.scatter_add(d_tokvecs, ids,
                d_state_features)
        # Padded -- see update()
        self.bp_tokvecs(d_tokvecs[:-1])
        return d_tokvecs


NUMPY_OPS = NumpyOps()


def step_forward(model: ParserStepModel, states, is_train):
    token_ids = model.get_token_ids(states)
    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
    mask = None
    if model.attrs["has_upper"]:
        dropout_rate = model.attrs["dropout_rate"]
        if is_train and dropout_rate > 0:
            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
            vector *= mask
        scores, get_d_vector = model.vec2scores(vector, is_train)
    else:
        scores = NumpyOps().asarray(vector)
        get_d_vector = lambda d_scores: d_scores
    # If the class is unseen, make sure its score is minimum
    scores[:, model._class_mask == 0] = numpy.nanmin(scores)

    def backprop_parser_step(d_scores):
        # Zero vectors for unseen classes
        d_scores *= model._class_mask
        d_vector = get_d_vector(d_scores)
        if mask is not None:
            d_vector *= mask
        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
        return None
    return scores, backprop_parser_step


cdef class precompute_hiddens:
    """Allow a model to be "primed" by pre-computing input features in bulk.

    This is used for the parser, where we want to take a batch of documents,
    and compute vectors for each (token, position) pair. These vectors can then
    be reused, especially for beam-search.

    Let's say we're using 12 features for each state, e.g. word at start of
    buffer, three words on stack, their children, etc. In the normal arc-eager
    system, a document of length N is processed in 2*N states. This means we'll
    create 2*N*12 feature vectors --- but if we pre-compute, we only need
    N*12 vector computations. The saving for beam-search is much better:
    if we have a beam of k, we'll normally make 2*N*12*K computations --
    so we can save the factor k. This also gives a nice CPU/GPU division:
    we can do all our hard maths up front, packed into large multiplications,
    and do the hard-to-program parsing on the CPU.
    """
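    # Worked numbers for the docstring above (illustrative, not in the diff):
    # with N=100 tokens and 12 features per state, stepwise computation builds
    # 2*100*12 = 2400 feature vectors; precomputing needs only 100*12 = 1200.
    # With a beam of k=8 the stepwise count grows to 2*100*12*8 = 19200, while
    # the precomputed table stays at 1200 -- a factor-k saving.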
    cdef readonly int nF, nO, nP
    cdef bint _is_synchronized
    cdef public object ops
    cdef public object numpy_ops
    cdef np.ndarray _features
    cdef np.ndarray _cached
    cdef np.ndarray bias
    cdef object _cuda_stream
    cdef object _bp_hiddens
    cdef object activation

    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
                 activation="maxout", train=False):
        gpu_cached, bp_features = lower_model(tokvecs, train)
        cdef np.ndarray cached
        if not isinstance(gpu_cached, numpy.ndarray):
            # Note the passing of cuda_stream here: it lets
            # cupy make the copy asynchronously.
            # We then have to block before first use.
            cached = gpu_cached.get(stream=cuda_stream)
        else:
            cached = gpu_cached
        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
        else:
            self.bias = lower_model.get_param("b")
        self.nF = cached.shape[1]
        if lower_model.has_dim("nP"):
            self.nP = lower_model.get_dim("nP")
        else:
            self.nP = 1
        self.nO = cached.shape[2]
        self.ops = lower_model.ops
        self.numpy_ops = NumpyOps()
        assert activation in (None, "relu", "maxout")
        self.activation = activation
        self._is_synchronized = False
        self._cuda_stream = cuda_stream
        self._cached = cached
        self._bp_hiddens = bp_features

    cdef const float* get_feat_weights(self) except NULL:
        if not self._is_synchronized and self._cuda_stream is not None:
            self._cuda_stream.synchronize()
            self._is_synchronized = True
        return <float*>self._cached.data

    def has_dim(self, name):
        if name == "nF":
            return self.nF if self.nF is not None else True
        elif name == "nP":
            return self.nP if self.nP is not None else True
        elif name == "nO":
            return self.nO if self.nO is not None else True
        else:
            return False

    def get_dim(self, name):
        if name == "nF":
            return self.nF
        elif name == "nP":
            return self.nP
        elif name == "nO":
            return self.nO
        else:
            raise ValueError(Errors.E1033.format(name=name))

    def set_dim(self, name, value):
        if name == "nF":
            self.nF = value
        elif name == "nP":
            self.nP = value
        elif name == "nO":
            self.nO = value
        else:
            raise ValueError(Errors.E1033.format(name=name))

    def __call__(self, X, bint is_train):
        if is_train:
            return self.begin_update(X)
        else:
            return self.predict(X), lambda X: X

    def predict(self, X):
        return self.begin_update(X)[0]

    def begin_update(self, token_ids):
        cdef np.ndarray state_vector = numpy.zeros(
            (token_ids.shape[0], self.nO, self.nP), dtype='f')
        # This is tricky, but (assuming GPU available);
        # - Input to forward on CPU
        # - Output from forward on CPU
        # - Input to backward on GPU!
        # - Output from backward on GPU
        bp_hiddens = self._bp_hiddens

        cdef CBlas cblas
        if isinstance(self.ops, CupyOps):
            cblas = NUMPY_OPS.cblas()
        else:
            cblas = self.ops.cblas()

        feat_weights = self.get_feat_weights()
        cdef int[:, ::1] ids = token_ids
        sum_state_features(cblas, <float*>state_vector.data,
            feat_weights, &ids[0,0],
            token_ids.shape[0], self.nF, self.nO*self.nP)
        state_vector += self.bias
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector_ids):
            d_state_vector, token_ids = d_state_vector_ids
            d_state_vector = bp_nonlinearity(d_state_vector)
            d_tokens = bp_hiddens((d_state_vector, token_ids))
            return d_tokens
        return state_vector, backward

    def _nonlinearity(self, state_vector):
        if self.activation == "maxout":
            return self._maxout_nonlinearity(state_vector)
        else:
            return self._relu_nonlinearity(state_vector)

    def _maxout_nonlinearity(self, state_vector):
        state_vector, mask = self.numpy_ops.maxout(state_vector)
        # We're outputting to CPU, but we need this variable on GPU for the
        # backward pass.
        mask = self.ops.asarray(mask)

        def backprop_maxout(d_best):
            return self.ops.backprop_maxout(d_best, mask, self.nP)

        return state_vector, backprop_maxout

    def _relu_nonlinearity(self, state_vector):
        state_vector = state_vector.reshape((state_vector.shape[0], -1))
        mask = state_vector >= 0.
        state_vector *= mask
        # We're outputting to CPU, but we need this variable on GPU for the
        # backward pass.
        mask = self.ops.asarray(mask)

        def backprop_relu(d_best):
            d_best *= mask
            return d_best.reshape((d_best.shape + (1,)))

        return state_vector, backprop_relu

spacy/ml/tb_framework.pxd (new file)
@@ -0,0 +1,28 @@
from libc.stdint cimport int8_t


cdef struct SizesC:
    int states
    int classes
    int hiddens
    int pieces
    int feats
    int embed_width
    int tokens


cdef struct WeightsC:
    const float* feat_weights
    const float* feat_bias
    const float* hidden_bias
    const float* hidden_weights
    const int8_t* seen_mask


cdef struct ActivationsC:
    int* token_ids
    float* unmaxed
    float* hiddens
    int* is_valid
    int _curr_size
    int _max_size

spacy/ml/tb_framework.py (deleted)
@@ -1,50 +0,0 @@
from thinc.api import Model, noop
from .parser_model import ParserStepModel
from ..util import registry


@registry.layers("spacy.TransitionModel.v1")
def TransitionModel(
    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
):
    """Set up a stepwise transition-based model"""
    if upper is None:
        has_upper = False
        upper = noop()
    else:
        has_upper = True
    # don't define nO for this object, because we can't dynamically change it
    return Model(
        name="parser_model",
        forward=forward,
        dims={"nI": tok2vec.maybe_get_dim("nI")},
        layers=[tok2vec, lower, upper],
        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
        init=init,
        attrs={
            "has_upper": has_upper,
            "unseen_classes": set(unseen_classes),
            "resize_output": resize_output,
        },
    )


def forward(model, X, is_train):
    step_model = ParserStepModel(
        X,
        model.layers,
        unseen_classes=model.attrs["unseen_classes"],
        train=is_train,
        has_upper=model.attrs["has_upper"],
    )

    return step_model, step_model.finish_steps


def init(model, X=None, Y=None):
    model.get_ref("tok2vec").initialize(X=X)
    lower = model.get_ref("lower")
    lower.initialize()
    if model.attrs["has_upper"]:
        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
        model.get_ref("upper").initialize(X=statevecs)

spacy/ml/tb_framework.pyx (new file)
@@ -0,0 +1,621 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
from typing import List, Tuple, Any, Optional, TypeVar, cast
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from libcpp.vector cimport vector
import numpy
cimport numpy as np
from thinc.api import Model, normal_init, chain, list2array, Linear
from thinc.api import uniform_init, glorot_uniform_init, zero_init
from thinc.api import NumpyOps
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
from thinc.types import Ints1d, Ints2d

from ..errors import Errors
from ..pipeline._parser_internals import _beam_utils
from ..pipeline._parser_internals.batch import GreedyBatch
from ..pipeline._parser_internals._parser_utils cimport arg_max
from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
from ..pipeline._parser_internals.transition_system cimport TransitionSystem
from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
from ..tokens.doc import Doc
from ..util import registry


State = Any  # TODO


@registry.layers("spacy.TransitionModel.v2")
def TransitionModel(
    *,
    tok2vec: Model[List[Doc], List[Floats2d]],
    beam_width: int = 1,
    beam_density: float = 0.0,
    state_tokens: int,
    hidden_width: int,
    maxout_pieces: int,
    nO: Optional[int] = None,
    unseen_classes=set(),
) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
    """Set up a transition-based parsing model, using a maxout hidden
    layer and a linear output layer.
    """
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
    tok2vec_projected.set_dim("nO", hidden_width)

    # FIXME: we use `output` as a container for the output layer's
    # weights and biases. Thinc optimizers cannot handle resizing
    # of parameters. So, when the parser model is resized, we
    # construct a new `output` layer, which has a different key in
    # the optimizer. Once the optimizer supports parameter resizing,
    # we can replace the `output` layer by `output_W` and `output_b`
    # parameters in this model.
    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)

    return Model(
        name="parser_model",
        forward=forward,
        init=init,
        layers=[tok2vec_projected, output],
        refs={
            "tok2vec": tok2vec_projected,
            "output": output,
        },
        params={
            "hidden_W": None,  # Floats2d W for the hidden layer
            "hidden_b": None,  # Floats1d bias for the hidden layer
            "hidden_pad": None,  # Floats1d padding for the hidden layer
        },
        dims={
            "nO": None,  # Output size
            "nP": maxout_pieces,
            "nH": hidden_width,
            "nI": tok2vec_projected.maybe_get_dim("nO"),
            "nF": state_tokens,
        },
        attrs={
            "beam_width": beam_width,
            "beam_density": beam_density,
            "unseen_classes": set(unseen_classes),
            "resize_output": resize_output,
        },
    )
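
# Minimal usage sketch (not part of the diff; `my_tok2vec` is an assumed
# placeholder for any Model[List[Doc], List[Floats2d]] built elsewhere):
#
#     from spacy.util import registry
#
#     build = registry.layers.get("spacy.TransitionModel.v2")
#     model = build(tok2vec=my_tok2vec, state_tokens=6,
#                   hidden_width=64, maxout_pieces=2)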

def resize_output(model: Model, new_nO: int) -> Model:
    old_nO = model.maybe_get_dim("nO")
    output = model.get_ref("output")
    if old_nO is None:
        model.set_dim("nO", new_nO)
        output.set_dim("nO", new_nO)
        output.initialize()
        return model
    elif new_nO <= old_nO:
        return model
    elif output.has_param("W"):
        nH = model.get_dim("nH")
        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
        new_output.initialize()
        new_W = new_output.get_param("W")
        new_b = new_output.get_param("b")
        old_W = output.get_param("W")
        old_b = output.get_param("b")
        new_W[:old_nO] = old_W  # type: ignore
        new_b[:old_nO] = old_b  # type: ignore
        for i in range(old_nO, new_nO):
            model.attrs["unseen_classes"].add(i)
        model.layers[-1] = new_output
        model.set_ref("output", new_output)
        # TODO: Avoid this private intrusion
        model._dims["nO"] = new_nO
    return model
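
# Sketch (not part of the diff): growing a live model by two classes; copied
# rows keep their trained weights and the added class indices start unseen.
#
#     model.attrs["resize_output"](model, model.get_dim("nO") + 2)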

def init(
    model,
    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
):
    if X is not None:
        docs, moves = X
        model.get_ref("tok2vec").initialize(X=docs)
    else:
        model.get_ref("tok2vec").initialize()
    inferred_nO = _infer_nO(Y)
    if inferred_nO is not None:
        current_nO = model.maybe_get_dim("nO")
        if current_nO is None or current_nO != inferred_nO:
            model.attrs["resize_output"](model, inferred_nO)
    nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nH = model.get_dim("nH")
    nI = model.get_dim("nI")
    nF = model.get_dim("nF")
    ops = model.ops

    Wl = ops.alloc2f(nH * nP, nF * nI)
    bl = ops.alloc1f(nH * nP)
    padl = ops.alloc1f(nI)
    # Wl = zero_init(ops, Wl.shape)
    Wl = glorot_uniform_init(ops, Wl.shape)
    padl = uniform_init(ops, padl.shape)  # type: ignore
    # TODO: Experiment with whether better to initialize output_W
    model.set_param("hidden_W", Wl)
    model.set_param("hidden_b", bl)
    model.set_param("hidden_pad", padl)
    # model = _lsuv_init(model)
    return model


class TransitionModelInputs:
    """
    Input to transition model.
    """

    # dataclass annotation is not yet supported in Cython 0.29.x,
    # so, we'll do something close to it.

    actions: Optional[List[Ints1d]]
    docs: List[Doc]
    max_moves: int
    moves: TransitionSystem
    states: Optional[List[State]]

    __slots__ = [
        "actions",
        "docs",
        "max_moves",
        "moves",
        "states",
    ]

    def __init__(
        self,
        docs: List[Doc],
        moves: TransitionSystem,
        actions: Optional[List[Ints1d]]=None,
        max_moves: int=0,
        states: Optional[List[State]]=None):
        """
        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
        docs (List[Doc]): Docs to predict transition sequences for.
        max_moves: (int): the maximum number of moves to apply, values less
            than 1 will apply moves to states until they are final states.
        moves (TransitionSystem): the transition system to use when predicting
            the transition sequences.
        states (Optional[List[States]]): the initial states to predict the
            transition sequences for. When absent, the initial states are
            initialized from the provided Docs.
        """
        self.actions = actions
        self.docs = docs
        self.moves = moves
        self.max_moves = max_moves
        self.states = states
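
# Sketch (not part of the diff; assumes an `nlp` pipeline with a trained
# parser and an initialized `model` as above):
#
#     docs = [nlp.make_doc("She ate the pizza")]
#     moves = nlp.get_pipe("parser").moves
#     inputs = TransitionModelInputs(docs=docs, moves=moves)
#     (states, scores), backprop = model(inputs, is_train=False)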

def forward(model, inputs: TransitionModelInputs, is_train: bool):
    docs = inputs.docs
    moves = inputs.moves
    actions = inputs.actions

    beam_width = model.attrs["beam_width"]
    hidden_pad = model.get_param("hidden_pad")
    tok2vec = model.get_ref("tok2vec")

    states = moves.init_batch(docs) if inputs.states is None else inputs.states
    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
    seen_mask = _get_seen_mask(model)

    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
        # Note: max_moves is only used during training, so we don't need to
        # pass it to the greedy inference path.
        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
    else:
        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
            feats, backprop_feats, seen_mask, is_train, actions=actions,
            max_moves=inputs.max_moves)


def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
        np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
    cdef vector[StateC*] c_states
    cdef StateClass state
    for state in states:
        if not state.is_final():
            c_states.push_back(state.c)
    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
    # Precomputed features have rows for each token, plus one for padding.
    cdef int n_tokens = feats.shape[0] - 1
    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
    cdef CBlas cblas = model.ops.cblas()
    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)

    def backprop(dY):
        raise ValueError(Errors.E4004)

    return (states, scores), backprop

cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
    cdef int i, j
    cdef vector[StateC *] unfinished
    cdef ActivationsC activations = _alloc_activations(sizes)
    cdef np.ndarray step_scores
    cdef np.ndarray step_actions

    scores = []
    while sizes.states >= 1:
        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
        step_actions = actions[0] if actions is not None else None
        with nogil:
            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
            if actions is None:
                # Validate actions, argmax, take action.
                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
                    sizes.states)
            else:
                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
            for i in range(sizes.states):
                if not states[i].is_final():
                    unfinished.push_back(states[i])
            for i in range(unfinished.size()):
                states[i] = unfinished[i]
            sizes.states = unfinished.size()
        scores.append(step_scores)
        unfinished.clear()
        actions = actions[1:] if actions is not None else None
    _free_activations(&activations)

    return scores


def _forward_fallback(
    model: Model,
    moves: TransitionSystem,
    states: List[StateClass],
    tokvecs, backprop_tok2vec,
    feats,
    backprop_feats,
    seen_mask,
    is_train: bool,
    actions: Optional[List[Ints1d]]=None,
    max_moves: int=0):
    nF = model.get_dim("nF")
    output = model.get_ref("output")
    hidden_b = model.get_param("hidden_b")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")

    beam_width = model.attrs["beam_width"]
    beam_density = model.attrs["beam_density"]

    ops = model.ops

    all_ids = []
    all_which = []
    all_statevecs = []
    all_scores = []
    if beam_width == 1:
        batch = GreedyBatch(moves, states, None)
    else:
        batch = _beam_utils.BeamBatch(
            moves, states, None, width=beam_width, density=beam_density
        )
    arange = ops.xp.arange(nF)
    n_moves = 0
    while not batch.is_done:
        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
        for i, state in enumerate(batch.get_unfinished_states()):
            state.set_context_tokens(ids, i, nF)
        # Sum the state features, add the bias and apply the activation (maxout)
        # to create the state vectors.
        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
        preacts2f += hidden_b
        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
        statevecs, which = ops.maxout(preacts)
        # We don't use output's backprop, since we want to backprop for
        # all states at once, rather than a single state.
        scores = output.predict(statevecs)
        scores[:, seen_mask] = ops.xp.nanmin(scores)
        # Transition the states, filtering out any that are finished.
        cpu_scores = ops.to_numpy(scores)
        if actions is None:
            batch.advance(cpu_scores)
        else:
            batch.advance_with_actions(actions[0])
            actions = actions[1:]
        all_scores.append(scores)
        if is_train:
            # Remember intermediate results for the backprop.
            all_ids.append(ids)
            all_statevecs.append(statevecs)
            all_which.append(which)
        if n_moves >= max_moves >= 1:
            break
        n_moves += 1

    def backprop_parser(d_states_d_scores):
        ids = ops.xp.vstack(all_ids)
        which = ops.xp.vstack(all_which)
        statevecs = ops.xp.vstack(all_statevecs)
        _, d_scores = d_states_d_scores
        if model.attrs.get("unseen_classes"):
            # If we have a negative gradient (i.e. the probability should
            # increase) on any classes we filtered out as unseen, mark
            # them as seen.
            for clas in set(model.attrs["unseen_classes"]):
                if (d_scores[:, clas] < 0).any():
                    model.attrs["unseen_classes"].remove(clas)
        d_scores *= seen_mask == False
        # Calculate the gradients for the parameters of the output layer.
        # The weight gemm is (nS, nO) @ (nS, nH).T
        output.inc_grad("b", d_scores.sum(axis=0))
        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
        # Now calculate d_statevecs, by backproping through the output linear layer.
        # This gemm is (nS, nO) @ (nO, nH)
        output_W = output.get_param("W")
        d_statevecs = ops.gemm(d_scores, output_W)
        # Backprop through the maxout activation
        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
        # We don't need to backprop the summation, because we pass back the IDs instead
        d_state_features = backprop_feats((d_preacts2f, ids))
        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
        ops.scatter_add(d_tokvecs, ids, d_state_features)
        model.inc_grad("hidden_pad", d_tokvecs[-1])
        return (backprop_tok2vec(d_tokvecs[:-1]), None)

    return (list(batch), all_scores), backprop_parser


def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
    for class_ in model.attrs.get("unseen_classes", set()):
        mask[class_] = True
    return mask


def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
    W: Floats2d = model.get_param("hidden_W")
    nF = model.get_dim("nF")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
    W3f = W3f.transpose((1, 0, 2))
    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
    assert X.shape == (X.shape[0], nI), X.shape
    Yf_ = model.ops.gemm(X, W2f, trans2=True)
    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)

    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
        # This backprop is particularly tricky, because we get back a different
        # thing from what we put out. We put out an array of shape:
        # (nB, nF, nH, nP), and get back:
        # (nB, nH, nP) and ids (nB, nF)
        # The ids tell us the values of nF, so we would have:
        #
        # dYf = zeros((nB, nF, nH, nP))
        # for b in range(nB):
        #     for f in range(nF):
        #         dYf[b, ids[b, f]] += dY[b]
        #
        # However, we avoid building that array for efficiency -- and just pass
        # in the indices.
        dY, ids = dY_ids
        dXf = model.ops.gemm(dY, W)
        Xf = X[ids].reshape((ids.shape[0], -1))
        dW = model.ops.gemm(dY, Xf, trans1=True)
        model.inc_grad("hidden_W", dW)
        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)

    return Yf, backward
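
# Shape walk-through for the layer above (illustrative, not in the diff):
# with nF=6, nH=64, nP=2, nI=96 and 10 padded token rows, `hidden_W` is
# (128, 576), Yf comes out as (10, 6, 128), and backward maps an (nS, 128)
# gradient plus (nS, 6) ids back to an (nS, 6, 96) token-vector gradient.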

def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
    if Y is None:
        return None
    _, scores = Y
    if len(scores) == 0:
        return None
    assert scores[0].shape[0] >= 1
    assert len(scores[0].shape) == 2
    return scores[0].shape[1]


def _lsuv_init(model: Model):
    """This is like the 'layer sequential unit variance', but instead
    of taking the actual inputs, we randomly generate whitened data.

    Why's this all so complicated? We have a huge number of inputs,
    and the maxout unit makes guessing the dynamics tricky. Instead
    we set the maxout weights to values that empirically result in
    whitened outputs given whitened inputs.
    """
    W = model.maybe_get_param("hidden_W")
    if W is not None and W.any():
        return

    nF = model.get_dim("nF")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.ops.alloc4f(nF, nH, nP, nI)
    b = model.ops.alloc2f(nH, nP)
    pad = model.ops.alloc4f(1, nF, nH, nP)

    ops = model.ops
    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
    pad = normal_init(ops, pad.shape, mean=1.0)
    model.set_param("W", W)
    model.set_param("b", b)
    model.set_param("pad", pad)

    ids = ops.alloc_f((5000, nF), dtype="f")
    ids += ops.xp.random.uniform(0, 1000, ids.shape)
    ids = ops.asarray(ids, dtype="i")
    tokvecs = ops.alloc_f((5000, nI), dtype="f")
    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
        tokvecs.shape
    )

    def predict(ids, tokvecs):
        # nS ids. nW tokvecs. Exclude the padding array.
        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
        # need nS vectors
        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
        vectors3f += b
        return model.ops.maxout(vectors3f)[0]

    tol_var = 0.01
    tol_mean = 0.01
    t_max = 10
    W = cast(Floats4d, model.get_param("hidden_W").copy())
    b = cast(Floats2d, model.get_param("hidden_b").copy())
    for t_i in range(t_max):
        acts1 = predict(ids, tokvecs)
        var = model.ops.xp.var(acts1)
        mean = model.ops.xp.mean(acts1)
        if abs(var - 1.0) >= tol_var:
            W /= model.ops.xp.sqrt(var)
            model.set_param("hidden_W", W)
        elif abs(mean) >= tol_mean:
            b -= mean
            model.set_param("hidden_b", b)
        else:
            break
    return model


cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
    output = model.get_ref("output")
    cdef np.ndarray hidden_b = model.get_param("hidden_b")
    cdef np.ndarray output_W = output.get_param("W")
    cdef np.ndarray output_b = output.get_param("b")

    cdef WeightsC weights
    weights.feat_weights = feats
    weights.feat_bias = <const float*>hidden_b.data
    weights.hidden_weights = <const float *> output_W.data
    weights.hidden_bias = <const float *> output_b.data
    weights.seen_mask = <const int8_t*> seen_mask.data

    return weights


cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
    cdef SizesC sizes
    sizes.states = batch_size
    sizes.classes = model.get_dim("nO")
    sizes.hiddens = model.get_dim("nH")
    sizes.pieces = model.get_dim("nP")
    sizes.feats = model.get_dim("nF")
    sizes.embed_width = model.get_dim("nI")
    sizes.tokens = tokens
    return sizes


cdef ActivationsC _alloc_activations(SizesC n) nogil:
    cdef ActivationsC A
    memset(&A, 0, sizeof(A))
    _resize_activations(&A, n)
    return A


cdef void _free_activations(const ActivationsC* A) nogil:
    free(A.token_ids)
    free(A.unmaxed)
    free(A.hiddens)
    free(A.is_valid)


cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
        return
    if A._max_size == 0:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
        A._max_size = n.states
    else:
        A.token_ids = <int*>realloc(A.token_ids,
            n.states * n.feats * sizeof(A.token_ids[0]))
        A.unmaxed = <float*>realloc(A.unmaxed,
            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
        A.hiddens = <float*>realloc(A.hiddens,
            n.states * n.hiddens * sizeof(A.hiddens[0]))
        A.is_valid = <int*>realloc(A.is_valid,
            n.states * n.classes * sizeof(A.is_valid[0]))
        A._max_size = n.states
    A._curr_size = n.states


cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
    _resize_activations(A, n)
    for i in range(n.states):
        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
    for i in range(n.states):
        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
        for j in range(n.hiddens):
            index = i * n.hiddens * n.pieces + j * n.pieces
            which = arg_max(&A.unmaxed[index], n.pieces)
            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
    if W.hidden_weights == NULL:
        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
    else:
        # Compute hidden-to-output
        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
            1.0, <const float *>A.hiddens, n.hiddens,
            <const float *>W.hidden_weights, n.hiddens,
            0.0, scores, n.classes)
        # Add bias
        for i in range(n.states):
            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
    # Set unseen classes to minimum value
    i = 0
    min_ = scores[0]
    for i in range(1, n.states * n.classes):
        if scores[i] < min_:
            min_ = scores[i]
    for i in range(n.states):
        for j in range(n.classes):
            if W.seen_mask[j]:
                scores[i*n.classes+j] = min_


cdef void _sum_state_features(CBlas cblas, float* output,
        const float* cached, const int* token_ids, SizesC n) nogil:
    cdef int idx, b, f, i
    cdef const float* feature
    cdef int B = n.states
    cdef int O = n.hiddens * n.pieces
    cdef int F = n.feats
    cdef int T = n.tokens
    padding = cached + (T * F * O)
    cdef int id_stride = F*O
    cdef float one = 1.
    for b in range(B):
        for f in range(F):
            if token_ids[f] < 0:
                feature = &padding[f*O]
            else:
                idx = token_ids[f] * id_stride + f*O
                feature = &cached[idx]
            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
        token_ids += F

spacy/morphology.pxd
@@ -1,23 +1,41 @@
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport numpy as np
from libc.stdint cimport uint64_t
from libc.stdint cimport uint32_t, uint64_t
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.memory cimport shared_ptr

from .structs cimport MorphAnalysisC
from .strings cimport StringStore
from .typedefs cimport attr_t, hash_t


cdef cppclass Feature:
    hash_t field
    hash_t value

    __init__():
        this.field = 0
        this.value = 0


cdef cppclass MorphAnalysisC:
    hash_t key
    vector[Feature] features

    __init__():
        this.key = 0

cdef class Morphology:
    cdef readonly Pool mem
    cdef readonly StringStore strings
    cdef PreshMap tags # Keyed by hash, value is pointer to tag
    cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags

    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
    cdef int insert(self, MorphAnalysisC tag) except -1
    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
    cdef void _intern_morph_tag(self, hash_t tag_key, feats)
    cdef hash_t _add(self, features)
    cdef str _normalize_features(self, features)
    cdef str get_morph_str(self, hash_t morph_key)
    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)


cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
cdef list list_features(const MorphAnalysisC* morph)
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil

spacy/morphology.pyx
@@ -1,10 +1,10 @@
# cython: infer_types
import numpy
import warnings
from typing import Union, Tuple, List, Dict, Optional
from cython.operator cimport dereference as deref
from libcpp.memory cimport shared_ptr

from .attrs cimport POS

from .parts_of_speech import IDS as POS_IDS
from .errors import Warnings
from . import symbols

@@ -24,134 +24,187 @@ cdef class Morphology:
    EMPTY_MORPH = symbols.NAMES[symbols._]

    def __init__(self, StringStore strings):
        self.mem = Pool()
        self.strings = strings
        self.tags = PreshMap()

    def __reduce__(self):
        tags = set([self.get(self.strings[s]) for s in self.strings])
        tags -= set([""])
        return (unpickle_morphology, (self.strings, sorted(tags)), None, None)

    def add(self, features):
    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
        match = self.tags.find(tag_hash)
        if match != self.tags.const_end():
            return deref(match).second
        else:
            return shared_ptr[MorphAnalysisC]()

    def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
        if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
            attr_key = self.strings.as_string(attr_key)
            attr_value = self.strings.as_string(attr_value)

            # Preserve multiple values as a list
            if self.VALUE_SEP in attr_value:
                values = attr_value.split(self.VALUE_SEP)
                values.sort()
                attr_value = values
        else:
            warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
            return None

        return attr_key, attr_value

    def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
        if not feats or feats == self.EMPTY_MORPH:
            return {}

        out = []
        for feat in feats.split(self.FEATURE_SEP):
            field, values = feat.split(self.FIELD_SEP, 1)
            normalized_attr = self._normalize_attr(field, values)
            if normalized_attr is None:
                continue
            out.append((normalized_attr[0], normalized_attr[1]))
        out.sort(key=lambda x: x[0])
        return dict(out)

    def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
        out = []
        for field, values in feats.items():
            normalized_attr = self._normalize_attr(field, values)
            if normalized_attr is None:
                continue
            out.append((normalized_attr[0], normalized_attr[1]))
        out.sort(key=lambda x: x[0])
        return dict(out)


    def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
        norm_feats_string = self.FEATURE_SEP.join([
            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
            for field, values in feats.items()
        ])
        return norm_feats_string or self.EMPTY_MORPH


    cdef hash_t _add(self, features):
        """Insert a morphological analysis in the morphology table, if not
        already present. The morphological analysis may be provided in the UD
        FEATS format as a string or in the tag map dict format.
        Returns the hash of the new analysis.
        """
        cdef MorphAnalysisC* tag_ptr
        cdef hash_t tag_hash = 0
        cdef shared_ptr[MorphAnalysisC] tag
        if isinstance(features, str):
            if features == "":
                features = self.EMPTY_MORPH
            tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
            if tag_ptr != NULL:
                return tag_ptr.key
            features = self.feats_to_dict(features)
        if not isinstance(features, dict):

            tag_hash = self.strings[features]
            tag = self._lookup_tag(tag_hash)
            if tag:
                return deref(tag).key

            features = self._str_to_normalized_feat_dict(features)
        elif isinstance(features, dict):
            features = self._dict_to_normalized_feat_dict(features)
        else:
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
        # intified ("Field", "Field=Value") pairs
        field_feature_pairs = []
        for field in sorted(string_features):
            values = string_features[field]
            for value in values.split(self.VALUE_SEP):
                field_feature_pairs.append((
                    self.strings.add(field),
                    self.strings.add(field + self.FIELD_SEP + value),
                ))
        cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)

        # the hash key for the tag is either the hash of the normalized UFEATS
        # string or the hash of an empty placeholder
        norm_feats_string = self.normalize_features(features)
        tag.key = self.strings.add(norm_feats_string)
        self.insert(tag)
        return tag.key
        norm_feats_string = self._normalized_feat_dict_to_str(features)
        tag_hash = self.strings.add(norm_feats_string)
        tag = self._lookup_tag(tag_hash)
        if tag:
            return deref(tag).key

    def normalize_features(self, features):
        self._intern_morph_tag(tag_hash, features)
        return tag_hash

    cdef void _intern_morph_tag(self, hash_t tag_key, feats):
        # intified ("Field", "Field=Value") pairs where fields with multiple values have
        # been split into individual tuples, e.g.:
        # [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
        #  ("Field2", "Field2=Value3")]
        field_feature_pairs = []

        # Feat dict is normalized at this point.
        for field, values in feats.items():
            field_key = self.strings.add(field)
            if isinstance(values, list):
                for value in values:
                    value_key = self.strings.add(field + self.FIELD_SEP + value)
                    field_feature_pairs.append((field_key, value_key))
            else:
                # We could box scalar values into a list and use a common
                # code path to generate features but that incurs a small
                # but measurable allocation/iteration overhead (as this
                # branch is taken often enough).
                value_key = self.strings.add(field + self.FIELD_SEP + values)
                field_feature_pairs.append((field_key, value_key))

        num_features = len(field_feature_pairs)
        cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
        deref(tag).key = tag_key
        deref(tag).features.resize(num_features)

        for i in range(num_features):
            deref(tag).features[i].field = field_feature_pairs[i][0]
            deref(tag).features[i].value = field_feature_pairs[i][1]

        self.tags[tag_key] = tag

    cdef str get_morph_str(self, hash_t morph_key):
        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
        if not tag:
            return ""
        else:
            return self.strings[deref(tag).key]

    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
        return self._lookup_tag(morph_key)

    cdef str _normalize_features(self, features):
        """Create a normalized FEATS string from a features string or dict.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS (str): Features as normalized UFEATS string.
        """
        if isinstance(features, str):
            features = self.feats_to_dict(features)
        if not isinstance(features, dict):
            features = self._str_to_normalized_feat_dict(features)
        elif isinstance(features, dict):
            features = self._dict_to_normalized_feat_dict(features)
        else:
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
        features = self.normalize_attrs(features)
        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
        # normalized UFEATS string with sorted fields and values
        norm_feats_string = self.FEATURE_SEP.join(sorted([
            self.FIELD_SEP.join([field, values])
            for field, values in string_features.items()
        ]))
        return norm_feats_string or self.EMPTY_MORPH

    def normalize_attrs(self, attrs):
        """Convert attrs dict so that POS is always by ID, other features are
        by string. Values separated by VALUE_SEP are sorted.
        """
        out = {}
        attrs = dict(attrs)
        for key, value in attrs.items():
            # convert POS value to ID
            if key == POS or (isinstance(key, str) and key.upper() == "POS"):
                if isinstance(value, str) and value.upper() in POS_IDS:
                    value = POS_IDS[value.upper()]
                elif isinstance(value, int) and value not in POS_IDS.values():
                    warnings.warn(Warnings.W100.format(feature={key: value}))
                    continue
                out[POS] = value
            # accept any string or ID fields and values and convert to strings
            elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
                key = self.strings.as_string(key)
                value = self.strings.as_string(value)
                # sort values
                if self.VALUE_SEP in value:
                    value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
                out[key] = value
            else:
                warnings.warn(Warnings.W100.format(feature={key: value}))
        return out
        return self._normalized_feat_dict_to_str(features)

    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
        """Creates a MorphAnalysisC from a list of intified
        ("Field", "Field=Value") tuples where fields with multiple values have
        been split into individual tuples, e.g.:
        [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
         ("Field2", "Field2=Value3")]
        """
        cdef MorphAnalysisC tag
        tag.length = len(field_feature_pairs)
        if tag.length > 0:
            tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
            tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
            for i, (field, feature) in enumerate(field_feature_pairs):
                tag.fields[i] = field
                tag.features[i] = feature
        return tag
    def add(self, features):
        return self._add(features)

    cdef int insert(self, MorphAnalysisC tag) except -1:
        cdef hash_t key = tag.key
        if self.tags.get(key) == NULL:
            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            tag_ptr[0] = tag
            self.tags.set(key, <void*>tag_ptr)
    def get(self, morph_key):
        return self.get_morph_str(morph_key)

    def get(self, hash_t morph):
        tag = <MorphAnalysisC*>self.tags.get(morph)
        if tag == NULL:
            return ""
        else:
            return self.strings[tag.key]
    def normalize_features(self, features):
        return self._normalize_features(features)

    @staticmethod
    def feats_to_dict(feats):
    def feats_to_dict(feats, *, sort_values=True):
        if not feats or feats == Morphology.EMPTY_MORPH:
            return {}
        return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
                [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}

        out = {}
        for feat in feats.split(Morphology.FEATURE_SEP):
            field, values = feat.split(Morphology.FIELD_SEP, 1)
            if sort_values:
                values = values.split(Morphology.VALUE_SEP)
                values.sort()
                values = Morphology.VALUE_SEP.join(values)

            out[field] = values
        return out
|
||||
@staticmethod
|
||||
def dict_to_feats(feats_dict):
|
||||
|
@ -160,34 +213,34 @@ cdef class Morphology:
|
|||
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
|
||||
|
||||
|
||||
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
|
||||
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
|
||||
cdef int i
|
||||
for i in range(morph.length):
|
||||
if morph.features[i] == feature:
|
||||
for i in range(deref(morph).features.size()):
|
||||
if deref(morph).features[i].value == feature:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
cdef list list_features(const MorphAnalysisC* morph):
|
||||
cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
|
||||
cdef int i
|
||||
features = []
|
||||
for i in range(morph.length):
|
||||
features.append(morph.features[i])
|
||||
for i in range(deref(morph).features.size()):
|
||||
features.append(deref(morph).features[i].value)
|
||||
return features
|
||||
|
||||
|
||||
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
|
||||
cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
|
||||
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
|
||||
cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
|
||||
n = get_n_by_field(<uint64_t*>results.data, morph, field)
|
||||
return results[:n]
|
||||
|
||||
|
||||
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
|
||||
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
|
||||
cdef int n_results = 0
|
||||
cdef int i
|
||||
for i in range(morph.length):
|
||||
if morph.fields[i] == field:
|
||||
results[n_results] = morph.features[i]
|
||||
for i in range(deref(morph).features.size()):
|
||||
if deref(morph).features[i].field == field:
|
||||
results[n_results] = deref(morph).features[i].value
|
||||
n_results += 1
|
||||
return n_results
|
||||
|
||||
|
|
|
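The hunk above splits feature normalization into separate helpers for string and dict input, while keeping the UFEATS string convention (fields joined by "|", field and values by "=", multiple values by ","). As a rough pure-Python sketch of that convention, mirroring what feats_to_dict and dict_to_feats in the diff do (the constant names below are illustrative, not taken from the diff):

# Illustrative sketch of UFEATS normalization (assumed separators: "|", "=", ",").
FEATURE_SEP = "|"
FIELD_SEP = "="
VALUE_SEP = ","

def feats_to_dict(feats, sort_values=True):
    # "Number=Sing|Case=Nom" -> {"Number": "Sing", "Case": "Nom"}
    if not feats:
        return {}
    out = {}
    for feat in feats.split(FEATURE_SEP):
        field, values = feat.split(FIELD_SEP, 1)
        if sort_values:
            values = VALUE_SEP.join(sorted(values.split(VALUE_SEP)))
        out[field] = values
    return out

def dict_to_feats(feats_dict):
    # Sort fields and values so equivalent analyses produce identical strings.
    return FEATURE_SEP.join(
        sorted(FIELD_SEP.join([field, VALUE_SEP.join(sorted(values.split(VALUE_SEP)))])
               for field, values in feats_dict.items())
    )

assert dict_to_feats(feats_to_dict("Number=Sing|Case=Nom")) == "Case=Nom|Number=Sing"
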
@@ -3,22 +3,22 @@ from . cimport symbols
cpdef enum univ_pos_t:
NO_TAG = 0
ADJ = symbols.ADJ
ADP
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
ADP = symbols.ADP
ADV = symbols.ADV
AUX = symbols.AUX
CONJ = symbols.CONJ
CCONJ = symbols.CCONJ # U20
DET = symbols.DET
INTJ = symbols.INTJ
NOUN = symbols.NOUN
NUM = symbols.NUM
PART = symbols.PART
PRON = symbols.PRON
PROPN = symbols.PROPN
PUNCT = symbols.PUNCT
SCONJ = symbols.SCONJ
SYM = symbols.SYM
VERB = symbols.VERB
X = symbols.X
EOL = symbols.EOL
SPACE = symbols.SPACE

@@ -1,9 +1,8 @@
from .attributeruler import AttributeRuler
from .attribute_ruler import AttributeRuler
from .dep_parser import DependencyParser
from .edit_tree_lemmatizer import EditTreeLemmatizer
from .entity_linker import EntityLinker
from .ner import EntityRecognizer
from .entityruler import EntityRuler
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
from .pipe import Pipe

@@ -23,7 +22,6 @@ __all__ = [
"DependencyParser",
"EntityLinker",
"EntityRecognizer",
"EntityRuler",
"Morphologizer",
"Lemmatizer",
"MultiLabel_TextCategorizer",

@@ -1,6 +1,6 @@
from ...typedefs cimport class_t, hash_t

# These are passed as callbacks to thinc.search.Beam
# These are passed as callbacks to .search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1

cdef int check_final_state(void* _state, void* extra_args) except -1

@@ -3,17 +3,17 @@
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.extra.search cimport MaxViolation

from ...typedefs cimport hash_t, class_t
from .transition_system cimport TransitionSystem, Transition
from ...errors import Errors
from .batch cimport Batch
from .search cimport Beam, MaxViolation
from .search import MaxViolation
from .stateclass cimport StateC, StateClass


# These are passed as callbacks to thinc.search.Beam
# These are passed as callbacks to .search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateC*>_dest
src = <StateC*>_src

@@ -27,7 +27,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
return state.is_final()


cdef class BeamBatch(object):
cdef class BeamBatch(Batch):
cdef public TransitionSystem moves
cdef public object states
cdef public object docs

spacy/pipeline/_parser_internals/_parser_utils.pxd (new file, 2 lines)
@@ -0,0 +1,2 @@
cdef int arg_max(const float* scores, const int n_classes) nogil
cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
spacy/pipeline/_parser_internals/_parser_utils.pyx (new file, 22 lines)
@@ -0,0 +1,22 @@
# cython: infer_types=True

cdef inline int arg_max(const float* scores, const int n_classes) nogil:
if n_classes == 2:
return 0 if scores[0] > scores[1] else 1
cdef int i
cdef int best = 0
cdef float mode = scores[0]
for i in range(1, n_classes):
if scores[i] > mode:
mode = scores[i]
best = i
return best


cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
cdef int best = -1
for i in range(n):
if is_valid[i] >= 1:
if best == -1 or scores[i] > scores[best]:
best = i
return best
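The new _parser_utils helpers pick the best-scoring class, optionally restricted to the transitions flagged valid. A minimal NumPy sketch of the same logic (the function name and shapes here are illustrative, not part of the diff):

import numpy as np

def arg_max_if_valid(scores, is_valid):
    # Return the index of the highest score among entries flagged valid, or -1.
    scores = np.asarray(scores, dtype="float32")
    is_valid = np.asarray(is_valid, dtype=bool)
    if not is_valid.any():
        return -1
    masked = np.where(is_valid, scores, -np.inf)
    return int(masked.argmax())

print(arg_max_if_valid([0.2, 0.9, 0.4], [1, 0, 1]))  # -> 2, since index 1 is invalid
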
@@ -6,7 +6,6 @@ cimport libcpp
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.set cimport set
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from murmurhash.mrmr cimport hash64

from ...vocab cimport EMPTY_LEXEME

@@ -26,7 +25,7 @@ cdef struct ArcC:


cdef cppclass StateC:
int* _heads
vector[int] _heads
const TokenC* _sent
vector[int] _stack
vector[int] _rebuffer

@@ -34,31 +33,34 @@ cdef cppclass StateC:
unordered_map[int, vector[ArcC]] _left_arcs
unordered_map[int, vector[ArcC]] _right_arcs
vector[libcpp.bool] _unshiftable
vector[int] history
set[int] _sent_starts
TokenC _empty_token
int length
int offset
int _b_i

__init__(const TokenC* sent, int length) nogil:
__init__(const TokenC* sent, int length) nogil except +:
this._heads.resize(length, -1)
this._unshiftable.resize(length, False)

# Reserve memory ahead of time to minimize allocations during parsing.
# The initial capacity set here ideally reflects the expected average-case/majority usage.
cdef int init_capacity = 32
this._stack.reserve(init_capacity)
this._rebuffer.reserve(init_capacity)
this._ents.reserve(init_capacity)
this._left_arcs.reserve(init_capacity)
this._right_arcs.reserve(init_capacity)
this.history.reserve(init_capacity)

this._sent = sent
this._heads = <int*>calloc(length, sizeof(int))
if not (this._sent and this._heads):
with gil:
PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals()
this.offset = 0
this.length = length
this._b_i = 0
for i in range(length):
this._heads[i] = -1
this._unshiftable.push_back(0)
memset(&this._empty_token, 0, sizeof(TokenC))
this._empty_token.lex = &EMPTY_LEXEME

__dealloc__():
free(this._heads)

void set_context_tokens(int* ids, int n) nogil:
cdef int i, j
if n == 1:

@@ -131,19 +133,20 @@ cdef cppclass StateC:
ids[i] = -1

int S(int i) nogil const:
if i >= this._stack.size():
cdef int stack_size = this._stack.size()
if i >= stack_size or i < 0:
return -1
elif i < 0:
return -1
return this._stack.at(this._stack.size() - (i+1))
else:
return this._stack[stack_size - (i+1)]

int B(int i) nogil const:
cdef int buf_size = this._rebuffer.size()
if i < 0:
return -1
elif i < this._rebuffer.size():
return this._rebuffer.at(this._rebuffer.size() - (i+1))
elif i < buf_size:
return this._rebuffer[buf_size - (i+1)]
else:
b_i = this._b_i + (i - this._rebuffer.size())
b_i = this._b_i + (i - buf_size)
if b_i >= this.length:
return -1
else:

@@ -242,7 +245,7 @@ cdef cppclass StateC:
return 0
elif this._sent[word].sent_start == 1:
return 1
elif this._sent_starts.count(word) >= 1:
elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
return 1
else:
return 0

@@ -327,7 +330,7 @@ cdef cppclass StateC:
if item >= this._unshiftable.size():
return 0
else:
return this._unshiftable.at(item)
return this._unshiftable[item]

void set_reshiftable(int item) nogil:
if item < this._unshiftable.size():

@@ -347,6 +350,9 @@ cdef cppclass StateC:
this._heads[child] = head

void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
cdef vector[ArcC]* arcs
cdef ArcC* arc

arcs_it = heads_arcs.find(h_i)
if arcs_it == heads_arcs.end():
return

@@ -355,12 +361,12 @@ cdef cppclass StateC:
if arcs.size() == 0:
return

arc = arcs.back()
arc = &arcs.back()
if arc.head == h_i and arc.child == c_i:
arcs.pop_back()
else:
for i in range(arcs.size()-1):
arc = arcs.at(i)
arc = &deref(arcs)[i]
if arc.head == h_i and arc.child == c_i:
arc.head = -1
arc.child = -1

@@ -400,10 +406,11 @@ cdef cppclass StateC:
this._rebuffer = src._rebuffer
this._sent_starts = src._sent_starts
this._unshiftable = src._unshiftable
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
this._heads = src._heads
this._ents = src._ents
this._left_arcs = src._left_arcs
this._right_arcs = src._right_arcs
this._b_i = src._b_i
this.offset = src.offset
this._empty_token = src._empty_token
this.history = src.history

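The StateC changes above move the head array and stack/buffer bookkeeping onto C++ vectors and add bounds checks to the S(i) and B(i) accessors. A rough Python sketch of that indexing convention (stack and buffer viewed from the top, -1 for out-of-range), under the simplifying assumption that the stack stores token indices with the most recent push last and ignoring the _rebuffer/_b_i split:

class ParseStateSketch:
    def __init__(self, n_tokens):
        self.stack = []                       # pushed token indices, top at the end
        self.buffer = list(range(n_tokens))   # tokens still to be processed

    def S(self, i):
        # S(0) is the top of the stack, S(1) the one below it, etc.
        if i < 0 or i >= len(self.stack):
            return -1
        return self.stack[len(self.stack) - (i + 1)]

    def B(self, i):
        # B(0) is the first token of the buffer.
        if i < 0 or i >= len(self.buffer):
            return -1
        return self.buffer[i]

state = ParseStateSketch(3)
state.stack.append(0)
print(state.S(0), state.B(0))  # -> 0 0
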
@@ -15,7 +15,7 @@ from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC, ArcC
from ...errors import Errors
from thinc.extra.search cimport Beam
from .search cimport Beam

cdef weight_t MIN_SCORE = -90000
cdef attr_t SUBTOK_LABEL = hash_string('subtok')

@@ -773,6 +773,8 @@ cdef class ArcEager(TransitionSystem):
return list(arcs)

def has_gold(self, Example eg, start=0, end=None):
if end is not None and end < 0:
end = None
for word in eg.y[start:end]:
if word.dep != 0:
return True

@@ -858,6 +860,7 @@ cdef class ArcEager(TransitionSystem):
state.print_state()
)))
action.do(state.c, action.label)
state.c.history.push_back(i)
break
else:
failed = False

spacy/pipeline/_parser_internals/batch.pxd (new file, 2 lines)
@@ -0,0 +1,2 @@
cdef class Batch:
pass
spacy/pipeline/_parser_internals/batch.pyx (new file, 52 lines)
@@ -0,0 +1,52 @@
from typing import Any

TransitionSystem = Any # TODO

cdef class Batch:
def advance(self, scores):
raise NotImplementedError

def get_states(self):
raise NotImplementedError

@property
def is_done(self):
raise NotImplementedError

def get_unfinished_states(self):
raise NotImplementedError

def __getitem__(self, i):
raise NotImplementedError

def __len__(self):
raise NotImplementedError


class GreedyBatch(Batch):
def __init__(self, moves: TransitionSystem, states, golds):
self._moves = moves
self._states = states
self._next_states = [s for s in states if not s.is_final()]

def advance(self, scores):
self._next_states = self._moves.transition_states(self._next_states, scores)

def advance_with_actions(self, actions):
self._next_states = self._moves.apply_actions(self._next_states, actions)

def get_states(self):
return self._states

@property
def is_done(self):
return all(s.is_final() for s in self._states)

def get_unfinished_states(self):
return [st for st in self._states if not st.is_final()]

def __getitem__(self, i):
return self._states[i]

def __len__(self):
return len(self._states)
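GreedyBatch above wraps a set of parser states and advances all unfinished ones with each batch of scores. A hedged sketch of how such a batch might be driven from Python, assuming a scoring callable that returns one row per unfinished state; the model object and the exact init_batch call are placeholders, not APIs confirmed by this diff:

def greedy_parse(moves, model, docs):
    # moves.init_batch / transition_states correspond to TransitionSystem
    # methods used by GreedyBatch; `model` is a stand-in scoring callable.
    states = moves.init_batch(docs)
    batch = GreedyBatch(moves, states, golds=None)
    while not batch.is_done:
        unfinished = batch.get_unfinished_states()
        scores = model(unfinished)        # assumed shape: (len(unfinished), n_actions)
        batch.advance(scores)
    return batch.get_states()
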
@@ -1,10 +1,11 @@
import os
import random
from libc.stdint cimport int32_t
from libcpp.memory cimport shared_ptr
from libcpp.vector cimport vector
from cymem.cymem cimport Pool

from collections import Counter
from thinc.extra.search cimport Beam

from ...tokens.doc cimport Doc
from ...tokens.span import Span

@@ -15,6 +16,7 @@ from ...attrs cimport IS_SPACE
from ...structs cimport TokenC, SpanC
from ...training import split_bilu_label
from ...training.example cimport Example
from .search cimport Beam
from .stateclass cimport StateClass
from ._state cimport StateC
from .transition_system cimport Transition, do_func_t

@@ -43,9 +45,7 @@ MOVE_NAMES[OUT] = 'O'

cdef struct GoldNERStateC:
Transition* ner
SpanC* negs
int32_t length
int32_t nr_neg
vector[shared_ptr[SpanC]] negs


cdef class BiluoGold:

@@ -78,8 +78,6 @@ cdef GoldNERStateC create_gold_state(
negs = []
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
gs.nr_neg = len(negs)
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
for i, ner_tag in enumerate(ner_tags):
gs.ner[i] = moves.lookup_transition(ner_tag)

@@ -93,8 +91,8 @@ cdef GoldNERStateC create_gold_state(
# In order to handle negative samples, we need to maintain the full
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
# thing, we'll get blocked if there's an incorrect prefix.
for i, neg in enumerate(negs):
gs.negs[i] = neg.c
for neg in negs:
gs.negs.push_back(neg.c)
return gs


@@ -158,7 +156,7 @@ cdef class BiluoPushDown(TransitionSystem):
if token.ent_type:
labels.add(token.ent_type_)
return labels


def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'

@@ -308,6 +306,8 @@ cdef class BiluoPushDown(TransitionSystem):
for span in eg.y.spans.get(neg_key, []):
if span.start >= start and span.end <= end:
return True
if end is not None and end < 0:
end = None
for word in eg.y[start:end]:
if word.ent_iob != 0:
return True

@@ -411,6 +411,8 @@ cdef class Begin:
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label

cdef shared_ptr[SpanC] span

if g_act == MISSING:
pass
elif g_act == BEGIN:

@@ -428,8 +430,8 @@ cdef class Begin:
# be correct or not. However, we can at least tell whether we're
# going to be opening an entity where there's only one possible
# L.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0:
for span in gold.negs:
if span.get().label == label and span.get().start == b0:
cost += 1
break
return cost

@@ -574,8 +576,9 @@ cdef class Last:
# If we have negative-example entities, integrate them into the objective,
# by marking actions that close an entity that we know is incorrect
# as costly.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
cdef shared_ptr[SpanC] span
for span in gold.negs:
if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
cost += 1
break
return cost

@@ -639,12 +642,13 @@ cdef class Unit:
# This is fairly straight-forward for U- entities, as we have a single
# action
cdef int b0 = s.B(0)
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0 and span.end == (b0+1):
cdef shared_ptr[SpanC] span
for span in gold.negs:
if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
cost += 1
break
return cost


cdef class Out:

spacy/pipeline/_parser_internals/search.pxd (new file, 89 lines)
@@ -0,0 +1,89 @@
from cymem.cymem cimport Pool

from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from libcpp.pair cimport pair
from libcpp.queue cimport priority_queue
from libcpp.vector cimport vector

from ...typedefs cimport class_t, weight_t, hash_t

ctypedef pair[weight_t, size_t] Entry
ctypedef priority_queue[Entry] Queue


ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1

ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL

ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1

ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1

ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0


cdef struct _State:
void* content
class_t* hist
weight_t score
weight_t loss
int i
int t
bint is_done


cdef class Beam:
cdef Pool mem
cdef class_t nr_class
cdef class_t width
cdef class_t size
cdef public weight_t min_density
cdef int t
cdef readonly bint is_done
cdef list histories
cdef list _parent_histories
cdef weight_t** scores
cdef int** is_valid
cdef weight_t** costs
cdef _State* _parents
cdef _State* _states
cdef del_func_t del_func

cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1

cdef inline void* at(self, int i) nogil:
return self._states[i].content

cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
void* extra_args) except -1
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1


cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
self.scores[i][j] = score
self.is_valid[i][j] = is_valid
self.costs[i][j] = cost

cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
const weight_t* costs) except -1
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1


cdef class MaxViolation:
cdef Pool mem
cdef weight_t cost
cdef weight_t delta
cdef readonly weight_t p_score
cdef readonly weight_t g_score
cdef readonly double Z
cdef readonly double gZ
cdef class_t n
cdef readonly list p_hist
cdef readonly list g_hist
cdef readonly list p_probs
cdef readonly list g_probs

cpdef int check(self, Beam pred, Beam gold) except -1
cpdef int check_crf(self, Beam pred, Beam gold) except -1
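The Beam declaration above keeps, for each of the `width` candidate states, a row of per-class scores, validity flags and costs, and expands them into the best scoring continuations. As a loose illustration of that expansion step (keep the k best (state, action) pairs by total score), here is a plain-Python heap-based sketch; none of these names come from the diff, and score_actions is an assumed helper:

import heapq

def expand_beam(beam, k):
    # beam: list of (total_score, history) pairs for the current step.
    # Each candidate expands into per-action continuations; keep the best k.
    candidates = []
    for total, history in beam:
        for action, score in score_actions(history):   # assumed scoring helper
            candidates.append((total + score, history + [action]))
    return heapq.nlargest(k, candidates, key=lambda item: item[0])
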
306
spacy/pipeline/_parser_internals/search.pyx
Normal file
306
spacy/pipeline/_parser_internals/search.pyx
Normal file
|
@ -0,0 +1,306 @@
|
|||
# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
|
||||
cimport cython
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.math cimport log, exp
|
||||
import math
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
|
||||
cdef class Beam:
|
||||
def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0):
|
||||
assert nr_class != 0
|
||||
assert width != 0
|
||||
self.nr_class = nr_class
|
||||
self.width = width
|
||||
self.min_density = min_density
|
||||
self.size = 1
|
||||
self.t = 0
|
||||
self.mem = Pool()
|
||||
self.del_func = NULL
|
||||
self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State))
|
||||
self._states = <_State*>self.mem.alloc(self.width, sizeof(_State))
|
||||
cdef int i
|
||||
self.histories = [[] for i in range(self.width)]
|
||||
self._parent_histories = [[] for i in range(self.width)]
|
||||
|
||||
self.scores = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
self.is_valid = <int**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
self.costs = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
for i in range(self.width):
|
||||
self.scores[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
|
||||
self.is_valid[i] = <int*>self.mem.alloc(self.nr_class, sizeof(int))
|
||||
self.costs[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
|
||||
|
||||
def __len__(self):
|
||||
return self.size
|
||||
|
||||
property score:
|
||||
def __get__(self):
|
||||
return self._states[0].score
|
||||
|
||||
property min_score:
|
||||
def __get__(self):
|
||||
return self._states[self.size-1].score
|
||||
|
||||
property loss:
|
||||
def __get__(self):
|
||||
return self._states[0].loss
|
||||
|
||||
property probs:
|
||||
def __get__(self):
|
||||
return _softmax([self._states[i].score for i in range(self.size)])
|
||||
|
||||
property scores:
|
||||
def __get__(self):
|
||||
return [self._states[i].score for i in range(self.size)]
|
||||
|
||||
property histories:
|
||||
def __get__(self):
|
||||
return self.histories
|
||||
|
||||
cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
|
||||
const weight_t* costs) except -1:
|
||||
cdef int j
|
||||
for j in range(self.nr_class):
|
||||
self.scores[i][j] = scores[j]
|
||||
self.is_valid[i][j] = is_valid[j]
|
||||
self.costs[i][j] = costs[j]
|
||||
|
||||
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
|
||||
cdef int i, j
|
||||
for i in range(self.width):
|
||||
memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
|
||||
memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
|
||||
memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class)
|
||||
|
||||
cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1:
|
||||
for i in range(self.width):
|
||||
self._states[i].content = init_func(self.mem, n, extra_args)
|
||||
self._parents[i].content = init_func(self.mem, n, extra_args)
|
||||
self.del_func = del_func
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.del_func == NULL:
|
||||
return
|
||||
|
||||
for i in range(self.width):
|
||||
self.del_func(self.mem, self._states[i].content, NULL)
|
||||
self.del_func(self.mem, self._parents[i].content, NULL)
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
|
||||
void* extra_args) except -1:
|
||||
cdef weight_t** scores = self.scores
|
||||
cdef int** is_valid = self.is_valid
|
||||
cdef weight_t** costs = self.costs
|
||||
|
||||
cdef Queue* q = new Queue()
|
||||
self._fill(q, scores, is_valid)
|
||||
# For a beam of width k, we only ever need 2k state objects. How?
|
||||
# Each transition takes a parent and a class and produces a new state.
|
||||
# So, we don't need the whole history --- just the parent. So at
|
||||
# each step, we take a parent, and apply one or more extensions to
|
||||
# it.
|
||||
self._parents, self._states = self._states, self._parents
|
||||
self._parent_histories, self.histories = self.histories, self._parent_histories
|
||||
cdef weight_t score
|
||||
cdef int p_i
|
||||
cdef int i = 0
|
||||
cdef class_t clas
|
||||
cdef _State* parent
|
||||
cdef _State* state
|
||||
cdef hash_t key
|
||||
cdef PreshMap seen_states = PreshMap(self.width)
|
||||
cdef uint64_t is_seen
|
||||
cdef uint64_t one = 1
|
||||
while i < self.width and not q.empty():
|
||||
data = q.top()
|
||||
p_i = data.second / self.nr_class
|
||||
clas = data.second % self.nr_class
|
||||
score = data.first
|
||||
q.pop()
|
||||
parent = &self._parents[p_i]
|
||||
# Indicates terminal state reached; i.e. state is done
|
||||
if parent.is_done:
|
||||
# Now parent will not be changed, so we don't have to copy.
|
||||
# Once finished, should also be unbranching.
|
||||
self._states[i], parent[0] = parent[0], self._states[i]
|
||||
parent.i = self._states[i].i
|
||||
parent.t = self._states[i].t
|
||||
parent.is_done = self._states[i].t
|
||||
self._states[i].score = score
|
||||
self.histories[i] = list(self._parent_histories[p_i])
|
||||
i += 1
|
||||
else:
|
||||
state = &self._states[i]
|
||||
# The supplied transition function should adjust the destination
|
||||
# state to be the result of applying the class to the source state
|
||||
transition_func(state.content, parent.content, clas, extra_args)
|
||||
key = hash_func(state.content, extra_args) if hash_func is not NULL else 0
|
||||
is_seen = <uint64_t>seen_states.get(key)
|
||||
if key == 0 or key == 1 or not is_seen:
|
||||
if key != 0 and key != 1:
|
||||
seen_states.set(key, <void*>one)
|
||||
state.score = score
|
||||
state.loss = parent.loss + costs[p_i][clas]
|
||||
self.histories[i] = list(self._parent_histories[p_i])
|
||||
self.histories[i].append(clas)
|
||||
i += 1
|
||||
del q
|
||||
self.size = i
|
||||
assert self.size >= 1
|
||||
for i in range(self.width):
|
||||
memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class)
|
||||
memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class)
|
||||
memset(self.is_valid[i], 0, sizeof(int) * self.nr_class)
|
||||
self.t += 1
|
||||
|
||||
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1:
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
if not self._states[i].is_done:
|
||||
self._states[i].is_done = finish_func(self._states[i].content, extra_args)
|
||||
for i in range(self.size):
|
||||
if not self._states[i].is_done:
|
||||
self.is_done = False
|
||||
break
|
||||
else:
|
||||
self.is_done = True
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1:
|
||||
"""Populate the queue from a k * n matrix of scores, where k is the
|
||||
beam-width, and n is the number of classes.
|
||||
"""
|
||||
cdef Entry entry
|
||||
cdef weight_t score
|
||||
cdef _State* s
|
||||
cdef int i, j, move_id
|
||||
assert self.size >= 1
|
||||
cdef vector[Entry] entries
|
||||
for i in range(self.size):
|
||||
s = &self._states[i]
|
||||
move_id = i * self.nr_class
|
||||
if s.is_done:
|
||||
# Update score by path average, following TACL '13 paper.
|
||||
if self.histories[i]:
|
||||
entry.first = s.score + (s.score / self.t)
|
||||
else:
|
||||
entry.first = s.score
|
||||
entry.second = move_id
|
||||
entries.push_back(entry)
|
||||
else:
|
||||
for j in range(self.nr_class):
|
||||
if is_valid[i][j]:
|
||||
entry.first = s.score + scores[i][j]
|
||||
entry.second = move_id + j
|
||||
entries.push_back(entry)
|
||||
cdef double max_, Z, cutoff
|
||||
if self.min_density == 0.0:
|
||||
for i in range(entries.size()):
|
||||
q.push(entries[i])
|
||||
elif not entries.empty():
|
||||
max_ = entries[0].first
|
||||
Z = 0.
|
||||
cutoff = 0.
|
||||
# Softmax into probabilities, so we can prune
|
||||
for i in range(entries.size()):
|
||||
if entries[i].first > max_:
|
||||
max_ = entries[i].first
|
||||
for i in range(entries.size()):
|
||||
Z += exp(entries[i].first-max_)
|
||||
cutoff = (1. / Z) * self.min_density
|
||||
for i in range(entries.size()):
|
||||
prob = exp(entries[i].first-max_) / Z
|
||||
if prob >= cutoff:
|
||||
q.push(entries[i])
|
||||
|
||||
|
||||
cdef class MaxViolation:
|
||||
def __init__(self):
|
||||
self.p_score = 0.0
|
||||
self.g_score = 0.0
|
||||
self.Z = 0.0
|
||||
self.gZ = 0.0
|
||||
self.delta = -1
|
||||
self.cost = 0
|
||||
self.p_hist = []
|
||||
self.g_hist = []
|
||||
self.p_probs = []
|
||||
self.g_probs = []
|
||||
|
||||
cpdef int check(self, Beam pred, Beam gold) except -1:
|
||||
cdef _State* p = &pred._states[0]
|
||||
cdef _State* g = &gold._states[0]
|
||||
cdef weight_t d = p.score - g.score
|
||||
if p.loss >= 1 and (self.cost == 0 or d > self.delta):
|
||||
self.cost = p.loss
|
||||
self.delta = d
|
||||
self.p_hist = list(pred.histories[0])
|
||||
self.g_hist = list(gold.histories[0])
|
||||
self.p_score = p.score
|
||||
self.g_score = g.score
|
||||
self.Z = 1e-10
|
||||
self.gZ = 1e-10
|
||||
for i in range(pred.size):
|
||||
if pred._states[i].loss > 0:
|
||||
self.Z += exp(pred._states[i].score)
|
||||
for i in range(gold.size):
|
||||
if gold._states[i].loss == 0:
|
||||
prob = exp(gold._states[i].score)
|
||||
self.Z += prob
|
||||
self.gZ += prob
|
||||
|
||||
cpdef int check_crf(self, Beam pred, Beam gold) except -1:
|
||||
d = pred.score - gold.score
|
||||
seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)])
|
||||
if pred.loss > 0 and (self.cost == 0 or d > self.delta):
|
||||
p_hist = []
|
||||
p_scores = []
|
||||
g_hist = []
|
||||
g_scores = []
|
||||
for i in range(pred.size):
|
||||
if pred._states[i].loss > 0:
|
||||
p_scores.append(pred._states[i].score)
|
||||
p_hist.append(list(pred.histories[i]))
|
||||
# This can happen from non-monotonic actions
|
||||
# If we find a better gold analysis this way, be sure to keep it.
|
||||
elif pred._states[i].loss <= 0 \
|
||||
and tuple(pred.histories[i]) not in seen_golds:
|
||||
g_scores.append(pred._states[i].score)
|
||||
g_hist.append(list(pred.histories[i]))
|
||||
for i in range(gold.size):
|
||||
if gold._states[i].loss == 0:
|
||||
g_scores.append(gold._states[i].score)
|
||||
g_hist.append(list(gold.histories[i]))
|
||||
|
||||
all_probs = _softmax(p_scores + g_scores)
|
||||
p_probs = all_probs[:len(p_scores)]
|
||||
g_probs_all = all_probs[len(p_scores):]
|
||||
g_probs = _softmax(g_scores)
|
||||
|
||||
self.cost = pred.loss
|
||||
self.delta = d
|
||||
self.p_hist = p_hist
|
||||
self.g_hist = g_hist
|
||||
# TODO: These variables are misnamed! These are the gradients of the loss.
|
||||
self.p_probs = p_probs
|
||||
# Intuition here:
|
||||
# The gradient of the loss is:
|
||||
# P(model) - P(truth)
|
||||
# Normally, P(truth) is 1 for the gold
|
||||
# But, if we want to do the "partial credit" scheme, we want
|
||||
# to create a distribution over the gold, proportional to the scores
|
||||
# awarded.
|
||||
self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)]
|
||||
|
||||
|
||||
def _softmax(nums):
|
||||
if not nums:
|
||||
return []
|
||||
max_ = max(nums)
|
||||
nums = [(exp(n-max_) if n is not None else None) for n in nums]
|
||||
Z = sum(n for n in nums if n is not None)
|
||||
return [(n/Z if n is not None else None) for n in nums]
|
|
@@ -20,6 +20,10 @@ cdef class StateClass:
if self._borrowed != 1:
del self.c

@property
def history(self):
return list(self.c.history)

@property
def stack(self):
return [self.S(i) for i in range(self.c.stack_depth())]

@@ -176,3 +180,6 @@ cdef class StateClass:

def clone(self, StateClass src):
self.c.clone(src.c)

def set_context_tokens(self, int[:, :] output, int row, int n_feats):
self.c.set_context_tokens(&output[row, 0], n_feats)

@@ -53,3 +53,10 @@ cdef class TransitionSystem:

cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1


cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
int batch_size) nogil

cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
int nr_class, int batch_size) nogil

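The new c_apply_actions and c_transition_batch declarations above batch the per-state greedy update. A small Python sketch of what one such update does for a single state (choose the best valid action, apply it, record it in the history); the method names mirror the Cython code but the objects and get_valid helper are stand-ins, not APIs from the diff:

def transition_state(moves, state, scores):
    # scores: one float per action class for this state
    is_valid = moves.get_valid(state)           # stand-in for moves.set_valid(...)
    guess = arg_max_if_valid(scores, is_valid)  # see the helper sketched earlier
    if guess == -1:
        # No valid action left: force the state to finish rather than loop forever.
        state.force_final()
        return
    action = moves.actions[guess]
    action.do(state, action.label)
    state.history.append(guess)
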
@ -1,6 +1,8 @@
|
|||
# cython: infer_types=True
|
||||
from __future__ import print_function
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdlib cimport calloc, free
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from collections import Counter
|
||||
import srsly
|
||||
|
@ -10,6 +12,7 @@ from ...typedefs cimport weight_t, attr_t
|
|||
from ...tokens.doc cimport Doc
|
||||
from ...structs cimport TokenC
|
||||
from .stateclass cimport StateClass
|
||||
from ._parser_utils cimport arg_max_if_valid
|
||||
|
||||
from ...errors import Errors
|
||||
from ... import util
|
||||
|
@ -73,7 +76,18 @@ cdef class TransitionSystem:
|
|||
offset += len(doc)
|
||||
return states
|
||||
|
||||
def follow_history(self, doc, history):
|
||||
cdef int clas
|
||||
cdef StateClass state = StateClass(doc)
|
||||
for clas in history:
|
||||
action = self.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(clas)
|
||||
return state
|
||||
|
||||
def get_oracle_sequence(self, Example example, _debug=False):
|
||||
if not self.has_gold(example):
|
||||
return []
|
||||
states, golds, _ = self.init_gold_batch([example])
|
||||
if not states:
|
||||
return []
|
||||
|
@ -85,6 +99,8 @@ cdef class TransitionSystem:
|
|||
return self.get_oracle_sequence_from_state(state, gold)
|
||||
|
||||
def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
|
||||
if state.is_final():
|
||||
return []
|
||||
cdef Pool mem = Pool()
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
assert self.n_moves > 0
|
||||
|
@ -110,6 +126,7 @@ cdef class TransitionSystem:
|
|||
"S0 head?", str(state.has_head(state.S(0))),
|
||||
)))
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(i)
|
||||
break
|
||||
else:
|
||||
if _debug:
|
||||
|
@ -137,6 +154,28 @@ cdef class TransitionSystem:
|
|||
raise ValueError(Errors.E170.format(name=name))
|
||||
action = self.lookup_transition(name)
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(action.clas)
|
||||
|
||||
def apply_actions(self, states, const int[::1] actions):
|
||||
assert len(states) == actions.shape[0]
|
||||
cdef StateClass state
|
||||
cdef vector[StateC*] c_states
|
||||
c_states.resize(len(states))
|
||||
cdef int i
|
||||
for (i, state) in enumerate(states):
|
||||
c_states[i] = state.c
|
||||
c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
def transition_states(self, states, float[:, ::1] scores):
|
||||
assert len(states) == scores.shape[0]
|
||||
cdef StateClass state
|
||||
cdef float* c_scores = &scores[0, 0]
|
||||
cdef vector[StateC*] c_states
|
||||
for state in states:
|
||||
c_states.push_back(state.c)
|
||||
c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
raise NotImplementedError
|
||||
|
@ -250,3 +289,35 @@ cdef class TransitionSystem:
|
|||
self.cfg.update(msg['cfg'])
|
||||
self.initialize_actions(labels)
|
||||
return self
|
||||
|
||||
|
||||
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
|
||||
int batch_size) nogil:
|
||||
cdef int i
|
||||
cdef Transition action
|
||||
cdef StateC* state
|
||||
for i in range(batch_size):
|
||||
state = states[i]
|
||||
action = moves.c[actions[i]]
|
||||
action.do(state, action.label)
|
||||
state.history.push_back(action.clas)
|
||||
|
||||
|
||||
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil:
|
||||
is_valid = <int*>calloc(moves.n_moves, sizeof(int))
|
||||
cdef int i, guess
|
||||
cdef Transition action
|
||||
for i in range(batch_size):
|
||||
moves.set_valid(is_valid, states[i])
|
||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||
if guess == -1:
|
||||
# This shouldn't happen, but it's hard to raise an error here,
|
||||
# and we don't want to infinite loop. So, force to end state.
|
||||
states[i].force_final()
|
||||
else:
|
||||
action = moves.c[guess]
|
||||
action.do(states[i], action.label)
|
||||
states[i].history.push_back(guess)
|
||||
free(is_valid)
|
||||
|
||||
|
|
|
@@ -11,7 +11,7 @@ from ..matcher import Matcher
from ..scorer import Scorer
from ..symbols import IDS
from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
from ..vocab import Vocab
from ..util import SimpleFrozenList, registry
from .. import util
@ -4,8 +4,8 @@ from typing import Optional, Iterable, Callable
|
|||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
from .transition_parser cimport Parser
|
||||
from ._parser_internals.arc_eager cimport ArcEager
|
||||
from .transition_parser import Parser
|
||||
from ._parser_internals.arc_eager import ArcEager
|
||||
|
||||
from .functions import merge_subtokens
|
||||
from ..language import Language
|
||||
|
@ -18,12 +18,11 @@ from ..util import registry
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@ -123,6 +122,7 @@ def make_parser(
|
|||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"beam_parser",
|
||||
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
|
||||
|
@ -228,6 +228,7 @@ def parser_score(examples, **kwargs):
|
|||
|
||||
DOCS: https://spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
|
@ -235,8 +236,11 @@ def parser_score(examples, **kwargs):
|
|||
dep = getattr(token, attr)
|
||||
dep = token.vocab.strings.as_string(dep).lower()
|
||||
return dep
|
||||
|
||||
results = {}
|
||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
||||
results.update(
|
||||
Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||
)
|
||||
kwargs.setdefault("getter", dep_getter)
|
||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||
|
@ -249,11 +253,12 @@ def make_parser_scorer():
|
|||
return parser_score
|
||||
|
||||
|
||||
cdef class DependencyParser(Parser):
|
||||
class DependencyParser(Parser):
|
||||
"""Pipeline component for dependency parsing.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser
|
||||
"""
|
||||
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
def __init__(
|
||||
|
@ -273,8 +278,7 @@ cdef class DependencyParser(Parser):
|
|||
incorrect_spans_key=None,
|
||||
scorer=parser_score,
|
||||
):
|
||||
"""Create a DependencyParser.
|
||||
"""
|
||||
"""Create a DependencyParser."""
|
||||
super().__init__(
|
||||
vocab,
|
||||
model,
|
|
@ -1,12 +1,13 @@
|
|||
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
|
||||
from typing import cast, Any, Callable, Dict, Iterable, List, Optional, Union
|
||||
from typing import Tuple
|
||||
from collections import Counter
|
||||
from itertools import islice
|
||||
import numpy as np
|
||||
|
||||
import srsly
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d, Ints2d
|
||||
from thinc.api import Config, Model
|
||||
from thinc.types import ArrayXd, Floats2d, Ints1d
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
|
||||
from ._edit_tree_internals.edit_trees import EditTrees
|
||||
from ._edit_tree_internals.schemas import validate_edit_tree
|
||||
|
@ -20,6 +21,9 @@ from ..vocab import Vocab
|
|||
from .. import util
|
||||
|
||||
|
||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -48,6 +52,7 @@ DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["mo
|
|||
"overwrite": False,
|
||||
"top_k": 1,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
|
@ -60,6 +65,7 @@ def make_edit_tree_lemmatizer(
|
|||
overwrite: bool,
|
||||
top_k: int,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
):
|
||||
"""Construct an EditTreeLemmatizer component."""
|
||||
return EditTreeLemmatizer(
|
||||
|
@ -71,6 +77,7 @@ def make_edit_tree_lemmatizer(
|
|||
overwrite=overwrite,
|
||||
top_k=top_k,
|
||||
scorer=scorer,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@ -90,6 +97,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
overwrite: bool = False,
|
||||
top_k: int = 1,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
"""
|
||||
Construct an edit tree lemmatizer.
|
||||
|
@ -101,6 +109,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
frequency in the training data.
|
||||
overwrite (bool): overwrite existing lemma annotations.
|
||||
top_k (int): try to apply at most the k most probable edit trees.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -115,12 +124,15 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
self.cfg: Dict[str, Any] = {"labels": []}
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
def get_loss(
|
||||
self, examples: Iterable[Example], scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
validate_examples(examples, "EditTreeLemmatizer.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(
|
||||
normalize=False, missing_value=-1
|
||||
)
|
||||
|
||||
truths = []
|
||||
for eg in examples:
|
||||
|
@ -143,19 +155,43 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
return float(loss), d_scores
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
|
||||
def get_teacher_student_loss(
|
||||
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
|
||||
"""
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
|
||||
d_scores, loss = loss_func(student_scores, teacher_scores)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
return float(loss), d_scores
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
n_docs = len(list(docs))
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
n_labels = len(self.cfg["labels"])
|
||||
guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
|
||||
guesses: List[Ints1d] = [
|
||||
self.model.ops.alloc((0,), dtype="i") for doc in docs
|
||||
]
|
||||
scores: List[Floats2d] = [
|
||||
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
|
||||
]
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
return {"probabilities": scores, "tree_ids": guesses}
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == n_docs
|
||||
guesses = self._scores2guesses(docs, scores)
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
return {"probabilities": scores, "tree_ids": guesses}
|
||||
|
||||
def _scores2guesses(self, docs, scores):
|
||||
guesses = []
|
||||
|
@ -183,8 +219,13 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
|
||||
batch_tree_ids = activations["tree_ids"]
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
doc.activations[self.name][act_name] = acts[i]
|
||||
doc_tree_ids = batch_tree_ids[i]
|
||||
if hasattr(doc_tree_ids, "get"):
|
||||
doc_tree_ids = doc_tree_ids.get()
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from typing import Optional, Iterable, Callable, Dict, Union, List, Any
|
||||
from thinc.types import Floats2d
|
||||
from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
|
||||
from typing import cast
|
||||
from numpy import dtype
|
||||
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
||||
from pathlib import Path
|
||||
from itertools import islice
|
||||
import srsly
|
||||
|
@ -21,6 +23,11 @@ from ..util import SimpleFrozenList, registry
|
|||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
|
||||
|
||||
ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
|
||||
|
||||
KNOWLEDGE_BASE_IDS = "kb_ids"
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
|
||||
|
@ -59,6 +66,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"use_gold_ents": True,
|
||||
"candidates_batch_size": 1,
|
||||
"threshold": None,
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
"nel_micro_f": 1.0,
|
||||
|
@ -85,6 +93,7 @@ def make_entity_linker(
|
|||
use_gold_ents: bool,
|
||||
candidates_batch_size: int,
|
||||
threshold: Optional[float] = None,
|
||||
save_activations: bool,
|
||||
):
|
||||
"""Construct an EntityLinker component.
|
||||
|
||||
|
@ -107,6 +116,7 @@ def make_entity_linker(
|
|||
candidates_batch_size (int): Size of batches for entity candidate generation.
|
||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
|
||||
prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
|
||||
if not model.attrs.get("include_span_maker", False):
|
||||
|
@ -140,6 +150,7 @@ def make_entity_linker(
|
|||
use_gold_ents=use_gold_ents,
|
||||
candidates_batch_size=candidates_batch_size,
|
||||
threshold=threshold,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@ -180,6 +191,7 @@ class EntityLinker(TrainablePipe):
|
|||
use_gold_ents: bool,
|
||||
candidates_batch_size: int,
|
||||
threshold: Optional[float] = None,
|
||||
save_activations: bool = False,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
|
@ -234,6 +246,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.use_gold_ents = use_gold_ents
|
||||
self.candidates_batch_size = candidates_batch_size
|
||||
self.threshold = threshold
|
||||
self.save_activations = save_activations
|
||||
|
||||
if candidates_batch_size < 1:
|
||||
raise ValueError(Errors.E1044)
|
||||
|
@ -422,7 +435,7 @@ class EntityLinker(TrainablePipe):
|
|||
loss = loss / len(entity_encodings)
|
||||
return float(loss), out
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[str]:
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
Returns the KB IDs for each entity in each doc, including NIL if there is
|
||||
no prediction.
|
||||
|
@ -435,13 +448,20 @@ class EntityLinker(TrainablePipe):
|
|||
self.validate_kb()
|
||||
entity_count = 0
|
||||
final_kb_ids: List[str] = []
|
||||
xp = self.model.ops.xp
|
||||
ops = self.model.ops
|
||||
xp = ops.xp
|
||||
docs_ents: List[Ragged] = []
|
||||
docs_scores: List[Ragged] = []
|
||||
if not docs:
|
||||
return final_kb_ids
|
||||
return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
for i, doc in enumerate(docs):
|
||||
for doc in docs:
|
||||
doc_ents: List[Ints1d] = []
|
||||
doc_scores: List[Floats1d] = []
|
||||
if len(doc) == 0:
|
||||
docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
|
||||
docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
|
||||
continue
|
||||
sentences = [s for s in doc.sents]
|
||||
|
||||
|
@ -489,14 +509,32 @@ class EntityLinker(TrainablePipe):
|
|||
if ent.label_ in self.labels_discard:
|
||||
# ignoring this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[0.0],
|
||||
ents=[0],
|
||||
)
|
||||
else:
|
||||
candidates = list(batch_candidates[j])
|
||||
if not candidates:
|
||||
# no prediction possible for this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[0.0],
|
||||
ents=[0],
|
||||
)
|
||||
elif len(candidates) == 1 and self.threshold is None:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[1.0],
|
||||
ents=[candidates[0].entity_],
|
||||
)
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
# set all prior probabilities to 0 if incl_prior=False
|
||||
|
@ -530,28 +568,48 @@ class EntityLinker(TrainablePipe):
|
|||
or scores.max() >= self.threshold
|
||||
else EntityLinker.NIL
|
||||
)
|
||||
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=scores,
|
||||
ents=[c.entity for c in candidates],
|
||||
)
|
||||
self._add_doc_activations(
|
||||
docs_scores=docs_scores,
|
||||
docs_ents=docs_ents,
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
)
|
||||
if not (len(final_kb_ids) == entity_count):
|
||||
err = Errors.E147.format(
|
||||
method="predict", msg="result variables not of equal length"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
return final_kb_ids
|
||||
return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
||||
activations (ActivationsT): The activations used for setting annotations, produced
|
||||
by EntityLinker.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#set_annotations
|
||||
"""
|
||||
kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
|
||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||
if count_ents != len(kb_ids):
|
||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||
i = 0
|
||||
overwrite = self.cfg["overwrite"]
|
||||
for doc in docs:
|
||||
for j, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
if act_name != KNOWLEDGE_BASE_IDS:
|
||||
# We only copy activations that are Ragged.
|
||||
doc.activations[self.name][act_name] = cast(Ragged, acts[j])
|
||||
|
||||
for ent in doc.ents:
|
||||
kb_id = kb_ids[i]
|
||||
i += 1
|
||||
|
@ -650,3 +708,32 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
def add_label(self, label):
|
||||
raise NotImplementedError
|
||||
|
||||
def _add_doc_activations(
|
||||
self,
|
||||
*,
|
||||
docs_scores: List[Ragged],
|
||||
docs_ents: List[Ragged],
|
||||
doc_scores: List[Floats1d],
|
||||
doc_ents: List[Ints1d],
|
||||
):
|
||||
if not self.save_activations:
|
||||
return
|
||||
ops = self.model.ops
|
||||
lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
|
||||
docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
|
||||
docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
|
||||
|
||||
def _add_activations(
|
||||
self,
|
||||
*,
|
||||
doc_scores: List[Floats1d],
|
||||
doc_ents: List[Ints1d],
|
||||
scores: Sequence[float],
|
||||
ents: Sequence[int],
|
||||
):
|
||||
if not self.save_activations:
|
||||
return
|
||||
ops = self.model.ops
|
||||
doc_scores.append(ops.asarray1f(scores))
|
||||
doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
|
||||
|
|
|
@@ -1,541 +0,0 @@
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
import warnings
from collections import defaultdict
from pathlib import Path
import srsly

from .pipe import Pipe
from ..training import Example
from ..language import Language
from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from ..scorer import get_ner_prf


DEFAULT_ENT_ID_SEP = "||"
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]


@Language.factory(
    "entity_ruler",
    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
    default_config={
        "phrase_matcher_attr": None,
        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
        "validate": False,
        "overwrite_ents": False,
        "ent_id_sep": DEFAULT_ENT_ID_SEP,
        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
    },
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite_ents: bool,
    ent_id_sep: str,
    scorer: Optional[Callable],
):
    return EntityRuler(
        nlp,
        name,
        phrase_matcher_attr=phrase_matcher_attr,
        matcher_fuzzy_compare=matcher_fuzzy_compare,
        validate=validate,
        overwrite_ents=overwrite_ents,
        ent_id_sep=ent_id_sep,
        scorer=scorer,
    )


def entity_ruler_score(examples, **kwargs):
    return get_ner_prf(examples)


@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
    return entity_ruler_score


class EntityRuler(Pipe):
    """The EntityRuler lets you add spans to the `Doc.ents` using token-based
    rules or exact phrase matches. It can be combined with the statistical
    `EntityRecognizer` to boost accuracy, or used on its own to implement a
    purely rule-based entity recognition system. After initialization, the
    component is typically added to the pipeline using `nlp.add_pipe`.

    DOCS: https://spacy.io/api/entityruler
    USAGE: https://spacy.io/usage/rule-based-matching#entityruler
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "entity_ruler",
        *,
        phrase_matcher_attr: Optional[Union[int, str]] = None,
        matcher_fuzzy_compare: Callable = levenshtein_compare,
        validate: bool = False,
        overwrite_ents: bool = False,
        ent_id_sep: str = DEFAULT_ENT_ID_SEP,
        patterns: Optional[List[PatternType]] = None,
        scorer: Optional[Callable] = entity_ruler_score,
    ) -> None:
        """Initialize the entity ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.

        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        name (str): Instance name of the current pipeline component. Typically
            passed in automatically from the factory when the component is
            added. Used to disable the current entity ruler while creating
            phrase patterns with the nlp object.
        phrase_matcher_attr (int / str): Token attribute to match on, passed
            to the internal PhraseMatcher as `attr`.
        matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
            internal Matcher. Defaults to
            spacy.matcher.levenshtein.levenshtein_compare.
        validate (bool): Whether patterns should be validated, passed to
            Matcher and PhraseMatcher as `validate`.
        patterns (iterable): Optional patterns to load in.
        overwrite_ents (bool): If existing entities are present, e.g. entities
            added by the model, overwrite them by matches if necessary.
        ent_id_sep (str): Separator used internally for entity IDs.
        scorer (Optional[Callable]): The scoring method. Defaults to
            spacy.scorer.get_ner_prf.

        DOCS: https://spacy.io/api/entityruler#init
        """
        self.nlp = nlp
        self.name = name
        self.overwrite = overwrite_ents
        self.token_patterns = defaultdict(list)  # type: ignore
        self.phrase_patterns = defaultdict(list)  # type: ignore
        self._validate = validate
        self.matcher_fuzzy_compare = matcher_fuzzy_compare
        self.matcher = Matcher(
            nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
        )
        self.phrase_matcher_attr = phrase_matcher_attr
        self.phrase_matcher = PhraseMatcher(
            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
        )
        self.ent_id_sep = ent_id_sep
        self._ent_ids = defaultdict(tuple)  # type: ignore
        if patterns is not None:
            self.add_patterns(patterns)
        self.scorer = scorer

    def __len__(self) -> int:
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
        return n_token_patterns + n_phrase_patterns

    def __contains__(self, label: str) -> bool:
        """Whether a label is present in the patterns."""
        return label in self.token_patterns or label in self.phrase_patterns

    def __call__(self, doc: Doc) -> Doc:
        """Find matches in document and add them as entities.

        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.

        DOCS: https://spacy.io/api/entityruler#call
        """
        error_handler = self.get_error_handler()
        try:
            matches = self.match(doc)
            self.set_annotations(doc, matches)
            return doc
        except Exception as e:
            return error_handler(self.name, self, [doc], e)

    def match(self, doc: Doc):
        self._require_patterns()
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="\\[W036")
            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))

        final_matches = set(
            [(m_id, start, end) for m_id, start, end in matches if start != end]
        )
        get_sort_key = lambda m: (m[2] - m[1], -m[1])
        final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
        return final_matches

    def set_annotations(self, doc, matches):
        """Modify the document in place"""
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                if match_id in self._ent_ids:
                    label, ent_id = self._ent_ids[match_id]
                    span = Span(doc, start, end, label=label, span_id=ent_id)
                else:
                    span = Span(doc, start, end, label=match_id)
                new_entities.append(span)
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities

    @property
    def labels(self) -> Tuple[str, ...]:
        """All labels present in the match patterns.

        RETURNS (tuple): The string labels.

        DOCS: https://spacy.io/api/entityruler#labels
        """
        keys = set(self.token_patterns.keys())
        keys.update(self.phrase_patterns.keys())
        all_labels = set()

        for l in keys:
            if self.ent_id_sep in l:
                label, _ = self._split_label(l)
                all_labels.add(label)
            else:
                all_labels.add(l)
        return tuple(sorted(all_labels))

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        patterns: Optional[Sequence[PatternType]] = None,
    ):
        """Initialize the pipe for training.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        patterns (Optional[Sequence[PatternType]]): The list of patterns.

        DOCS: https://spacy.io/api/entityruler#initialize
        """
        self.clear()
        if patterns:
            self.add_patterns(patterns)  # type: ignore[arg-type]

    @property
    def ent_ids(self) -> Tuple[Optional[str], ...]:
        """All entity ids present in the match patterns' `id` properties.

        RETURNS (tuple): The string entity ids.

        DOCS: https://spacy.io/api/entityruler#ent_ids
        """
        keys = set(self.token_patterns.keys())
        keys.update(self.phrase_patterns.keys())
        all_ent_ids = set()

        for l in keys:
            if self.ent_id_sep in l:
                _, ent_id = self._split_label(l)
                all_ent_ids.add(ent_id)
        return tuple(all_ent_ids)

    @property
    def patterns(self) -> List[PatternType]:
        """Get all patterns that were added to the entity ruler.

        RETURNS (list): The original patterns, one dictionary per pattern.

        DOCS: https://spacy.io/api/entityruler#patterns
        """
        all_patterns = []
        for label, patterns in self.token_patterns.items():
            for pattern in patterns:
                ent_label, ent_id = self._split_label(label)
                p = {"label": ent_label, "pattern": pattern}
                if ent_id:
                    p["id"] = ent_id
                all_patterns.append(p)
        for label, patterns in self.phrase_patterns.items():
            for pattern in patterns:
                ent_label, ent_id = self._split_label(label)
                p = {"label": ent_label, "pattern": pattern.text}
                if ent_id:
                    p["id"] = ent_id
                all_patterns.append(p)
        return all_patterns

    def add_patterns(self, patterns: List[PatternType]) -> None:
        """Add patterns to the entity ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}

        patterns (list): The patterns to add.

        DOCS: https://spacy.io/api/entityruler#add_patterns
        """

        # disable the nlp components after this one in case they haven't been
        # initialized / deserialized yet
        try:
            current_index = -1
            for i, (name, pipe) in enumerate(self.nlp.pipeline):
                if self == pipe:
                    current_index = i
                    break
            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
        except ValueError:
            subsequent_pipes = []
        with self.nlp.select_pipes(disable=subsequent_pipes):
            token_patterns = []
            phrase_pattern_labels = []
            phrase_pattern_texts = []
            phrase_pattern_ids = []
            for entry in patterns:
                if isinstance(entry["pattern"], str):
                    phrase_pattern_labels.append(entry["label"])
                    phrase_pattern_texts.append(entry["pattern"])
                    phrase_pattern_ids.append(entry.get("id"))
                elif isinstance(entry["pattern"], list):
                    token_patterns.append(entry)
            phrase_patterns = []
            for label, pattern, ent_id in zip(
                phrase_pattern_labels,
                self.nlp.pipe(phrase_pattern_texts),
                phrase_pattern_ids,
            ):
                phrase_pattern = {"label": label, "pattern": pattern}
                if ent_id:
                    phrase_pattern["id"] = ent_id
                phrase_patterns.append(phrase_pattern)
            for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
                label = entry["label"]  # type: ignore
                if "id" in entry:
                    ent_label = label
                    label = self._create_label(label, entry["id"])
                    key = self.matcher._normalize_key(label)
                    self._ent_ids[key] = (ent_label, entry["id"])
                pattern = entry["pattern"]  # type: ignore
                if isinstance(pattern, Doc):
                    self.phrase_patterns[label].append(pattern)
                    self.phrase_matcher.add(label, [pattern])  # type: ignore
                elif isinstance(pattern, list):
                    self.token_patterns[label].append(pattern)
                    self.matcher.add(label, [pattern])
                else:
                    raise ValueError(Errors.E097.format(pattern=pattern))

    def clear(self) -> None:
        """Reset all patterns."""
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
        self._ent_ids = defaultdict(tuple)
        self.matcher = Matcher(
            self.nlp.vocab,
            validate=self._validate,
            fuzzy_compare=self.matcher_fuzzy_compare,
        )
        self.phrase_matcher = PhraseMatcher(
            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
        )

    def remove(self, ent_id: str) -> None:
        """Remove a pattern by its ent_id if a pattern with this ent_id was added before.

        ent_id (str): id of the pattern to be removed
        RETURNS: None
        DOCS: https://spacy.io/api/entityruler#remove
        """
        label_id_pairs = [
            (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
        ]
        if not label_id_pairs:
            raise ValueError(
                Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
            )
        created_labels = [
            self._create_label(label, eid) for (label, eid) in label_id_pairs
        ]
        # remove the patterns from self.phrase_patterns
        self.phrase_patterns = defaultdict(
            list,
            {
                label: val
                for (label, val) in self.phrase_patterns.items()
                if label not in created_labels
            },
        )
        # remove the patterns from self.token_patterns
        self.token_patterns = defaultdict(
            list,
            {
                label: val
                for (label, val) in self.token_patterns.items()
                if label not in created_labels
            },
        )
        # remove the patterns from the matchers
        for label in created_labels:
            if label in self.phrase_matcher:
                self.phrase_matcher.remove(label)
            else:
                self.matcher.remove(label)

    def _require_patterns(self) -> None:
        """Raise a warning if this component has no patterns defined."""
        if len(self) == 0:
            warnings.warn(Warnings.W036.format(name=self.name))

    def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
        """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep.

        label (str): The value of label in a pattern entry
        RETURNS (tuple): ent_label, ent_id
        """
        if self.ent_id_sep in label:
            ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
        else:
            ent_label = label
            ent_id = None  # type: ignore
        return ent_label, ent_id

    def _create_label(self, label: Any, ent_id: Any) -> str:
        """Join Entity label with ent_id if the pattern has an `id` attribute.
        If ent_id is not a string, the label is returned as is.

        label (str): The label to set for ent.label_
        ent_id (str): The entity id
        RETURNS (str): The ent_label joined with configured `ent_id_sep`
        """
        if isinstance(ent_id, str):
            label = f"{label}{self.ent_id_sep}{ent_id}"
        return label

    def from_bytes(
        self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityRuler":
        """Load the entity ruler from a bytestring.

        patterns_bytes (bytes): The bytestring to load.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_bytes
        """
        cfg = srsly.msgpack_loads(patterns_bytes)
        self.clear()
        if isinstance(cfg, dict):
            self.add_patterns(cfg.get("patterns", cfg))
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
            self.phrase_matcher = PhraseMatcher(
                self.nlp.vocab,
                attr=self.phrase_matcher_attr,
            )
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
        else:
            self.add_patterns(cfg)
        return self

    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
        """Serialize the entity ruler patterns to a bytestring.

        RETURNS (bytes): The serialized patterns.

        DOCS: https://spacy.io/api/entityruler#to_bytes
        """
        serial = {
            "overwrite": self.overwrite,
            "ent_id_sep": self.ent_id_sep,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "patterns": self.patterns,
        }
        return srsly.msgpack_dumps(serial)

    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityRuler":
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (str / Path): The JSONL file to load.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        self.clear()
        depr_patterns_path = path.with_suffix(".jsonl")
        if path.suffix == ".jsonl":  # user provides a jsonl
            if path.is_file():
                patterns = srsly.read_jsonl(path)
                self.add_patterns(patterns)
            else:
                raise ValueError(Errors.E1023.format(path=path))
        elif depr_patterns_path.is_file():
            patterns = srsly.read_jsonl(depr_patterns_path)
            self.add_patterns(patterns)
        elif path.is_dir():  # path is a valid directory
            cfg = {}
            deserializers_patterns = {
                "patterns": lambda p: self.add_patterns(
                    srsly.read_jsonl(p.with_suffix(".jsonl"))
                )
            }
            deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
            from_disk(path, deserializers_cfg, {})
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

            self.phrase_matcher = PhraseMatcher(
                self.nlp.vocab, attr=self.phrase_matcher_attr
            )
            from_disk(path, deserializers_patterns, {})
        else:  # path is not a valid directory or file
            raise ValueError(Errors.E146.format(path=path))
        return self

    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (str / Path): The JSONL file to save.

        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        cfg = {
            "overwrite": self.overwrite,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "ent_id_sep": self.ent_id_sep,
        }
        serializers = {
            "patterns": lambda p: srsly.write_jsonl(
                p.with_suffix(".jsonl"), self.patterns
            ),
            "cfg": lambda p: srsly.write_json(p, cfg),
        }
        if path.suffix == ".jsonl":  # user wants to save only JSONL
            srsly.write_jsonl(path, self.patterns)
        else:
            to_disk(path, serializers, {})
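The whole file above is removed: the `entity_ruler` factory name is taken over by the SpanRuler-backed implementation in span_ruler.py further down, and the pattern format is unchanged. A minimal sketch of the unchanged user-facing API, assuming a blank English pipeline:

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")  # now constructed from the span_ruler module
    ruler.add_patterns([
        {"label": "ORG", "pattern": "Apple"},
        {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
    ])
    doc = nlp("Apple opened an office in San Francisco.")
    print([(ent.text, ent.label_) for ent in doc.ents])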
@@ -1,7 +1,9 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional, Union, Dict, Callable
from typing import Callable, Dict, Iterable, List, Optional, Union
import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from thinc.api import Model, Config
from thinc.legacy import LegacySequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d
from itertools import islice

from ..tokens.doc cimport Doc

@@ -13,7 +15,7 @@ from ..symbols import POS
from ..language import Language
from ..errors import Errors
from .pipe import deserialize_config
from .tagger import Tagger
from .tagger import ActivationsT, Tagger
from .. import util
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples

@@ -52,7 +54,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
    "morphologizer",
    assigns=["token.morph", "token.pos"],
    default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
    default_config={
        "model": DEFAULT_MORPH_MODEL,
        "overwrite": True,
        "extend": False,
        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
        "save_activations": False,
    },
    default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(

@@ -62,8 +70,10 @@ def make_morphologizer(
    overwrite: bool,
    extend: bool,
    scorer: Optional[Callable],
    save_activations: bool,
):
    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
                         save_activations=save_activations)


def morphologizer_score(examples, **kwargs):

@@ -95,6 +105,7 @@ class Morphologizer(Tagger):
        overwrite: bool = BACKWARD_OVERWRITE,
        extend: bool = BACKWARD_EXTEND,
        scorer: Optional[Callable] = morphologizer_score,
        save_activations: bool = False,
    ):
        """Initialize a morphologizer.

@@ -105,6 +116,7 @@ class Morphologizer(Tagger):
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_token_attr for the attributes "pos" and "morph" and
            Scorer.score_token_attr_per_feat for the attribute "morph".
        save_activations (bool): save model activations in Doc when annotating.

        DOCS: https://spacy.io/api/morphologizer#init
        """

@@ -124,11 +136,12 @@ class Morphologizer(Tagger):
        }
        self.cfg = dict(sorted(cfg.items()))
        self.scorer = scorer
        self.save_activations = save_activations

    @property
    def labels(self):
        """RETURNS (Tuple[str]): The labels currently added to the component."""
        return tuple(self.cfg["labels_morph"].keys())
        """RETURNS (Iterable[str]): The labels currently added to the component."""
        return self.cfg["labels_morph"].keys()

    @property
    def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:

@@ -151,7 +164,7 @@ class Morphologizer(Tagger):
        # normalize label
        norm_label = self.vocab.morphology.normalize_features(label)
        # extract separate POS and morph tags
        label_dict = Morphology.feats_to_dict(label)
        label_dict = Morphology.feats_to_dict(label, sort_values=False)
        pos = label_dict.get(self.POS_FEAT, "")
        if self.POS_FEAT in label_dict:
            label_dict.pop(self.POS_FEAT)

@@ -189,7 +202,7 @@ class Morphologizer(Tagger):
                continue
            morph = str(token.morph)
            # create and add the combined morph+POS label
            morph_dict = Morphology.feats_to_dict(morph)
            morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
            if pos:
                morph_dict[self.POS_FEAT] = pos
            norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]

@@ -206,7 +219,7 @@ class Morphologizer(Tagger):
            for i, token in enumerate(example.reference):
                pos = token.pos_
                morph = str(token.morph)
                morph_dict = Morphology.feats_to_dict(morph)
                morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                if pos:
                    morph_dict[self.POS_FEAT] = pos
                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]

@@ -217,40 +230,48 @@ class Morphologizer(Tagger):
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)

    def set_annotations(self, docs, batch_tag_ids):
    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
        activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.

        DOCS: https://spacy.io/api/morphologizer#set_annotations
        """
        batch_tag_ids = activations["label_ids"]
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef Vocab vocab = self.vocab
        cdef bint overwrite = self.cfg["overwrite"]
        cdef bint extend = self.cfg["extend"]
        labels = self.labels
        # We require random access for the upcoming ops, so we need
        # to allocate a compatible container out of the iterable.
        labels = tuple(self.labels)
        for i, doc in enumerate(docs):
            if self.save_activations:
                doc.activations[self.name] = {}
                for act_name, acts in activations.items():
                    doc.activations[self.name][act_name] = acts[i]
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, "get"):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                morph = labels[tag_id]
                morph = labels[int(tag_id)]
                # set morph
                if doc.c[j].morph == 0 or overwrite or extend:
                    if overwrite and extend:
                        # morphologizer morph overwrites any existing features
                        # while extending
                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)
                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False))
                        doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                    elif extend:
                        # existing features are preserved and any new features
                        # are added
                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)
                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False))
                        doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                    else:
                        # clobber

@@ -270,7 +291,7 @@ class Morphologizer(Tagger):
        DOCS: https://spacy.io/api/morphologizer#get_loss
        """
        validate_examples(examples, "Morphologizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
        truths = []
        for eg in examples:
            eg_truths = []

@@ -291,7 +312,7 @@ class Morphologizer(Tagger):
                    label = None
                # Otherwise, generate the combined label
                else:
                    label_dict = Morphology.feats_to_dict(morph)
                    label_dict = Morphology.feats_to_dict(morph, sort_values=False)
                    if pos:
                        label_dict[self.POS_FEAT] = pos
                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
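Since Morphologizer inherits `predict` from Tagger, the stored activations use the same "probabilities"/"label_ids" keys (see the tagger.pyx hunks below). A minimal sketch of enabling the new flag from the factory config, assuming a blank pipeline:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("morphologizer", config={"save_activations": True})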
@@ -1,221 +0,0 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional
import numpy
from thinc.api import CosineDistance, to_categorical, Model, Config
from thinc.api import set_dropout_rate

from ..tokens.doc cimport Doc

from .trainable_pipe import TrainablePipe
from .tagger import Tagger
from ..training import validate_examples
from ..language import Language
from ._parser_internals import nonproj
from ..attrs import POS, ID
from ..errors import Errors


default_model_config = """
[model]
@architectures = "spacy.MultiTask.v1"
maxout_pieces = 3
token_vector_width = 96

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
"""
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "nn_labeller",
    default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
)
def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
    return MultitaskObjective(nlp.vocab, model, name)


class MultitaskObjective(Tagger):
    """Experimental: Assist training of a parser or tagger, by training a
    side-objective.
    """

    def __init__(self, vocab, model, name="nn_labeller", *, target):
        self.vocab = vocab
        self.model = model
        self.name = name
        if target == "dep":
            self.make_label = self.make_dep
        elif target == "tag":
            self.make_label = self.make_tag
        elif target == "ent":
            self.make_label = self.make_ent
        elif target == "dep_tag_offset":
            self.make_label = self.make_dep_tag_offset
        elif target == "ent_tag":
            self.make_label = self.make_ent_tag
        elif target == "sent_start":
            self.make_label = self.make_sent_start
        elif hasattr(target, "__call__"):
            self.make_label = target
        else:
            raise ValueError(Errors.E016)
        cfg = {"labels": {}, "target": target}
        self.cfg = dict(cfg)

    @property
    def labels(self):
        return self.cfg.setdefault("labels", {})

    @labels.setter
    def labels(self, value):
        self.cfg["labels"] = value

    def set_annotations(self, docs, dep_ids):
        pass

    def initialize(self, get_examples, nlp=None, labels=None):
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
            raise ValueError(err)
        if labels is not None:
            self.labels = labels
        else:
            for example in get_examples():
                for token in example.y:
                    label = self.make_label(token)
                    if label is not None and label not in self.labels:
                        self.labels[label] = len(self.labels)
        self.model.initialize()  # TODO: fix initialization by defining X and Y

    def predict(self, docs):
        tokvecs = self.model.get_ref("tok2vec")(docs)
        scores = self.model.get_ref("softmax")(tokvecs)
        return tokvecs, scores

    def get_loss(self, examples, scores):
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
        docs = [eg.predicted for eg in examples]
        for i, eg in enumerate(examples):
            # Handles alignment for tokenization differences
            doc_annots = eg.get_aligned()  # TODO
            for j in range(len(eg.predicted)):
                tok_annots = {key: values[j] for key, values in tok_annots.items()}
                label = self.make_label(j, tok_annots)
                if label is None or label not in self.labels:
                    correct[idx] = guesses[idx]
                else:
                    correct[idx] = self.labels[label]
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype="i")
        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
        loss = (d_scores**2).sum()
        return float(loss), d_scores

    @staticmethod
    def make_dep(token):
        return token.dep_

    @staticmethod
    def make_tag(token):
        return token.tag_

    @staticmethod
    def make_ent(token):
        if token.ent_iob_ == "O":
            return "O"
        else:
            return token.ent_iob_ + "-" + token.ent_type_

    @staticmethod
    def make_dep_tag_offset(token):
        dep = token.dep_
        tag = token.tag_
        offset = token.head.i - token.i
        offset = min(offset, 2)
        offset = max(offset, -2)
        return f"{dep}-{tag}:{offset}"

    @staticmethod
    def make_ent_tag(token):
        if token.ent_iob_ == "O":
            ent = "O"
        else:
            ent = token.ent_iob_ + "-" + token.ent_type_
        tag = token.tag_
        return f"{tag}-{ent}"

    @staticmethod
    def make_sent_start(token):
        """A multi-task objective for representing sentence boundaries,
        using BILU scheme. (O is impossible)
        """
        if token.is_sent_start and token.is_sent_end:
            return "U-SENT"
        elif token.is_sent_start:
            return "B-SENT"
        else:
            return "I-SENT"


class ClozeMultitask(TrainablePipe):
    def __init__(self, vocab, model, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = cfg
        self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config

    def set_annotations(self, docs, dep_ids):
        pass

    def initialize(self, get_examples, nlp=None):
        self.model.initialize()  # TODO: fix initialization by defining X and Y
        X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
        self.model.output_layer.initialize(X)

    def predict(self, docs):
        tokvecs = self.model.get_ref("tok2vec")(docs)
        vectors = self.model.get_ref("output_layer")(tokvecs)
        return tokvecs, vectors

    def get_loss(self, examples, vectors, prediction):
        validate_examples(examples, "ClozeMultitask.get_loss")
        # The simplest way to implement this would be to vstack the
        # token.vector values, but that's a bit inefficient, especially on GPU.
        # Instead we fetch the index into the vectors table for each of our tokens,
        # and look them up all at once. This prevents data copying.
        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
        target = vectors[ids]
        gradient = self.distance.get_grad(prediction, target)
        loss = self.distance.get_loss(prediction, target)
        return float(loss), gradient

    def update(self, examples, *, drop=0., sgd=None, losses=None):
        pass

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.
        set_dropout_rate(self.model, drop)
        validate_examples(examples, "ClozeMultitask.rehearse")
        docs = [eg.predicted for eg in examples]
        predictions, bp_predictions = self.model.begin_update()
        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
        bp_predictions(d_predictions)
        if sgd is not None:
            self.finish_update(sgd)
        if losses is not None:
            losses[self.name] += loss
        return losses

    def add_label(self, label):
        raise NotImplementedError
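The lookup trick described in the removed `ClozeMultitask.get_loss` above, gathering rows from the vectors table by ID instead of stacking per-token vectors, is plain fancy indexing. A small self-contained demonstration with illustrative values:

    import numpy

    vectors = numpy.random.rand(10, 4)   # stand-in for the vocab's vectors table
    ids = numpy.array([2, 2, 7, 0])      # per-token rows to fetch; repeats are fine
    target = vectors[ids]                # one gather, no per-token copying
    assert target.shape == (4, 4)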
@@ -4,22 +4,22 @@ from typing import Optional, Iterable, Callable
from thinc.api import Model, Config

from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown
from .transition_parser import Parser
from ._parser_internals.ner import BiluoPushDown
from ..language import Language
from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples
from ..util import registry
from ..training import remove_bilu_prefix


default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"

@@ -44,8 +44,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
        "incorrect_spans_key": None,
        "scorer": {"@scorers": "spacy.ner_scorer.v1"},
    },
    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_ner(
    nlp: Language,

@@ -98,6 +102,7 @@ def make_ner(
        scorer=scorer,
    )


@Language.factory(
    "beam_ner",
    assigns=["doc.ents", "token.ent_iob", "token.ent_type"],

@@ -111,7 +116,12 @@ def make_ner(
        "incorrect_spans_key": None,
        "scorer": None,
    },
    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_beam_ner(
    nlp: Language,

@@ -185,11 +195,12 @@ def make_ner_scorer():
    return ner_score


cdef class EntityRecognizer(Parser):
class EntityRecognizer(Parser):
    """Pipeline component for named entity recognition.

    DOCS: https://spacy.io/api/entityrecognizer
    """

    TransitionSystem = BiluoPushDown

    def __init__(

@@ -207,15 +218,14 @@ cdef class EntityRecognizer(Parser):
        incorrect_spans_key=None,
        scorer=ner_score,
    ):
        """Create an EntityRecognizer.
        """
        """Create an EntityRecognizer."""
        super().__init__(
            vocab,
            model,
            name,
            moves,
            update_with_oracle_cut_size=update_with_oracle_cut_size,
            min_action_freq=1,  # not relevant for NER
            min_action_freq=1,   # not relevant for NER
            learn_tokens=False,  # not relevant for NER
            beam_width=beam_width,
            beam_density=beam_density,
@@ -19,13 +19,6 @@ cdef class Pipe:
    DOCS: https://spacy.io/api/pipe
    """

    @classmethod
    def __init_subclass__(cls, **kwargs):
        """Raise a warning if an inheriting class implements 'begin_training'
        (from v2) instead of the new 'initialize' method (from v3)"""
        if hasattr(cls, "begin_training"):
            warnings.warn(Warnings.W088.format(name=cls.__name__))

    def __call__(self, Doc doc) -> Doc:
        """Apply the pipe to one document. The document is modified in place,
        and returned. This usually happens under the hood when the nlp object

@@ -94,6 +87,10 @@ cdef class Pipe:
            return self.scorer(examples, **scorer_kwargs)
        return {}

    @property
    def is_distillable(self) -> bool:
        return False

    @property
    def is_trainable(self) -> bool:
        return False
@@ -1,13 +1,16 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional, Callable
from typing import Dict, Iterable, Optional, Callable, List, Union
from itertools import islice

import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
from thinc.api import Model, Config
from thinc.legacy import LegacySequenceCategoricalCrossentropy

from thinc.types import Floats2d, Ints1d

from ..tokens.doc cimport Doc

from .tagger import Tagger
from .tagger import ActivationsT, Tagger
from ..language import Language
from ..errors import Errors
from ..scorer import Scorer

@@ -38,11 +41,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
    "senter",
    assigns=["token.is_sent_start"],
    default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
    default_config={
        "model": DEFAULT_SENTER_MODEL,
        "overwrite": False,
        "scorer": {"@scorers": "spacy.senter_scorer.v1"},
        "save_activations": False,
    },
    default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def make_senter(nlp: Language,
                name: str,
                model: Model,
                overwrite: bool,
                scorer: Optional[Callable],
                save_activations: bool):
    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)


def senter_score(examples, **kwargs):

@@ -72,6 +85,7 @@ class SentenceRecognizer(Tagger):
        *,
        overwrite=BACKWARD_OVERWRITE,
        scorer=senter_score,
        save_activations: bool = False,
    ):
        """Initialize a sentence recognizer.

@@ -81,6 +95,7 @@ class SentenceRecognizer(Tagger):
            losses during training.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_spans for the attribute "sents".
        save_activations (bool): save model activations in Doc when annotating.

        DOCS: https://spacy.io/api/sentencerecognizer#init
        """

@@ -90,6 +105,7 @@ class SentenceRecognizer(Tagger):
        self._rehearsal_model = None
        self.cfg = {"overwrite": overwrite}
        self.scorer = scorer
        self.save_activations = save_activations

    @property
    def labels(self):

@@ -107,19 +123,24 @@ class SentenceRecognizer(Tagger):
    def label_data(self):
        return None

    def set_annotations(self, docs, batch_tag_ids):
    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
        activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.

        DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
        """
        batch_tag_ids = activations["label_ids"]
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef bint overwrite = self.cfg["overwrite"]
        for i, doc in enumerate(docs):
            if self.save_activations:
                doc.activations[self.name] = {}
                for act_name, acts in activations.items():
                    doc.activations[self.name][act_name] = acts[i]
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, "get"):
                doc_tag_ids = doc_tag_ids.get()

@@ -142,7 +163,7 @@ class SentenceRecognizer(Tagger):
        """
        validate_examples(examples, "SentenceRecognizer.get_loss")
        labels = self.labels
        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
        truths = []
        for eg in examples:
            eg_truth = []
@@ -11,7 +11,7 @@ from ..language import Language
from ..errors import Errors, Warnings
from ..util import ensure_path, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..scorer import Scorer
from ..scorer import Scorer, get_ner_prf
from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from .. import util

@@ -21,7 +21,7 @@ DEFAULT_SPANS_KEY = "ruler"


@Language.factory(
    "future_entity_ruler",
    "entity_ruler",
    assigns=["doc.ents"],
    default_config={
        "phrase_matcher_attr": None,

@@ -67,6 +67,15 @@ def make_entity_ruler(
    )


def entity_ruler_score(examples, **kwargs):
    return get_ner_prf(examples)


@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
    return entity_ruler_score


@Language.factory(
    "span_ruler",
    assigns=["doc.spans"],

@@ -124,7 +133,7 @@ def prioritize_new_ents_filter(
) -> List[Span]:
    """Merge entities and spans into one list without overlaps by allowing
    spans to overwrite any entities that they overlap with. Intended to
    replicate the overwrite_ents=True behavior from the EntityRuler.
    replicate the overwrite_ents=True behavior from the v3 EntityRuler.

    entities (Iterable[Span]): The entities, already filtered for overlaps.
    spans (Iterable[Span]): The spans to merge, may contain overlaps.

@@ -155,7 +164,7 @@ def prioritize_existing_ents_filter(
) -> List[Span]:
    """Merge entities and spans into one list without overlaps by prioritizing
    existing entities. Intended to replicate the overwrite_ents=False behavior
    from the EntityRuler.
    from the v3 EntityRuler.

    entities (Iterable[Span]): The entities, already filtered for overlaps.
    spans (Iterable[Span]): The spans to merge, may contain overlaps.
@@ -1,4 +1,5 @@
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
from typing import Union
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d

@@ -16,6 +17,9 @@ from ..errors import Errors
from ..util import registry


ActivationsT = Dict[str, Union[Floats2d, Ragged]]


spancat_default_config = """
[model]
@architectures = "spacy.SpanCategorizer.v1"

@@ -106,6 +110,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
        "model": DEFAULT_SPANCAT_MODEL,
        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
        "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
        "save_activations": False,
    },
    default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)

@@ -118,6 +123,7 @@ def make_spancat(
    scorer: Optional[Callable],
    threshold: float,
    max_positive: Optional[int],
    save_activations: bool,
) -> "SpanCategorizer":
    """Create a SpanCategorizer component. The span categorizer consists of two
    parts: a suggester function that proposes candidate spans, and a labeller

@@ -141,6 +147,7 @@ def make_spancat(
        0.5.
    max_positive (Optional[int]): Maximum number of labels to consider positive
        per span. Defaults to None, indicating no limit.
    save_activations (bool): save model activations in Doc when annotating.
    """
    return SpanCategorizer(
        nlp.vocab,

@@ -151,6 +158,7 @@ def make_spancat(
        max_positive=max_positive,
        name=name,
        scorer=scorer,
        save_activations=save_activations,
    )


@@ -189,6 +197,7 @@ class SpanCategorizer(TrainablePipe):
        threshold: float = 0.5,
        max_positive: Optional[int] = None,
        scorer: Optional[Callable] = spancat_score,
        save_activations: bool = False,
    ) -> None:
        """Initialize the span categorizer.
        vocab (Vocab): The shared vocabulary.

@@ -221,6 +230,7 @@ class SpanCategorizer(TrainablePipe):
        self.model = model
        self.name = name
        self.scorer = scorer
        self.save_activations = save_activations

    @property
    def key(self) -> str:

@@ -263,7 +273,7 @@ class SpanCategorizer(TrainablePipe):
        """
        return list(self.labels)

    def predict(self, docs: Iterable[Doc]):
    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
        """Apply the pipeline's model to a batch of docs, without modifying them.

        docs (Iterable[Doc]): The documents to predict.

@@ -276,7 +286,7 @@ class SpanCategorizer(TrainablePipe):
            scores = self.model.ops.alloc2f(0, 0)
        else:
            scores = self.model.predict((docs, indices))  # type: ignore
        return indices, scores
        return {"indices": indices, "scores": scores}

    def set_candidates(
        self, docs: Iterable[Doc], *, candidates_key: str = "candidates"

@@ -296,19 +306,29 @@ class SpanCategorizer(TrainablePipe):
            for index in candidates.dataXd:
                doc.spans[candidates_key].append(doc[index[0] : index[1]])

    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
        """Modify a batch of Doc objects, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        scores: The scores to set, produced by SpanCategorizer.predict.
        activations (ActivationsT): The activations, produced by SpanCategorizer.predict.

        DOCS: https://spacy.io/api/spancategorizer#set_annotations
        """
        labels = self.labels
        indices, scores = indices_scores

        indices = activations["indices"]
        assert isinstance(indices, Ragged)
        scores = cast(Floats2d, activations["scores"])

        offset = 0
        for i, doc in enumerate(docs):
            indices_i = indices[i].dataXd
            if self.save_activations:
                doc.activations[self.name] = {}
                doc.activations[self.name]["indices"] = indices_i
                doc.activations[self.name]["scores"] = scores[
                    offset : offset + indices.lengths[i]
                ]
            doc.spans[self.key] = self._make_span_group(
                doc, indices_i, scores[offset : offset + indices.lengths[i]], labels  # type: ignore[arg-type]
            )
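For spancat, the stored activations mirror the dict returned by `predict` above. A minimal sketch of inspecting them, assuming a pipeline that ships a trained spancat component; the pipeline name is hypothetical:

    import spacy

    nlp = spacy.load("my_spancat_pipeline")  # hypothetical
    nlp.get_pipe("spancat").save_activations = True
    doc = nlp("A short example sentence.")
    acts = doc.activations["spancat"]
    print(acts["indices"])  # token start/end offsets of the suggested spans
    print(acts["scores"])   # one row of label scores per suggested span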
@ -1,9 +1,11 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Callable, Optional
|
||||
from typing import Callable, Dict, Iterable, List, Optional, Union
|
||||
from typing import Tuple
|
||||
import numpy
|
||||
import srsly
|
||||
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model, set_dropout_rate, Config
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
import warnings
|
||||
from itertools import islice
|
||||
|
||||
|
@ -22,6 +24,9 @@ from ..training import validate_examples, validate_get_examples
|
|||
from ..util import registry
|
||||
from .. import util
|
||||
|
||||
|
||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
|
@ -45,7 +50,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"tagger",
|
||||
assigns=["token.tag"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!"},
|
||||
default_config={
|
||||
"model": DEFAULT_TAGGER_MODEL,
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
|
||||
"neg_prefix": "!",
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"tag_acc": 1.0},
|
||||
)
|
||||
def make_tagger(
|
||||
|
@ -55,6 +66,7 @@ def make_tagger(
|
|||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
neg_prefix: str,
|
||||
save_activations: bool,
|
||||
):
|
||||
"""Construct a part-of-speech tagger component.
|
||||
|
||||
|
@ -63,7 +75,8 @@ def make_tagger(
|
|||
in size, and be normalized as probabilities (all scores between 0 and 1,
|
||||
with the rows summing to 1).
|
||||
"""
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix)
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
|
||||
save_activations=save_activations)
|
||||
|
||||
|
||||
def tagger_score(examples, **kwargs):
|
||||
|
@ -89,6 +102,7 @@ class Tagger(TrainablePipe):
|
|||
overwrite=BACKWARD_OVERWRITE,
|
||||
scorer=tagger_score,
|
||||
neg_prefix="!",
|
||||
save_activations: bool = False,
|
||||
):
|
||||
"""Initialize a part-of-speech tagger.
|
||||
|
||||
|
@ -98,6 +112,7 @@ class Tagger(TrainablePipe):
|
|||
losses during training.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attribute "tag".
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#init
|
||||
"""
|
||||
|
@ -108,6 +123,7 @@ class Tagger(TrainablePipe):
|
|||
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -126,7 +142,7 @@ class Tagger(TrainablePipe):
|
|||
"""Data about the labels currently added to the component."""
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
def predict(self, docs):
|
||||
def predict(self, docs) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
|
@ -139,12 +155,12 @@ class Tagger(TrainablePipe):
|
|||
n_labels = len(self.labels)
|
||||
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
|
||||
assert len(guesses) == len(docs)
|
||||
return guesses
|
||||
return {"probabilities": guesses, "label_ids": guesses}
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == len(docs), (len(scores), len(docs))
|
||||
guesses = self._scores2guesses(scores)
|
||||
assert len(guesses) == len(docs)
|
||||
return guesses
|
||||
return {"probabilities": scores, "label_ids": guesses}
|
||||
|
||||
def _scores2guesses(self, scores):
|
||||
guesses = []
|
||||
|
@ -155,14 +171,15 @@ class Tagger(TrainablePipe):
|
|||
guesses.append(doc_guesses)
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
batch_tag_ids: The IDs to set, produced by Tagger.predict.
|
||||
activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#set_annotations
|
||||
"""
|
||||
batch_tag_ids = activations["label_ids"]
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
|
@ -170,6 +187,10 @@ class Tagger(TrainablePipe):
|
|||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
labels = self.labels
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
doc.activations[self.name][act_name] = acts[i]
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
|
@ -225,7 +246,6 @@ class Tagger(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger#rehearse
|
||||
"""
|
||||
loss_func = SequenceCategoricalCrossentropy()
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
@ -239,12 +259,32 @@ class Tagger(TrainablePipe):
|
|||
set_dropout_rate(self.model, drop)
|
||||
tag_scores, bp_tag_scores = self.model.begin_update(docs)
|
||||
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
|
||||
grads, loss = loss_func(tag_scores, tutor_tag_scores)
|
||||
loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
|
||||
bp_tag_scores(grads)
|
||||
self.finish_update(sgd)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
    def get_teacher_student_loss(
        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
        """Calculate the loss and its gradient for a batch of student
        scores, relative to teacher scores.

        teacher_scores: Scores representing the teacher model's predictions.
        student_scores: Scores representing the student model's predictions.

        RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.

        DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
        """
        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
        d_scores, loss = loss_func(student_scores, teacher_scores)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
"""Find the loss and gradient of loss for the batch of documents and
|
||||
their predicted scores.
|
||||
|
@ -256,7 +296,7 @@ class Tagger(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/tagger#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Tagger.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
|
||||
# Convert empty tag "" to missing value None so that both misaligned
|
||||
# tokens and tokens with missing annotation have the default missing
|
||||
# value None.
|
||||
|
|
|
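Note: the `save_activations` flag threaded through the tagger factory above stores the raw model outputs on the Doc when annotating. A minimal, hedged sketch of how that could be read back — the pipeline name is an assumption, and the "probabilities"/"label_ids" keys mirror the dict built in `set_annotations` above:

import spacy

# Hypothetical trained pipeline; any pipeline with a trainable tagger would do.
nlp = spacy.load("en_core_web_sm")
nlp.get_pipe("tagger").save_activations = True

doc = nlp("This is a test.")
# Doc.activations maps component name -> activation name -> array.
acts = doc.activations["tagger"]
print(acts["probabilities"].shape)  # (n_tokens, n_tags)
print(acts["label_ids"].shape)      # one predicted tag id per token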
@ -1,4 +1,4 @@
|
|||
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any
|
||||
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
|
||||
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
|
||||
from thinc.types import Floats2d
|
||||
import numpy
|
||||
|
@ -14,6 +14,9 @@ from ..util import registry
|
|||
from ..vocab import Vocab
ActivationsT = Dict[str, Floats2d]
|
||||
|
||||
|
||||
single_label_default_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
|
@ -75,6 +78,7 @@ subword_features = true
|
|||
"threshold": 0.0,
|
||||
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
|
@ -95,6 +99,7 @@ def make_textcat(
|
|||
model: Model[List[Doc], List[Floats2d]],
|
||||
threshold: float,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
) -> "TextCategorizer":
|
||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels are considered
|
||||
|
@ -104,8 +109,16 @@ def make_textcat(
|
|||
scores for each category.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
|
||||
return TextCategorizer(
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
|
@ -136,6 +149,7 @@ class TextCategorizer(TrainablePipe):
|
|||
*,
|
||||
threshold: float,
|
||||
scorer: Optional[Callable] = textcat_score,
|
||||
save_activations: bool = False,
|
||||
) -> None:
|
||||
"""Initialize a text categorizer for single-label classification.
|
||||
|
||||
|
@ -161,6 +175,7 @@ class TextCategorizer(TrainablePipe):
|
|||
}
|
||||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def support_missing_values(self):
|
||||
|
@ -185,7 +200,7 @@ class TextCategorizer(TrainablePipe):
|
|||
"""
|
||||
return self.labels # type: ignore[return-value]
|
||||
|
||||
def predict(self, docs: Iterable[Doc]):
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
|
@ -198,12 +213,12 @@ class TextCategorizer(TrainablePipe):
|
|||
tensors = [doc.tensor for doc in docs]
|
||||
xp = self.model.ops.xp
|
||||
scores = xp.zeros((len(list(docs)), len(self.labels)))
|
||||
return scores
|
||||
return {"probabilities": scores}
|
||||
scores = self.model.predict(docs)
|
||||
scores = self.model.ops.asarray(scores)
|
||||
return scores
|
||||
return {"probabilities": scores}
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], scores) -> None:
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
|
||||
"""Modify a batch of Doc objects, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
|
@ -211,9 +226,13 @@ class TextCategorizer(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/textcategorizer#set_annotations
|
||||
"""
|
||||
probs = activations["probabilities"]
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
doc.activations[self.name]["probabilities"] = probs[i]
|
||||
for j, label in enumerate(self.labels):
|
||||
doc.cats[label] = float(scores[i, j])
|
||||
doc.cats[label] = float(probs[i, j])
|
||||
|
||||
def update(
|
||||
self,
|
||||
|
|
|
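As with the tagger, the textcat factory now forwards `save_activations` to the component, and `set_annotations` stores the per-label probabilities next to the usual `doc.cats`. A hedged usage sketch — the pipeline name is made up, and the component is assumed to be already trained:

import spacy

# Hypothetical pipeline containing a trained "textcat" component.
nlp = spacy.load("my_textcat_pipeline")
nlp.get_pipe("textcat").save_activations = True

doc = nlp("quite good")
print(doc.cats)                                      # label -> score, as before
print(doc.activations["textcat"]["probabilities"])  # raw per-label probabilities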
@ -1,4 +1,4 @@
|
|||
from typing import Iterable, Optional, Dict, List, Callable, Any
|
||||
from typing import Iterable, Optional, Dict, List, Callable, Any, Union
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model, Config
|
||||
|
||||
|
@ -75,6 +75,7 @@ subword_features = true
|
|||
"threshold": 0.5,
|
||||
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
|
||||
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
|
@ -95,8 +96,9 @@ def make_multilabel_textcat(
|
|||
model: Model[List[Doc], List[Floats2d]],
|
||||
threshold: float,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
) -> "MultiLabel_TextCategorizer":
|
||||
"""Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
|
||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels are considered
|
||||
to be non-mutually exclusive, which means that there can be zero or more labels
|
||||
per doc).
|
||||
|
@ -107,7 +109,12 @@ def make_multilabel_textcat(
|
|||
scorer (Optional[Callable]): The scoring method.
|
||||
"""
|
||||
return MultiLabel_TextCategorizer(
|
||||
nlp.vocab, model, name, threshold=threshold, scorer=scorer
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@ -139,6 +146,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
*,
|
||||
threshold: float,
|
||||
scorer: Optional[Callable] = textcat_multilabel_score,
|
||||
save_activations: bool = False,
|
||||
) -> None:
|
||||
"""Initialize a text categorizer for multi-label classification.
|
||||
|
||||
|
@ -148,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
losses during training.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#init
|
||||
"""
|
||||
|
@ -158,6 +167,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
cfg = {"labels": [], "threshold": threshold}
|
||||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def support_missing_values(self):
|
||||
|
|
|
@ -6,3 +6,4 @@ cdef class TrainablePipe(Pipe):
    cdef public object model
    cdef public object cfg
    cdef public object scorer
    cdef bint _save_activations
|
||||
|
|
|
@ -2,11 +2,12 @@
|
|||
from typing import Iterable, Iterator, Optional, Dict, Tuple, Callable
|
||||
import srsly
|
||||
from thinc.api import set_dropout_rate, Model, Optimizer
|
||||
import warnings
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from ..training import validate_examples
|
||||
from ..errors import Errors
|
||||
from ..training import validate_examples, validate_distillation_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from .pipe import Pipe, deserialize_config
|
||||
from .. import util
|
||||
from ..vocab import Vocab
|
||||
|
@ -55,6 +56,53 @@ cdef class TrainablePipe(Pipe):
|
|||
except Exception as e:
|
||||
error_handler(self.name, self, [doc], e)
|
||||
|
||||
|
||||
    def distill(self,
                teacher_pipe: Optional["TrainablePipe"],
                examples: Iterable["Example"],
                *,
                drop: float=0.0,
                sgd: Optional[Optimizer]=None,
                losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
        """Train a pipe (the student) on the predictions of another pipe
        (the teacher). The student is typically trained on the probability
        distribution of the teacher, but details may differ per pipe.

        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
            from.
        examples (Iterable[Example]): Distillation examples. The reference
            and predicted docs must have the same number of tokens and the
            same orthography.
        drop (float): dropout rate.
        sgd (Optional[Optimizer]): An optimizer. Will be created via
            create_optimizer if not set.
        losses (Optional[Dict[str, float]]): Optional record of loss during
            distillation.
        RETURNS: The updated losses dictionary.

        DOCS: https://spacy.io/api/pipe#distill
        """
        # By default we require a teacher pipe, but there are downstream
        # implementations that don't require a pipe.
        if teacher_pipe is None:
            raise ValueError(Errors.E4002.format(name=self.name))
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_distillation_examples(examples, "TrainablePipe.distill")
        set_dropout_rate(self.model, drop)
        for node in teacher_pipe.model.walk():
            if node.name == "softmax":
                node.attrs["softmax_normalize"] = True
        teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
        student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
        bp_student_scores(d_scores)
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += loss
        return losses
|
||||
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
|
@ -168,6 +216,19 @@ cdef class TrainablePipe(Pipe):
|
|||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
|
||||
|
||||
def get_teacher_student_loss(self, teacher_scores, student_scores):
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
|
||||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
|
||||
|
||||
def create_optimizer(self) -> Optimizer:
|
||||
"""Create an optimizer for the pipeline component.
|
||||
|
||||
|
@ -204,6 +265,14 @@ cdef class TrainablePipe(Pipe):
|
|||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
|
||||
|
||||
    @property
    def is_distillable(self) -> bool:
        # Normally a pipe overrides `get_teacher_student_loss` to implement
        # distillation. In more exceptional cases, a pipe can provide its
        # own `distill` implementation. If neither of these methods is
        # overridden, the pipe does not implement distillation.
        return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
|
||||
|
||||
@property
|
||||
def is_trainable(self) -> bool:
|
||||
return True
|
||||
|
@ -342,3 +411,11 @@ cdef class TrainablePipe(Pipe):
|
|||
deserialize["model"] = load_model
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
    @property
    def save_activations(self):
        return self._save_activations

    @save_activations.setter
    def save_activations(self, save_activations: bool):
        self._save_activations = save_activations
|
||||
|
|
|
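For context, `distill` is driven from outside with a teacher pipe and a student pipe plus Example objects whose reference/predicted docs share the same tokens. A rough sketch of a single distillation step, assuming a trained teacher pipeline and an already-initialized student with the same tokenizer (names and the batching are illustrative, not the actual `spacy train` plumbing):

from spacy.training import Example

# teacher_nlp: an assumed trained pipeline; student_nlp: a fresh pipeline
# with an initialized "tagger" and the same tokenizer (both hypothetical).
teacher_tagger = teacher_nlp.get_pipe("tagger")
student_tagger = student_nlp.get_pipe("tagger")

texts = ["Distillation needs raw text only.", "No gold annotations required."]
examples = [
    # predicted doc comes from the student, reference doc from the teacher;
    # both must have the same tokens, as the docstring above requires.
    Example(student_nlp.make_doc(t), teacher_nlp.make_doc(t))
    for t in texts
]

optimizer = student_nlp.create_optimizer()
losses = {}
student_tagger.distill(teacher_tagger, examples, sgd=optimizer, losses=losses)
print(losses)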
@ -1,20 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from thinc.backends.cblas cimport CBlas
|
||||
|
||||
from ..vocab cimport Vocab
|
||||
from .trainable_pipe cimport TrainablePipe
|
||||
from ._parser_internals.transition_system cimport Transition, TransitionSystem
|
||||
from ._parser_internals._state cimport StateC
|
||||
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
|
||||
|
||||
|
||||
cdef class Parser(TrainablePipe):
|
||||
cdef public object _rehearsal_model
|
||||
cdef readonly TransitionSystem moves
|
||||
cdef public object _multitasks
|
||||
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil
|
|
@ -1,5 +1,6 @@
|
|||
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
|
||||
from __future__ import print_function
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
from cymem.cymem cimport Pool
|
||||
cimport numpy as np
|
||||
from itertools import islice
|
||||
|
@ -7,25 +8,30 @@ from libcpp.vector cimport vector
|
|||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free
|
||||
import random
|
||||
import contextlib
|
||||
|
||||
import srsly
|
||||
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
|
||||
from thinc.api import chain, softmax_activation, use_ops, get_array_module
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
import numpy.random
|
||||
import numpy
|
||||
import warnings
|
||||
|
||||
from ._parser_internals.stateclass cimport StateClass
|
||||
from ..ml.parser_model cimport alloc_activations, free_activations
|
||||
from ..ml.parser_model cimport predict_states, arg_max_if_valid
|
||||
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
||||
from ..ml.parser_model cimport get_c_weights, get_c_sizes
|
||||
from ..ml.tb_framework import TransitionModelInputs
|
||||
from ._parser_internals.stateclass cimport StateC, StateClass
|
||||
from ._parser_internals.search cimport Beam
|
||||
from ..tokens.doc cimport Doc
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from .trainable_pipe cimport TrainablePipe
|
||||
from ._parser_internals cimport _beam_utils
|
||||
from ._parser_internals import _beam_utils
|
||||
from ..vocab cimport Vocab
|
||||
from ._parser_internals.transition_system cimport Transition, TransitionSystem
|
||||
from ..typedefs cimport weight_t
|
||||
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..training import validate_distillation_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from .. import util
|
||||
|
||||
|
@ -33,7 +39,7 @@ from .. import util
|
|||
NUMPY_OPS = NumpyOps()
|
||||
|
||||
|
||||
cdef class Parser(TrainablePipe):
|
||||
class Parser(TrainablePipe):
|
||||
"""
|
||||
Base class of the DependencyParser and EntityRecognizer.
|
||||
"""
|
||||
|
@ -123,6 +129,7 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
self._rehearsal_model = None
|
||||
self.scorer = scorer
|
||||
self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
|
||||
|
||||
def __getnewargs_ex__(self):
|
||||
"""This allows pickling the Parser and its keyword-only init arguments"""
|
||||
|
@ -132,8 +139,9 @@ cdef class Parser(TrainablePipe):
|
|||
@property
|
||||
def move_names(self):
|
||||
names = []
|
||||
cdef TransitionSystem moves = self.moves
|
||||
for i in range(self.moves.n_moves):
|
||||
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
|
||||
name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
|
||||
# Explicitly removing the internal "U-" token used for blocking entities
|
||||
if name != "U-":
|
||||
names.append(name)
|
||||
|
@ -202,6 +210,118 @@ cdef class Parser(TrainablePipe):
|
|||
# Defined in subclasses, to avoid circular import
|
||||
raise NotImplementedError
|
||||
|
||||
def distill(self,
|
||||
teacher_pipe: Optional[TrainablePipe],
|
||||
examples: Iterable["Example"],
|
||||
*,
|
||||
drop: float=0.0,
|
||||
sgd: Optional[Optimizer]=None,
|
||||
losses: Optional[Dict[str, float]]=None):
|
||||
"""Train a pipe (the student) on the predictions of another pipe
|
||||
(the teacher). The student is trained on the transition probabilities
|
||||
of the teacher.
|
||||
|
||||
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
|
||||
from.
|
||||
examples (Iterable[Example]): Distillation examples. The reference
|
||||
and predicted docs must have the same number of tokens and the
|
||||
same orthography.
|
||||
drop (float): dropout rate.
|
||||
sgd (Optional[Optimizer]): An optimizer. Will be created via
|
||||
create_optimizer if not set.
|
||||
losses (Optional[Dict[str, float]]): Optional record of loss during
|
||||
distillation.
|
||||
RETURNS: The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser#distill
|
||||
"""
|
||||
if teacher_pipe is None:
|
||||
raise ValueError(Errors.E4002.format(name=self.name))
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
||||
validate_distillation_examples(examples, "TransitionParser.distill")
|
||||
|
||||
set_dropout_rate(self.model, drop)
|
||||
|
||||
student_docs = [eg.predicted for eg in examples]
|
||||
|
||||
max_moves = self.cfg["update_with_oracle_cut_size"]
|
||||
if max_moves >= 1:
|
||||
# Chop sequences into lengths of this many words, to make the
|
||||
# batch uniform length. Since we do not have a gold standard
|
||||
# sequence, we use the teacher's predictions as the gold
|
||||
# standard.
|
||||
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
|
||||
states = self._init_batch(teacher_pipe, student_docs, max_moves)
|
||||
else:
|
||||
states = self.moves.init_batch(student_docs)
|
||||
|
||||
# We distill as follows: 1. we first let the student predict transition
|
||||
# sequences (and the corresponding transition probabilities); (2) we
|
||||
# let the teacher follow the student's predicted transition sequences
|
||||
# to obtain the teacher's transition probabilities; (3) we compute the
|
||||
# gradients of the student's transition distributions relative to the
|
||||
# teacher's distributions.
|
||||
|
||||
student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
|
||||
max_moves=max_moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = states2actions(student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
|
||||
moves=self.moves, actions=actions)
|
||||
(_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
|
||||
|
||||
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
|
||||
backprop_scores((student_states, d_scores))
|
||||
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
|
||||
losses[self.name] += loss
|
||||
|
||||
return losses
|
||||
|
||||
|
||||
    def get_teacher_student_loss(
        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
        normalize: bool=False,
    ) -> Tuple[float, List[Floats2d]]:
        """Calculate the loss and its gradient for a batch of student
        scores, relative to teacher scores.

        teacher_scores: Scores representing the teacher model's predictions.
        student_scores: Scores representing the student model's predictions.

        RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.

        DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
        """

        # We can't easily hook up a softmax layer in the parsing model, since
        # the get_loss does additional masking. So, we could apply softmax
        # manually here and use Thinc's cross-entropy loss. But it's a bit
        # suboptimal, since we can have a lot of states that would result in
        # many kernel launches. Furthermore the parsing model's backprop expects
        # an XP array, so we'd have to concat the softmaxes anyway. So, like
        # the get_loss implementation, we'll compute the loss and gradients
        # ourselves.

        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
                                                axis=-1, inplace=True)
        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
                                                axis=-1, inplace=True)

        assert teacher_scores.shape == student_scores.shape

        d_scores = student_scores - teacher_scores
        if normalize:
            d_scores /= d_scores.shape[0]
        loss = (d_scores**2).sum() / d_scores.size

        return float(loss), d_scores
|
||||
|
||||
def init_multitask_objectives(self, get_examples, pipeline, **cfg):
|
||||
"""Setup models for secondary objectives, to benefit from multi-task
|
||||
learning. This method is intended to be overridden by subclasses.
|
||||
|
@ -222,9 +342,6 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
stream: The sequence of documents to process.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
|
||||
deals with a failing batch of documents. The default function just reraises
|
||||
the exception.
|
||||
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
|
@ -246,83 +363,29 @@ cdef class Parser(TrainablePipe):
|
|||
def predict(self, docs):
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
self._ensure_labels_are_added(docs)
|
||||
if not any(len(doc) for doc in docs):
|
||||
result = self.moves.init_batch(docs)
|
||||
return result
|
||||
if self.cfg["beam_width"] == 1:
|
||||
return self.greedy_parse(docs, drop=0.0)
|
||||
else:
|
||||
return self.beam_parse(
|
||||
docs,
|
||||
drop=0.0,
|
||||
beam_width=self.cfg["beam_width"],
|
||||
beam_density=self.cfg["beam_density"]
|
||||
)
|
||||
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
states_or_beams, _ = self.model.predict(inputs)
|
||||
return states_or_beams
|
||||
|
||||
def greedy_parse(self, docs, drop=0.):
|
||||
cdef vector[StateC*] states
|
||||
cdef StateClass state
|
||||
ops = self.model.ops
|
||||
cdef CBlas cblas
|
||||
if isinstance(ops, CupyOps):
|
||||
cblas = NUMPY_OPS.cblas()
|
||||
else:
|
||||
cblas = ops.cblas()
|
||||
self._resize()
|
||||
self._ensure_labels_are_added(docs)
|
||||
set_dropout_rate(self.model, drop)
|
||||
batch = self.moves.init_batch(docs)
|
||||
model = self.model.predict(docs)
|
||||
weights = get_c_weights(model)
|
||||
for state in batch:
|
||||
if not state.is_final():
|
||||
states.push_back(state.c)
|
||||
sizes = get_c_sizes(model, states.size())
|
||||
with nogil:
|
||||
self._parseC(cblas, &states[0], weights, sizes)
|
||||
model.clear_memory()
|
||||
del model
|
||||
return batch
|
||||
with _change_attrs(self.model, beam_width=1):
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
states, _ = self.model.predict(inputs)
|
||||
return states
|
||||
|
||||
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
|
||||
cdef Beam beam
|
||||
cdef Doc doc
|
||||
self._ensure_labels_are_added(docs)
|
||||
batch = _beam_utils.BeamBatch(
|
||||
self.moves,
|
||||
self.moves.init_batch(docs),
|
||||
None,
|
||||
beam_width,
|
||||
density=beam_density
|
||||
)
|
||||
model = self.model.predict(docs)
|
||||
while not batch.is_done:
|
||||
states = batch.get_unfinished_states()
|
||||
if not states:
|
||||
break
|
||||
scores = model.predict(states)
|
||||
batch.advance(scores)
|
||||
model.clear_memory()
|
||||
del model
|
||||
return list(batch)
|
||||
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil:
|
||||
cdef int i, j
|
||||
cdef vector[StateC*] unfinished
|
||||
cdef ActivationsC activations = alloc_activations(sizes)
|
||||
while sizes.states >= 1:
|
||||
predict_states(cblas, &activations, states, &weights, sizes)
|
||||
# Validate actions, argmax, take action.
|
||||
self.c_transition_batch(states,
|
||||
activations.scores, sizes.classes, sizes.states)
|
||||
for i in range(sizes.states):
|
||||
if not states[i].is_final():
|
||||
unfinished.push_back(states[i])
|
||||
for i in range(unfinished.size()):
|
||||
states[i] = unfinished[i]
|
||||
sizes.states = unfinished.size()
|
||||
unfinished.clear()
|
||||
free_activations(&activations)
|
||||
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
beams, _ = self.model.predict(inputs)
|
||||
return beams
|
||||
|
||||
def set_annotations(self, docs, states_or_beams):
|
||||
cdef StateClass state
|
||||
|
@ -334,35 +397,6 @@ cdef class Parser(TrainablePipe):
|
|||
for hook in self.postprocesses:
|
||||
hook(doc)
|
||||
|
||||
def transition_states(self, states, float[:, ::1] scores):
|
||||
cdef StateClass state
|
||||
cdef float* c_scores = &scores[0, 0]
|
||||
cdef vector[StateC*] c_states
|
||||
for state in states:
|
||||
c_states.push_back(state.c)
|
||||
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil:
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
with gil:
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
|
||||
cdef int i, guess
|
||||
cdef Transition action
|
||||
for i in range(batch_size):
|
||||
self.moves.set_valid(is_valid, states[i])
|
||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||
if guess == -1:
|
||||
# This shouldn't happen, but it's hard to raise an error here,
|
||||
# and we don't want to infinite loop. So, force to end state.
|
||||
states[i].force_final()
|
||||
else:
|
||||
action = self.moves.c[guess]
|
||||
action.do(states[i], action.label)
|
||||
free(is_valid)
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||
cdef StateClass state
|
||||
if losses is None:
|
||||
|
@ -374,67 +408,99 @@ cdef class Parser(TrainablePipe):
|
|||
)
|
||||
for multitask in self._multitasks:
|
||||
multitask.update(examples, drop=drop, sgd=sgd)
|
||||
# We need to take care to act on the whole batch, because we might be
|
||||
# getting vectors via a listener.
|
||||
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
|
||||
if n_examples == 0:
|
||||
return losses
|
||||
set_dropout_rate(self.model, drop)
|
||||
# The probability we use beam update, instead of falling back to
|
||||
# a greedy update
|
||||
beam_update_prob = self.cfg["beam_update_prob"]
|
||||
if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
|
||||
return self.update_beam(
|
||||
examples,
|
||||
beam_width=self.cfg["beam_width"],
|
||||
sgd=sgd,
|
||||
losses=losses,
|
||||
beam_density=self.cfg["beam_density"]
|
||||
)
|
||||
docs = [eg.x for eg in examples if len(eg.x)]
|
||||
|
||||
max_moves = self.cfg["update_with_oracle_cut_size"]
|
||||
if max_moves >= 1:
|
||||
# Chop sequences into lengths of this many words, to make the
|
||||
# batch uniform length.
|
||||
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
|
||||
states, golds, _ = self._init_gold_batch(
|
||||
max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
|
||||
init_states, gold_states, _ = self._init_gold_batch(
|
||||
examples,
|
||||
max_length=max_moves
|
||||
)
|
||||
else:
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
if not states:
|
||||
return losses
|
||||
model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
|
||||
|
||||
all_states = list(states)
|
||||
states_golds = list(zip(states, golds))
|
||||
n_moves = 0
|
||||
while states_golds:
|
||||
states, golds = zip(*states_golds)
|
||||
scores, backprop = model.begin_update(states)
|
||||
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
||||
# Note that the gradient isn't normalized by the batch size
|
||||
# here, because our "samples" are really the states...But we
|
||||
# can't normalize by the number of states either, as then we'd
|
||||
# be getting smaller gradients for states in long sequences.
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, scores)
|
||||
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
|
||||
if max_moves >= 1 and n_moves >= max_moves:
|
||||
break
|
||||
n_moves += 1
|
||||
init_states, gold_states, _ = self.moves.init_gold_batch(examples)
|
||||
|
||||
backprop_tok2vec(golds)
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves,
|
||||
max_moves=max_moves, states=[state.copy() for state in init_states])
|
||||
(pred_states, scores), backprop_scores = self.model.begin_update(inputs)
|
||||
if sum(s.shape[0] for s in scores) == 0:
|
||||
return losses
|
||||
d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
|
||||
examples, max_moves)
|
||||
backprop_scores((pred_states, d_scores))
|
||||
if sgd not in (None, False):
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += float((d_scores**2).sum())
|
||||
# Ugh, this is annoying. If we're working on GPU, we want to free the
|
||||
# memory ASAP. It seems that Python doesn't necessarily get around to
|
||||
# removing these in time if we don't explicitly delete? It's confusing.
|
||||
del backprop
|
||||
del backprop_tok2vec
|
||||
model.clear_memory()
|
||||
del model
|
||||
del backprop_scores
|
||||
return losses
|
||||
|
||||
def get_loss(self, states_scores, examples, max_moves):
|
||||
gold_states, init_states, pred_states, scores = states_scores
|
||||
scores = self.model.ops.xp.vstack(scores)
|
||||
costs = self._get_costs_from_histories(
|
||||
examples,
|
||||
gold_states,
|
||||
init_states,
|
||||
[list(state.history) for state in pred_states],
|
||||
max_moves
|
||||
)
|
||||
xp = get_array_module(scores)
|
||||
best_costs = costs.min(axis=1, keepdims=True)
|
||||
gscores = scores.copy()
|
||||
min_score = scores.min() - 1000
|
||||
assert costs.shape == scores.shape, (costs.shape, scores.shape)
|
||||
gscores[costs > best_costs] = min_score
|
||||
max_ = scores.max(axis=1, keepdims=True)
|
||||
gmax = gscores.max(axis=1, keepdims=True)
|
||||
exp_scores = xp.exp(scores - max_)
|
||||
exp_gscores = xp.exp(gscores - gmax)
|
||||
Z = exp_scores.sum(axis=1, keepdims=True)
|
||||
gZ = exp_gscores.sum(axis=1, keepdims=True)
|
||||
d_scores = exp_scores / Z
|
||||
d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
|
||||
return d_scores
|
||||
|
||||
def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
|
||||
cdef TransitionSystem moves = self.moves
|
||||
cdef StateClass state
|
||||
cdef int clas
|
||||
cdef int nF = self.model.get_dim("nF")
|
||||
cdef int nO = moves.n_moves
|
||||
cdef int nS = sum([len(history) for history in histories])
|
||||
cdef Pool mem = Pool()
|
||||
cdef np.ndarray costs_i
|
||||
is_valid = <int*>mem.alloc(nO, sizeof(int))
|
||||
batch = list(zip(init_states, histories, gold_states))
|
||||
n_moves = 0
|
||||
output = []
|
||||
while batch:
|
||||
costs = numpy.zeros((len(batch), nO), dtype="f")
|
||||
for i, (state, history, gold) in enumerate(batch):
|
||||
costs_i = costs[i]
|
||||
clas = history.pop(0)
|
||||
moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
|
||||
action = moves.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(clas)
|
||||
output.append(costs)
|
||||
batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
|
||||
if n_moves >= max_moves >= 1:
|
||||
break
|
||||
n_moves += 1
|
||||
|
||||
return self.model.ops.xp.vstack(output)
|
||||
|
||||
def rehearse(self, examples, sgd=None, losses=None, **cfg):
|
||||
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
|
||||
if losses is None:
|
||||
|
@ -444,10 +510,9 @@ cdef class Parser(TrainablePipe):
|
|||
multitask.rehearse(examples, losses=losses, sgd=sgd)
|
||||
if self._rehearsal_model is None:
|
||||
return None
|
||||
losses.setdefault(self.name, 0.)
|
||||
losses.setdefault(self.name, 0.0)
|
||||
validate_examples(examples, "Parser.rehearse")
|
||||
docs = [eg.predicted for eg in examples]
|
||||
states = self.moves.init_batch(docs)
|
||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||
# if labels are missing. We therefore have to check whether we need to
|
||||
# expand our model output.
|
||||
|
@ -455,85 +520,33 @@ cdef class Parser(TrainablePipe):
|
|||
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||
set_dropout_rate(self._rehearsal_model, 0.0)
|
||||
set_dropout_rate(self.model, 0.0)
|
||||
tutor, _ = self._rehearsal_model.begin_update(docs)
|
||||
model, backprop_tok2vec = self.model.begin_update(docs)
|
||||
n_scores = 0.
|
||||
loss = 0.
|
||||
while states:
|
||||
targets, _ = tutor.begin_update(states)
|
||||
guesses, backprop = model.begin_update(states)
|
||||
d_scores = (guesses - targets) / targets.shape[0]
|
||||
# If all weights for an output are 0 in the original model, don't
|
||||
# supervise that output. This allows us to add classes.
|
||||
loss += (d_scores**2).sum()
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, guesses)
|
||||
states = [state for state in states if not state.is_final()]
|
||||
n_scores += d_scores.size
|
||||
# Do the backprop
|
||||
backprop_tok2vec(docs)
|
||||
student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = states2actions(student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
|
||||
_, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
|
||||
|
||||
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
|
||||
|
||||
teacher_scores = self.model.ops.xp.vstack(teacher_scores)
|
||||
student_scores = self.model.ops.xp.vstack(student_scores)
|
||||
assert teacher_scores.shape == student_scores.shape
|
||||
|
||||
d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
|
||||
# If all weights for an output are 0 in the original model, don't
|
||||
# supervise that output. This allows us to add classes.
|
||||
loss = (d_scores**2).sum() / d_scores.size
|
||||
backprop_scores((student_states, d_scores))
|
||||
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss / n_scores
|
||||
del backprop
|
||||
del backprop_tok2vec
|
||||
model.clear_memory()
|
||||
tutor.clear_memory()
|
||||
del model
|
||||
del tutor
|
||||
losses[self.name] += loss
|
||||
|
||||
return losses
|
||||
|
||||
def update_beam(self, examples, *, beam_width,
|
||||
drop=0., sgd=None, losses=None, beam_density=0.0):
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
if not states:
|
||||
return losses
|
||||
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||
model, backprop_tok2vec = self.model.begin_update(
|
||||
[eg.predicted for eg in examples])
|
||||
loss = _beam_utils.update_beam(
|
||||
self.moves,
|
||||
states,
|
||||
golds,
|
||||
model,
|
||||
beam_width,
|
||||
beam_density=beam_density,
|
||||
)
|
||||
losses[self.name] += loss
|
||||
backprop_tok2vec(golds)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
|
||||
def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
|
||||
cdef StateClass state
|
||||
cdef Pool mem = Pool()
|
||||
cdef int i
|
||||
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
|
||||
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
|
||||
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
|
||||
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
|
||||
dtype='f', order='C')
|
||||
c_d_scores = <float*>d_scores.data
|
||||
unseen_classes = self.model.attrs["unseen_classes"]
|
||||
for i, (state, gold) in enumerate(zip(states, golds)):
|
||||
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
|
||||
memset(costs, 0, self.moves.n_moves * sizeof(float))
|
||||
self.moves.set_costs(is_valid, costs, state.c, gold)
|
||||
for j in range(self.moves.n_moves):
|
||||
if costs[j] <= 0.0 and j in unseen_classes:
|
||||
unseen_classes.remove(j)
|
||||
cpu_log_loss(c_d_scores,
|
||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||
c_d_scores += d_scores.shape[1]
|
||||
# Note that we don't normalize this. See comment in update() for why.
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.)
|
||||
losses[self.name] += (d_scores**2).sum()
|
||||
return d_scores
|
||||
raise NotImplementedError
|
||||
|
||||
def set_output(self, nO):
|
||||
self.model.attrs["resize_output"](self.model, nO)
|
||||
|
@ -572,7 +585,7 @@ cdef class Parser(TrainablePipe):
|
|||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.predicted)
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(doc_sample)
|
||||
self.model.initialize((doc_sample, self.moves))
|
||||
if nlp is not None:
|
||||
self.init_multitask_objectives(get_examples, nlp.pipeline)
|
||||
|
||||
|
@ -629,28 +642,63 @@ cdef class Parser(TrainablePipe):
|
|||
raise ValueError(Errors.E149) from None
|
||||
return self
|
||||
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
def _init_batch(self, teacher_step_model, docs, max_length):
|
||||
"""Make a square batch of length equal to the shortest transition
|
||||
sequence or a cap. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
where N is the shortest doc. We'll make two states, one representing
|
||||
long_doc[:N], and another representing long_doc[N:]."""
|
||||
long_doc[:N], and another representing long_doc[N:]. In contrast to
|
||||
_init_gold_batch, this version uses a teacher model to generate the
|
||||
cut sequences."""
|
||||
cdef:
|
||||
StateClass start_state
|
||||
StateClass state
|
||||
Transition action
|
||||
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
||||
all_states = self.moves.init_batch(docs)
|
||||
states = []
|
||||
to_cut = []
|
||||
for state, doc in zip(all_states, docs):
|
||||
if not state.is_final():
|
||||
if len(doc) < max_length:
|
||||
states.append(state)
|
||||
else:
|
||||
to_cut.append(state)
|
||||
while to_cut:
|
||||
states.extend(state.copy() for state in to_cut)
|
||||
# Move states forward max_length actions.
|
||||
length = 0
|
||||
while to_cut and length < max_length:
|
||||
teacher_scores = teacher_step_model.predict(to_cut)
|
||||
self.transition_states(to_cut, teacher_scores)
|
||||
# States that are completed do not need further cutting.
|
||||
to_cut = [state for state in to_cut if not state.is_final()]
|
||||
length += 1
|
||||
return states
|
||||
|
||||
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
sequence or a cap. A long doc will get multiple states. Let's say we
|
||||
have a doc of length 2*N, where N is the shortest doc. We'll make
|
||||
two states, one representing long_doc[:N], and another representing
|
||||
long_doc[N:]."""
|
||||
cdef:
|
||||
StateClass start_state
|
||||
StateClass state
|
||||
Transition action
|
||||
TransitionSystem moves = self.moves
|
||||
all_states = moves.init_batch([eg.predicted for eg in examples])
|
||||
states = []
|
||||
golds = []
|
||||
to_cut = []
|
||||
for state, eg in zip(all_states, examples):
|
||||
if self.moves.has_gold(eg) and not state.is_final():
|
||||
gold = self.moves.init_gold(state, eg)
|
||||
if moves.has_gold(eg) and not state.is_final():
|
||||
gold = moves.init_gold(state, eg)
|
||||
if len(eg.x) < max_length:
|
||||
states.append(state)
|
||||
golds.append(gold)
|
||||
else:
|
||||
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
||||
oracle_actions = moves.get_oracle_sequence_from_state(
|
||||
state.copy(), gold)
|
||||
to_cut.append((eg, state, gold, oracle_actions))
|
||||
if not to_cut:
|
||||
|
@ -660,13 +708,52 @@ cdef class Parser(TrainablePipe):
|
|||
for i in range(0, len(oracle_actions), max_length):
|
||||
start_state = state.copy()
|
||||
for clas in oracle_actions[i:i+max_length]:
|
||||
action = self.moves.c[clas]
|
||||
action = moves.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
if state.is_final():
|
||||
break
|
||||
if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
|
||||
if moves.has_gold(eg, start_state.B(0), state.B(0)):
|
||||
states.append(start_state)
|
||||
golds.append(gold)
|
||||
if state.is_final():
|
||||
break
|
||||
return states, golds, max_length
|
||||
|
||||
|
||||
@contextlib.contextmanager
def _change_attrs(model, **kwargs):
    """Temporarily modify a thinc model's attributes."""
    unset = object()
    old_attrs = {}
    for key, value in kwargs.items():
        old_attrs[key] = model.attrs.get(key, unset)
        model.attrs[key] = value
    yield model
    for key, value in old_attrs.items():
        if value is unset:
            model.attrs.pop(key)
        else:
            model.attrs[key] = value


def states2actions(states: List[StateClass]) -> List[Ints1d]:
    cdef int step
    cdef StateClass state
    cdef StateC* c_state
    actions = []
    while True:
        step = len(actions)

        step_actions = []
        for state in states:
            c_state = state.c
            if step < c_state.history.size():
                step_actions.append(c_state.history[step])

        # We are done if we have exhausted all histories.
        if len(step_actions) == 0:
            break

        actions.append(numpy.array(step_actions, dtype="i"))

    return actions
|
||||
|
|
|
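For reference, the gradient computed in the parser's get_teacher_student_loss above is just the difference of the two softmaxed score matrices, with a mean of squares reported as the loss. A small self-contained numpy sketch of the same arithmetic (the shapes are illustrative, not tied to any real model):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

rng = np.random.default_rng(0)
teacher_scores = rng.normal(size=(6, 5))   # 6 parser states, 5 transitions
student_scores = rng.normal(size=(6, 5))

t = softmax(teacher_scores)
s = softmax(student_scores)
d_scores = s - t                           # gradient w.r.t. the student's softmax output
loss = (d_scores ** 2).sum() / d_scores.size
print(loss, d_scores.shape)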
@ -144,7 +144,7 @@ def validate_init_settings(

def validate_token_pattern(obj: list) -> List[str]:
    # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
    get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
    get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k
    if isinstance(obj, list):
        converted = []
        for pattern in obj:
|
||||
|
|
|
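The one-line change above swaps a length check for a membership test, so the coercion keeps working if NAMES is a mapping from attribute IDs to names rather than a dense list. A hedged toy illustration of the behaviour (this NAMES dict is entirely made up; spaCy's real attribute IDs are internal constants):

# Toy stand-in for an ID -> name mapping such as spacy.attrs.NAMES.
NAMES = {401: "ORTH", 403: "LEMMA"}

get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k

print(get_key(401))     # "ORTH"
print(get_key(999))     # 999 -> unknown int keys pass through unchanged
print(get_key("ORTH"))  # "ORTH" -> non-int keys pass through unchanged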
@ -1,4 +1,4 @@
|
|||
from libc.stdint cimport int64_t
|
||||
from libc.stdint cimport int64_t, uint32_t
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.set cimport set
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -7,13 +7,6 @@ from murmurhash.mrmr cimport hash64
|
|||
|
||||
from .typedefs cimport attr_t, hash_t
|
||||
|
||||
|
||||
cpdef hash_t hash_string(str string) except 0
|
||||
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
|
||||
|
||||
cdef str decode_Utf8Str(const Utf8Str* string)
|
||||
|
||||
|
||||
ctypedef union Utf8Str:
|
||||
unsigned char[8] s
|
||||
unsigned char* p
|
||||
|
@ -21,9 +14,13 @@ ctypedef union Utf8Str:
|
|||
|
||||
cdef class StringStore:
|
||||
cdef Pool mem
|
||||
cdef vector[hash_t] _keys
|
||||
cdef PreshMap _map
|
||||
|
||||
cdef vector[hash_t] keys
|
||||
cdef public PreshMap _map
|
||||
cdef hash_t _intern_str(self, str string)
|
||||
cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *
|
||||
cdef str _decode_str_repr(self, const Utf8Str* string)
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
|
||||
|
||||
cpdef hash_t hash_string(object string) except -1
|
||||
cpdef hash_t get_string_id(object string_or_hash) except -1
|
||||
|
|
|
@ -1,21 +1,20 @@
|
|||
from typing import Optional, Iterable, Iterator, Union, Any, overload
|
||||
from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
|
||||
from pathlib import Path
|
||||
|
||||
def get_string_id(key: Union[str, int]) -> int: ...
|
||||
|
||||
class StringStore:
|
||||
def __init__(
|
||||
self, strings: Optional[Iterable[str]] = ..., freeze: bool = ...
|
||||
) -> None: ...
|
||||
def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
|
||||
@overload
|
||||
def __getitem__(self, string_or_id: Union[bytes, str]) -> int: ...
|
||||
def __getitem__(self, string_or_hash: str) -> int: ...
|
||||
@overload
|
||||
def __getitem__(self, string_or_id: int) -> str: ...
|
||||
def as_int(self, key: Union[bytes, str, int]) -> int: ...
|
||||
def as_string(self, key: Union[bytes, str, int]) -> str: ...
|
||||
def __getitem__(self, string_or_hash: int) -> str: ...
|
||||
def as_int(self, string_or_hash: Union[str, int]) -> int: ...
|
||||
def as_string(self, string_or_hash: Union[str, int]) -> str: ...
|
||||
def add(self, string: str) -> int: ...
|
||||
def items(self) -> List[Tuple[str, int]]: ...
|
||||
def keys(self) -> List[str]: ...
|
||||
def values(self) -> List[int]: ...
|
||||
def __len__(self) -> int: ...
|
||||
def __contains__(self, string: str) -> bool: ...
|
||||
def __contains__(self, string_or_hash: Union[str, int]) -> bool: ...
|
||||
def __iter__(self) -> Iterator[str]: ...
|
||||
def __reduce__(self) -> Any: ...
|
||||
def to_disk(self, path: Union[str, Path]) -> None: ...
|
||||
|
@ -23,3 +22,5 @@ class StringStore:
|
|||
def to_bytes(self, **kwargs: Any) -> bytes: ...
|
||||
def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ...
|
||||
def _reset_and_load(self, strings: Iterable[str]) -> None: ...
|
||||
|
||||
def get_string_id(string_or_hash: Union[str, int]) -> int: ...
|
||||
|
|
|
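The regenerated stub above reflects the narrowed v4 StringStore surface: bytes keys are gone, __getitem__ on a string adds it and returns its hash, and items/keys/values are exposed. A short usage sketch consistent with those signatures (the example strings are arbitrary):

from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
apple_hash = stringstore.add("apple")            # re-adding returns the existing hash
assert stringstore[apple_hash] == "apple"        # int -> str lookup
assert stringstore["apple"] == apple_hash        # str -> int (adds the string if missing)
assert "orange" in stringstore
assert stringstore.as_string(apple_hash) == "apple"
assert stringstore.as_int("orange") == stringstore["orange"]
print(stringstore.keys())                        # ["apple", "orange"], insertion order
print(len(stringstore))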
@ -1,9 +1,10 @@
|
|||
# cython: infer_types=True
|
||||
from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
|
||||
cimport cython
|
||||
from libc.string cimport memcpy
|
||||
from libcpp.set cimport set
|
||||
from libc.stdint cimport uint32_t
|
||||
from murmurhash.mrmr cimport hash64, hash32
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
import srsly
|
||||
|
||||
|
@ -14,105 +15,13 @@ from .symbols import NAMES as SYMBOLS_BY_INT
|
|||
from .errors import Errors
|
||||
from . import util
|
||||
|
||||
# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
|
||||
cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
|
||||
try:
|
||||
out_hash[0] = key
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def get_string_id(key):
|
||||
"""Get a string ID, handling the reserved symbols correctly. If the key is
|
||||
already an ID, return it.
|
||||
|
||||
This function optimises for convenience over performance, so shouldn't be
|
||||
used in tight loops.
|
||||
"""
|
||||
cdef hash_t str_hash
|
||||
if isinstance(key, str):
|
||||
if len(key) == 0:
|
||||
return 0
|
||||
|
||||
symbol = SYMBOLS_BY_STR.get(key, None)
|
||||
if symbol is not None:
|
||||
return symbol
|
||||
else:
|
||||
chars = key.encode("utf8")
|
||||
return hash_utf8(chars, len(chars))
|
||||
elif _try_coerce_to_hash(key, &str_hash):
|
||||
# Coerce the integral key to the expected primitive hash type.
|
||||
# This ensures that custom/overloaded "primitive" data types
|
||||
# such as those implemented by numpy are not inadvertently used
|
||||
# downsteam (as these are internally implemented as custom PyObjects
|
||||
# whose comparison operators can incur a significant overhead).
|
||||
return str_hash
|
||||
else:
|
||||
# TODO: Raise an error instead
|
||||
return key
|
||||
|
||||
|
||||
cpdef hash_t hash_string(str string) except 0:
|
||||
chars = string.encode("utf8")
|
||||
return hash_utf8(chars, len(chars))
|
||||
|
||||
|
||||
cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
|
||||
return hash64(utf8_string, length, 1)
|
||||
|
||||
|
||||
cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
|
||||
return hash32(utf8_string, length, 1)
|
||||
|
||||
|
||||
cdef str decode_Utf8Str(const Utf8Str* string):
|
||||
cdef int i, length
|
||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||
return string.s[1:string.s[0]+1].decode("utf8")
|
||||
elif string.p[0] < 255:
|
||||
return string.p[1:string.p[0]+1].decode("utf8")
|
||||
else:
|
||||
i = 0
|
||||
length = 0
|
||||
while string.p[i] == 255:
|
||||
i += 1
|
||||
length += 255
|
||||
length += string.p[i]
|
||||
i += 1
|
||||
return string.p[i:length + i].decode("utf8")
|
||||
|
||||
|
||||
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
|
||||
cdef int n_length_bytes
|
||||
cdef int i
|
||||
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
|
||||
cdef uint32_t ulength = length
|
||||
if length < sizeof(string.s):
|
||||
string.s[0] = <unsigned char>length
|
||||
memcpy(&string.s[1], chars, length)
|
||||
return string
|
||||
elif length < 255:
|
||||
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
|
||||
string.p[0] = length
|
||||
memcpy(&string.p[1], chars, length)
|
||||
return string
|
||||
else:
|
||||
i = 0
|
||||
n_length_bytes = (length // 255) + 1
|
||||
string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
|
||||
for i in range(n_length_bytes-1):
|
||||
string.p[i] = 255
|
||||
string.p[n_length_bytes-1] = length % 255
|
||||
memcpy(&string.p[n_length_bytes], chars, length)
|
||||
return string
|
||||
|
||||
|
||||
cdef class StringStore:
|
||||
"""Look up strings by 64-bit hashes.
|
||||
"""Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
|
||||
|
||||
DOCS: https://spacy.io/api/stringstore
|
||||
"""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
def __init__(self, strings: Optional[Iterable[str]] = None):
|
||||
"""Create the StringStore.
|
||||
|
||||
strings (iterable): A sequence of unicode strings to add to the store.
|
||||
|
@ -123,127 +32,126 @@ cdef class StringStore:
|
|||
for string in strings:
|
||||
        self.add(string)

-    def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given hash, or vice versa.
-
-        string_or_id (bytes, str or uint64): The value to encode.
-        Returns (str / uint64): The value to be retrieved.
-        """
-        cdef hash_t str_hash
-        cdef Utf8Str* utf8str = NULL
-
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return 0
-
-            # Return early if the string is found in the symbols LUT.
-            symbol = SYMBOLS_BY_STR.get(string_or_id, None)
-            if symbol is not None:
-                return symbol
-            else:
-                return hash_string(string_or_id)
-        elif isinstance(string_or_id, bytes):
-            return hash_utf8(string_or_id, len(string_or_id))
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            if str_hash == 0:
-                return ""
-            elif str_hash < len(SYMBOLS_BY_INT):
-                return SYMBOLS_BY_INT[str_hash]
-            else:
-                utf8str = <Utf8Str*>self._map.get(str_hash)
-        else:
-            # TODO: Raise an error instead
-            utf8str = <Utf8Str*>self._map.get(string_or_id)
-
-        if utf8str is NULL:
-            raise KeyError(Errors.E018.format(hash_value=string_or_id))
-        else:
-            return decode_Utf8Str(utf8str)
-
-    def as_int(self, key):
-        """If key is an int, return it; otherwise, get the int value."""
-        if not isinstance(key, str):
-            return key
-        else:
-            return self[key]
-
-    def as_string(self, key):
-        """If key is a string, return it; otherwise, get the string value."""
-        if isinstance(key, str):
-            return key
-        else:
-            return self[key]
-
-    def add(self, string):
-        """Add a string to the StringStore.
-
-        string (str): The string to add.
-        RETURNS (uint64): The string's hash value.
-        """
-        cdef hash_t str_hash
-        if isinstance(string, str):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-
-            string = string.encode("utf8")
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        elif isinstance(string, bytes):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        else:
-            raise TypeError(Errors.E017.format(value_type=type(string)))
-        return str_hash
-
-    def __len__(self):
-        """The number of strings in the store.
-
-        RETURNS (int): The number of strings in the store.
-        """
-        return self.keys.size()
-
-    def __contains__(self, string_or_id not None):
-        """Check whether a string or ID is in the store.
-
-        string_or_id (str or int): The string to check.
-        RETURNS (bool): Whether the store contains the string.
-        """
-        cdef hash_t str_hash
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return True
-            elif string_or_id in SYMBOLS_BY_STR:
-                return True
-            str_hash = hash_string(string_or_id)
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            pass
-        else:
-            # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
-
-        if str_hash < len(SYMBOLS_BY_INT):
-            return True
-        else:
-            return self._map.get(str_hash) is not NULL
-
-    def __iter__(self):
-        """Iterate over the strings in the store, in order.
-
-        YIELDS (str): A string in the store.
-        """
-        cdef int i
-        cdef hash_t key
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            utf8str = <Utf8Str*>self._map.get(key)
-            yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
-
-    def __reduce__(self):
-        strings = list(self)
-        return (StringStore, (strings,), None, None, None)
+    def __getitem__(self, string_or_hash: Union[str, int]) -> Union[str, int]:
+        """Retrieve a string from a given hash. If a string
+        is passed as the input, add it to the store and return
+        its hash.
+
+        string_or_hash (int / str): The hash value to lookup or the string to store.
+        RETURNS (str / int): The stored string or the hash of the newly added string.
+        """
+        if isinstance(string_or_hash, str):
+            return self.add(string_or_hash)
+        else:
+            return self._get_interned_str(string_or_hash)
+
+    def __contains__(self, string_or_hash: Union[str, int]) -> bool:
+        """Check whether a string or a hash is in the store.
+
+        string_or_hash (str / int): The string/hash to check.
+        RETURNS (bool): Whether the store contains the string.
+        """
+        cdef hash_t str_hash = get_string_id(string_or_hash)
+        if str_hash in SYMBOLS_BY_INT:
+            return True
+        else:
+            return self._map.get(str_hash) is not NULL
+
+    def __iter__(self) -> Iterator[str]:
+        """Iterate over the strings in the store in insertion order.
+
+        RETURNS: An iterable collection of strings.
+        """
+        return iter(self.keys())
+
+    def __reduce__(self):
+        strings = list(self)
+        return (StringStore, (strings,), None, None, None)
+
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self._keys.size()
+
+    def add(self, string: str) -> int:
+        """Add a string to the StringStore.
+
+        string (str): The string to add.
+        RETURNS (uint64): The string's hash value.
+        """
+        if not isinstance(string, str):
+            raise TypeError(Errors.E017.format(value_type=type(string)))
+
+        if string in SYMBOLS_BY_STR:
+            return SYMBOLS_BY_STR[string]
+        else:
+            return self._intern_str(string)
+
+    def as_int(self, string_or_hash: Union[str, int]) -> int:
+        """If a hash value is passed as the input, return it as-is. If the input
+        is a string, return its corresponding hash.
+
+        string_or_hash (str / int): The string to hash or a hash value.
+        RETURNS (int): The hash of the string or the input hash value.
+        """
+        if isinstance(string_or_hash, int):
+            return string_or_hash
+        else:
+            return get_string_id(string_or_hash)
+
+    def as_string(self, string_or_hash: Union[str, int]) -> str:
+        """If a string is passed as the input, return it as-is. If the input
+        is a hash value, return its corresponding string.
+
+        string_or_hash (str / int): The hash value to lookup or a string.
+        RETURNS (str): The stored string or the input string.
+        """
+        if isinstance(string_or_hash, str):
+            return string_or_hash
+        else:
+            return self._get_interned_str(string_or_hash)
+
+    def items(self) -> List[Tuple[str, int]]:
+        """Iterate over the stored strings and their hashes in insertion order.
+
+        RETURNS: A list of string-hash pairs.
+        """
+        # Even though we internally store the hashes as keys and the strings as
+        # values, we invert the order in the public API to keep it consistent with
+        # the implementation of the `__iter__` method (where we wish to iterate over
+        # the strings in the store).
+        cdef int i
+        pairs = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            str_hash = self._keys[i]
+            utf8str = <Utf8Str*>self._map.get(str_hash)
+            pairs[i] = (self._decode_str_repr(utf8str), str_hash)
+        return pairs
+
+    def keys(self) -> List[str]:
+        """Iterate over the stored strings in insertion order.
+
+        RETURNS: A list of strings.
+        """
+        cdef int i
+        strings = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            utf8str = <Utf8Str*>self._map.get(self._keys[i])
+            strings[i] = self._decode_str_repr(utf8str)
+        return strings
+
+    def values(self) -> List[int]:
+        """Iterate over the stored string hashes in insertion order.
+
+        RETURNS: A list of string hashes.
+        """
+        cdef int i
+        hashes = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            hashes[i] = self._keys[i]
+        return hashes

    def to_disk(self, path):
        """Save the current state to a directory.
@@ -294,24 +202,122 @@ cdef class StringStore:
    def _reset_and_load(self, strings):
        self.mem = Pool()
        self._map = PreshMap()
-        self.keys.clear()
+        self._keys.clear()
        for string in strings:
            self.add(string)

-    cdef const Utf8Str* intern_unicode(self, str py_string):
-        # 0 means missing, but we don't bother offsetting the index.
-        cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
-
-    @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
-        # TODO: This function's API/behaviour is an unholy mess...
-        # 0 means missing, but we don't bother offsetting the index.
-        cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
-        cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
-        if value is not NULL:
-            return value
-        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
-        self._map.set(key, value)
-        self.keys.push_back(key)
-        return value
+    def _get_interned_str(self, hash_value: int) -> str:
+        cdef hash_t str_hash
+        if not _try_coerce_to_hash(hash_value, &str_hash):
+            raise TypeError(Errors.E4001.format(expected_types="'int'", received_type=type(hash_value)))
+
+        # Handle reserved symbols and empty strings correctly.
+        if str_hash == 0:
+            return ""
+
+        symbol = SYMBOLS_BY_INT.get(str_hash)
+        if symbol is not None:
+            return symbol
+
+        utf8str = <Utf8Str*>self._map.get(str_hash)
+        if utf8str is NULL:
+            raise KeyError(Errors.E018.format(hash_value=str_hash))
+        else:
+            return self._decode_str_repr(utf8str)
+
+    cdef hash_t _intern_str(self, str string):
+        # 0 means missing, but we don't bother offsetting the index.
+        chars = string.encode('utf-8')
+        cdef hash_t key = hash64(<unsigned char*>chars, len(chars), 1)
+        cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
+        if value is not NULL:
+            return key
+
+        value = self._allocate_str_repr(<unsigned char*>chars, len(chars))
+        self._map.set(key, value)
+        self._keys.push_back(key)
+        return key
+
+    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *:
+        cdef int n_length_bytes
+        cdef int i
+        cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
+        cdef uint32_t ulength = length
+        if length < sizeof(string.s):
+            string.s[0] = <unsigned char>length
+            memcpy(&string.s[1], chars, length)
+            return string
+        elif length < 255:
+            string.p = <unsigned char*>self.mem.alloc(length + 1, sizeof(unsigned char))
+            string.p[0] = length
+            memcpy(&string.p[1], chars, length)
+            return string
+        else:
+            i = 0
+            n_length_bytes = (length // 255) + 1
+            string.p = <unsigned char*>self.mem.alloc(length + n_length_bytes, sizeof(unsigned char))
+            for i in range(n_length_bytes-1):
+                string.p[i] = 255
+            string.p[n_length_bytes-1] = length % 255
+            memcpy(&string.p[n_length_bytes], chars, length)
+            return string
+
+    cdef str _decode_str_repr(self, const Utf8Str* string):
+        cdef int i, length
+        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
+            return string.s[1:string.s[0]+1].decode('utf-8')
+        elif string.p[0] < 255:
+            return string.p[1:string.p[0]+1].decode('utf-8')
+        else:
+            i = 0
+            length = 0
+            while string.p[i] == 255:
+                i += 1
+                length += 255
+            length += string.p[i]
+            i += 1
+            return string.p[i:length + i].decode('utf-8')
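The `Utf8Str` layout above keeps the byte length inline: very short strings live in the fixed `s` buffer with a one-byte length, strings under 255 bytes get a single length byte on the heap, and anything longer stores a run of `255` bytes followed by the remainder. A small Python sketch of that long-string length prefix (the helper name is illustrative, not part of the diff):

def encode_length_prefix(length):
    # Mirrors the else-branch of _allocate_str_repr: one 255 per full block,
    # then the remainder byte; _decode_str_repr sums 255s until a byte < 255.
    assert length >= 255
    return [255] * (length // 255) + [length % 255]

assert encode_length_prefix(300) == [255, 45]        # 255 + 45 = 300
assert encode_length_prefix(600) == [255, 255, 90]   # 255 + 255 + 90 = 600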
+cpdef hash_t hash_string(object string) except -1:
+    if not isinstance(string, str):
+        raise TypeError(Errors.E4001.format(expected_types="'str'", received_type=type(string)))
+
+    # Handle reserved symbols and empty strings correctly.
+    if len(string) == 0:
+        return 0
+
+    symbol = SYMBOLS_BY_STR.get(string)
+    if symbol is not None:
+        return symbol
+
+    chars = string.encode('utf-8')
+    return hash64(<unsigned char*>chars, len(chars), 1)
+
+
+cpdef hash_t get_string_id(object string_or_hash) except -1:
+    cdef hash_t str_hash
+
+    try:
+        return hash_string(string_or_hash)
+    except:
+        if _try_coerce_to_hash(string_or_hash, &str_hash):
+            # Coerce the integral key to the expected primitive hash type.
+            # This ensures that custom/overloaded "primitive" data types
+            # such as those implemented by numpy are not inadvertently used
+            # downstream (as these are internally implemented as custom PyObjects
+            # whose comparison operators can incur a significant overhead).
+            return str_hash
+        else:
+            raise TypeError(Errors.E4001.format(expected_types="'str','int'", received_type=type(string_or_hash)))


# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
    try:
        out_hash[0] = key
        return True
    except:
        return False
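Taken together, the new accessors give `StringStore` a dict-like surface: `__getitem__` now both interns strings and resolves hashes, while `as_int`/`as_string` are cheap one-way coercions. A minimal sketch of the intended round-trip, assuming a build of this v4 branch:

from spacy.strings import StringStore

store = StringStore()
h = store["coffee"]           # a str key is added to the store; its hash comes back
assert store[h] == "coffee"   # an int key is looked up as a hash
assert "coffee" in store and h in store
assert store.as_int("coffee") == h and store.as_string(h) == "coffee"
assert list(store) == store.keys()        # iteration follows insertion order
assert store.items() == [("coffee", h)]
assert store.values() == [h]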
@@ -58,14 +58,6 @@ cdef struct TokenC:
        hash_t ent_id


-cdef struct MorphAnalysisC:
-    hash_t key
-    int length
-
-    attr_t* fields
-    attr_t* features
-
-
# Internal struct, for storage and disambiguation of entities.
cdef struct KBEntryC:
@@ -1,5 +1,6 @@
+# DO NOT EDIT! The symbols are frozen as of spaCy v3.0.0.
cdef enum symbol_t:
-    NIL
+    NIL = 0
    IS_ALPHA
    IS_ASCII
    IS_DIGIT

@@ -65,7 +66,7 @@ cdef enum symbol_t:
    FLAG62
    FLAG63

-    ID
+    ID = 64
    ORTH
    LOWER
    NORM

@@ -385,7 +386,7 @@ cdef enum symbol_t:
    DEPRECATED275
    DEPRECATED276

-    PERSON
+    PERSON = 380
    NORP
    FACILITY
    ORG

@@ -405,7 +406,7 @@ cdef enum symbol_t:
    ORDINAL
    CARDINAL

-    acomp
+    acomp = 398
    advcl
    advmod
    agent

@@ -458,12 +459,12 @@ cdef enum symbol_t:
    rcmod
    root
    xcomp

    acl

-    ENT_KB_ID
+    ENT_KB_ID = 452
    MORPH
    ENT_ID

    IDX
-    _
+    _ = 456
+    # DO NOT ADD ANY NEW SYMBOLS!
@@ -469,11 +469,7 @@ IDS = {
}


-def sort_nums(x):
-    return x[1]
-
-
-NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
+NAMES = {v: k for k, v in IDS.items()}
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
@@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #

To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:

-- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`.
- If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.
@@ -1,6 +1,10 @@
import pytest
from spacy.util import get_lang_class
+import functools
from hypothesis import settings
+import inspect
+import importlib
+import sys

# Functionally disable deadline settings for tests
# to prevent spurious test failures in CI builds.

@@ -47,6 +51,33 @@ def pytest_runtest_setup(item):
        pytest.skip("not referencing any issues")


+# Decorator for Cython-built tests
+# https://shwina.github.io/cython-testing/
+def cytest(func):
+    """
+    Wraps `func` in a plain Python function.
+    """
+
+    @functools.wraps(func)
+    def wrapped(*args, **kwargs):
+        bound = inspect.signature(func).bind(*args, **kwargs)
+        return func(*bound.args, **bound.kwargs)
+
+    return wrapped
+
+
+def register_cython_tests(cython_mod_name: str, test_mod_name: str):
+    """
+    Registers all callables with name `test_*` in Cython module `cython_mod_name`
+    as attributes in module `test_mod_name`, making them discoverable by pytest.
+    """
+    cython_mod = importlib.import_module(cython_mod_name)
+    for name in dir(cython_mod):
+        item = getattr(cython_mod, name)
+        if callable(item) and name.startswith("test_"):
+            setattr(sys.modules[test_mod_name], name, item)
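This pairing is exercised later in this compare view: `spacy/tests/parser/_search.pyx` holds the compiled tests and a three-line `test_search.py` shim registers them. As a rough sketch, any new Cython test module would follow the same shape (module names here are illustrative, not part of the diff):

# my_beam_tests.pyx -- compiled by Cython with binding=True
#
#     from ..conftest import cytest
#
#     @cytest
#     def test_something():
#         assert 1 + 1 == 2

# test_my_beam.py -- plain Python file that pytest collects
from ..conftest import register_cython_tests

register_cython_tests("spacy.tests.my_beam_tests", __name__)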
# Fixtures for language tokenizers (languages sorted alphabetically)
@@ -239,7 +270,7 @@ def hsb_tokenizer():

@pytest.fixture(scope="session")
def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab_ko")
    return get_lang_class("ko")().tokenizer
@@ -261,6 +292,20 @@ def la_tokenizer():
    return get_lang_class("la")().tokenizer


+@pytest.fixture(scope="session")
+def ko_tokenizer_natto():
+    pytest.importorskip("natto")
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
@pytest.fixture(scope="session")
def lb_tokenizer():
    return get_lang_class("lb")().tokenizer
@@ -45,6 +45,33 @@ def test_ents_reset(en_vocab):
    assert [t.ent_iob_ for t in doc] == orig_iobs


+def test_ents_clear(en_vocab):
+    """Ensure that removing entities clears token attributes"""
+    text = ["Louisiana", "Office", "of", "Conservation"]
+    doc = Doc(en_vocab, words=text)
+    entity = Span(doc, 0, 4, label=391, span_id="TEST")
+    doc.ents = [entity]
+    doc.ents = []
+    for token in doc:
+        assert token.ent_iob == 2
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.ents = [entity]
+    doc.set_ents([], default="missing")
+    for token in doc:
+        assert token.ent_iob == 0
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.set_ents([], default="blocked")
+    for token in doc:
+        assert token.ent_iob == 3
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+
+
def test_add_overlapping_entities(en_vocab):
    text = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=text)
@@ -380,9 +380,7 @@ def test_doc_api_serialize(en_tokenizer, text):
    assert [t.text for t in tokens] == [t.text for t in new_tokens]
    assert [t.orth for t in tokens] == [t.orth for t in new_tokens]

-    new_tokens = Doc(tokens.vocab).from_bytes(
-        tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
-    )
+    new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
    assert tokens.text == new_tokens.text
    assert [t.text for t in tokens] == [t.text for t in new_tokens]
    assert [t.orth for t in tokens] == [t.orth for t in new_tokens]

@@ -990,3 +988,12 @@ def test_doc_spans_setdefault(en_tokenizer):
    assert len(doc.spans["key2"]) == 1
    doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
    assert len(doc.spans["key3"]) == 2
+
+
+def test_doc_sentiment_from_bytes_v3_to_v4():
+    """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. The sentiment attribute was removed in v4"""
+    doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3<u8\xc4\x04kind\xc4\x00\xc4\x05shape\x92\x01\x0f\xc4\x04data\xc4x\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xa4\x9a\xd3\x17\xca\xf0b\x03\xa4\x9a\xd3\x17\xca\xf0b\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\xa9sentiment\xcb?\xf0\x00\x00\x00\x00\x00\x00\xa6tensor\x85\xc4\x02nd\xc3\xc4\x04type\xa3<f4\xc4\x04kind\xc4\x00\xc4\x05shape\x91\x00\xc4\x04data\xc4\x00\xa4cats\x80\xa5spans\xc4\x01\x90\xa7strings\x92\xa0\xa5happy\xb2has_unknown_spaces\xc2"
+    doc = Doc(Vocab()).from_bytes(doc_bytes)
+    assert doc.text == "happy"
+    with pytest.raises(AttributeError):
+        doc.sentiment == 1.0
@@ -4,7 +4,7 @@ from numpy.testing import assert_array_equal

from spacy.attrs import ORTH, LENGTH
from spacy.lang.en import English
-from spacy.tokens import Doc, Span, Token
+from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.vocab import Vocab
from spacy.util import filter_spans
from thinc.api import get_current_ops

@@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
    assert span.text == text


+@pytest.mark.issue(9556)
+def test_modify_span_group(doc):
+    group = SpanGroup(doc, spans=doc.ents)
+    for span in group:
+        span.start = 0
+        span.label = doc.vocab.strings["TEST"]
+
+    # Span changes must be reflected in the span group
+    assert group[0].start == 0
+    assert group[0].label == doc.vocab.strings["TEST"]
+
+
def test_spans_sent_spans(doc):
    sents = list(doc.sents)
    assert sents[0].start == 0

@@ -293,31 +305,6 @@ def test_span_similarity_match():
    assert span1[:1].similarity(doc.vocab["a"]) == 1.0


-def test_spans_default_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    assert doc[:2].sentiment == 3.0 / 2
-    assert doc[-2:].sentiment == -2.0 / 2
-    assert doc[:-1].sentiment == (3.0 + -2) / 3.0
-
-
-def test_spans_override_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.user_span_hooks["sentiment"] = lambda span: 10.0
-    assert doc[:2].sentiment == 10.0
-    assert doc[-2:].sentiment == 10.0
-    assert doc[:-1].sentiment == 10.0
-
-
def test_spans_are_hashable(en_tokenizer):
    """Test spans can be hashed."""
    text = "good stuff bad stuff"

@@ -680,3 +667,23 @@ def test_span_group_copy(doc):
    assert len(doc.spans["test"]) == 3
    # check that the copy spans were not modified and this is an isolated doc
    assert len(doc_copy.spans["test"]) == 2
+
+
+@pytest.mark.issue(11113)
+def test_span_ent_id(en_tokenizer):
+    doc = en_tokenizer("a b c d")
+    doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
+    span = doc.ents[0]
+    assert doc[1].ent_id_ == "ID0"
+
+    # setting Span.id sets Token.ent_id
+    span.id_ = "ID1"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID1"
+    assert doc[1].ent_id_ == "ID1"
+
+    # Span.ent_id is an alias of Span.id
+    span.ent_id_ = "ID2"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID2"
+    assert doc[1].ent_id_ == "ID2"
@@ -102,8 +102,10 @@ def test_span_group_set_item(doc, other_doc):
    span.label_ = "NEW LABEL"
    span.kb_id = doc.vocab.strings["KB_ID"]

-    assert span_group[index].label != span.label
-    assert span_group[index].kb_id != span.kb_id
+    # Indexing a span group returns a span in which C
+    # data is shared.
+    assert span_group[index].label == span.label
+    assert span_group[index].kb_id == span.kb_id

    span_group[index] = span
    assert span_group[index].start == span.start
@@ -3,6 +3,10 @@ from mock import Mock
from spacy.tokens import Doc, Span, Token
from spacy.tokens.underscore import Underscore

+
+# Helper functions
+def _get_tuple(s: Span):
+    return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
+

@pytest.fixture(scope="function", autouse=True)
def clean_underscore():
@@ -171,3 +175,118 @@ def test_underscore_docstring(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    assert test_method.__doc__ == "I am a docstring"
    assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
+
+
+def test_underscore_for_unique_span(en_tokenizer):
+    """Test that spans with the same boundaries but with different labels are uniquely identified (see #9706)."""
+    Doc.set_extension(name="doc_extension", default=None)
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    doc._.doc_extension = "doc extension"
+    doc[0]._.token_extension = "token extension"
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    # Assert extensions
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change label of span and assert extensions
+    span_1.label_ = "NEW_LABEL"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change KB_ID and assert extensions
+    span_1.kb_id_ = "KB_ID"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change extensions and assert
+    span_2._.span_extension = "updated span_2 extension"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Change span ID and assert extensions
+    span_2.id = 2
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Assert extensions with original key
+    assert doc.user_data[("._.", "doc_extension", None, None)] == "doc extension"
+    assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension"
+
+
+def test_underscore_for_unique_span_from_docs(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Doc.from_docs"""
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text_1 = "Hello, world!"
+    doc_1 = en_tokenizer(text_1)
+    span_1a = Span(doc_1, 0, 2, "SPAN_1a")
+    span_1b = Span(doc_1, 0, 2, "SPAN_1b")
+
+    text_2 = "This is a test."
+    doc_2 = en_tokenizer(text_2)
+    span_2a = Span(doc_2, 0, 3, "SPAN_2a")
+
+    # Set custom extensions
+    doc_1[0]._.token_extension = "token_1"
+    doc_2[1]._.token_extension = "token_2"
+    span_1a._.span_extension = "span_1a extension"
+    span_1b._.span_extension = "span_1b extension"
+    span_2a._.span_extension = "span_2a extension"
+
+    doc = Doc.from_docs([doc_1, doc_2])
+    # Assert extensions
+    assert doc_1.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc_1.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert doc_2.user_data[_get_tuple(span_2a)] == "span_2a extension"
+
+    # Check extensions on merged doc
+    assert doc.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert (
+        doc.user_data[
+            (
+                "._.",
+                "span_extension",
+                span_2a.start_char + len(doc_1.text) + 1,
+                span_2a.end_char + len(doc_1.text) + 1,
+                span_2a.label,
+                span_2a.kb_id,
+                span_2a.id,
+            )
+        ]
+        == "span_2a extension"
+    )
+
+
+def test_underscore_for_unique_span_as_span(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Span.as_doc"""
+    Span.set_extension(name="span_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    span_doc = span_1.as_doc(copy_user_data=True)
+
+    # Assert extensions
+    assert span_doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert span_doc.user_data[_get_tuple(span_2)] == "span_2 extension"
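All three tests above rely on the same keying scheme: span-scoped extension values live in `doc.user_data` under a tuple of prefix, extension name, the span's character offsets, and then its label, KB ID and span ID, exactly as `_get_tuple` builds it. Because label and ID participate in the key, two spans with identical boundaries keep separate extension slots. Roughly (hash values invented for illustration):

# Hypothetical doc.user_data contents after assigning span._.span_extension
# on two same-boundary spans with different labels:
{
    ("._.", "span_extension", 0, 13, 11111, 0, 0): "span_1 extension",  # label SPAN_1
    ("._.", "span_extension", 0, 13, 22222, 0, 0): "span_2 extension",  # label SPAN_2
}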
@@ -7,3 +7,11 @@ import pytest
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
    test_lemma = ko_tokenizer(word)[0].lemma_
    assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+)
+def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
+    test_lemma = ko_tokenizer_natto(word)[0].lemma_
+    assert test_lemma == lemma
@@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
    b = pickle.dumps(ko_tokenizer)
    ko_tokenizer_re = pickle.loads(b)
    assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
+
+
+def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
+    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer_natto.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
+    b = pickle.dumps(ko_tokenizer_natto)
+    ko_tokenizer_natto_re = pickle.loads(b)
+    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
@@ -19,6 +19,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
                "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
# fmt: on

+# tests for ko_tokenizer (default KoreanTokenizer)
+

@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):

@@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
    assert pos == expected_pos.split()


-def test_ko_empty_doc(ko_tokenizer):
+def test_ko_tokenizer_empty_doc(ko_tokenizer):
    tokens = ko_tokenizer("")
    assert len(tokens) == 0

@@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
    assert tokens[1].pos_ == "X"


+# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_natto(text)]
+    assert tokens == expected_tokens.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
+def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = ko_tokenizer_natto(text).user_data["full_tags"]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
+    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
+    assert pos == expected_pos.split()
+
+
+def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
# fmt: off
SPACY_TOKENIZER_TESTS = [
    ("있다.", "있다 ."),
@@ -26,14 +26,6 @@ def test_attrs_idempotence(text):
    assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}


-@pytest.mark.parametrize("text", ["dog"])
-def test_attrs_do_deprecated(text):
-    int_attrs = intify_attrs(
-        {"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
-    )
-    assert int_attrs == {ORTH: 10, IS_ALPHA: True}
-
-
def test_attrs_ent_iob_intify():
    int_attrs = intify_attrs({"ENT_IOB": ""})
    assert int_attrs == {ENT_IOB: 0}
@@ -50,8 +50,6 @@ def test_matcher_from_usage_docs(en_vocab):

    def label_sentiment(matcher, doc, i, matches):
        match_id, start, end = matches[i]
-        if doc.vocab.strings[match_id] == "HAPPY":
-            doc.sentiment += 0.1
        span = doc[start:end]
        with doc.retokenize() as retokenizer:
            retokenizer.merge(span)

@@ -61,7 +59,6 @@ def test_matcher_from_usage_docs(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
    matcher(doc)
-    assert doc.sentiment != 0
    assert doc[1].norm_ == "happy emoji"

@@ -793,9 +790,16 @@ def test_matcher_span(matcher):
    doc = Doc(matcher.vocab, words=text.split())
    span_js = doc[:3]
    span_java = doc[4:]
-    assert len(matcher(doc)) == 2
-    assert len(matcher(span_js)) == 1
-    assert len(matcher(span_java)) == 1
+    doc_matches = matcher(doc)
+    span_js_matches = matcher(span_js)
+    span_java_matches = matcher(span_java)
+    assert len(doc_matches) == 2
+    assert len(span_js_matches) == 1
+    assert len(span_java_matches) == 1
+
+    # match offsets always refer to the doc
+    assert doc_matches[0] == span_js_matches[0]
+    assert doc_matches[1] == span_java_matches[0]


def test_matcher_as_spans(matcher):
@@ -87,14 +87,15 @@ def test_issue4373():

@pytest.mark.issue(4651)
def test_issue4651_with_phrase_matcher_attr():
-    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
-    the method from_disk when the EntityRuler argument phrase_matcher_attr is
+    """Test that the entity_ruler PhraseMatcher is deserialized correctly using
+    the method from_disk when the entity_ruler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
-    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    config = {"phrase_matcher_attr": "LOWER"}
+    ruler = nlp.add_pipe("entity_ruler", config=config)
    ruler.add_patterns(patterns)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

@@ -102,7 +103,7 @@ def test_issue4651_with_phrase_matcher_attr():
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
-        nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
+        nlp_reloaded.add_pipe("entity_ruler", config=config).from_disk(file_path)
        doc_reloaded = nlp_reloaded(text)
        res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
        assert res == res_reloaded

@@ -198,28 +199,6 @@ def test_phrase_matcher_contains(en_vocab):
    assert "TEST2" not in matcher


-def test_phrase_matcher_add_new_api(en_vocab):
-    doc = Doc(en_vocab, words=["a", "b"])
-    patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])]
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("OLD_API", None, *patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("OLD_API_CALLBACK", on_match, *patterns)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-    # New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("NEW_API", patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-
-
def test_phrase_matcher_repeated_add(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    # match ID only gets added once

@@ -468,6 +447,13 @@ def test_phrase_matcher_deprecated(en_vocab):
    assert "spaCy v3.0" in str(record.list[0].message)


+def test_phrase_matcher_non_doc(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.raises(ValueError):
+        matcher.add("TEST", [doc, "junk"])
+
+
@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
def test_phrase_matcher_sent_start(en_vocab, attr):
    _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841
@@ -4,8 +4,8 @@ from pathlib import Path

def test_build_dependencies():
    # Check that library requirements are pinned exactly the same across different setup files.
    # TODO: correct checks for numpy rather than ignoring
    libs_ignore_requirements = [
        "cython",
        "pytest",
        "pytest-timeout",
        "mock",

@@ -22,7 +22,7 @@ def test_build_dependencies():
    # ignore language-specific packages that shouldn't be installed by all
    libs_ignore_setup = [
        "fugashi",
-        "natto-py",
+        "mecab-ko",
        "pythainlp",
        "sudachipy",
        "sudachidict_core",
spacy/tests/parser/_search.pyx (new file, 119 lines)
@@ -0,0 +1,119 @@
+# cython: infer_types=True, binding=True
+from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
+from spacy.typedefs cimport class_t, weight_t
+from cymem.cymem cimport Pool
+
+from ..conftest import cytest
+import pytest
+
+cdef struct TestState:
+    int length
+    int x
+    Py_UNICODE* string
+
+
+cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
+    dest_state = <TestState*>dest
+    src_state = <TestState*>src
+    dest_state.length = src_state.length
+    dest_state.x = src_state.x
+    dest_state.x += clas
+    if extra_args != NULL:
+        dest_state.string = <Py_UNICODE*>extra_args
+    else:
+        dest_state.string = src_state.string
+
+
+cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
+    state = <TestState*>mem.alloc(1, sizeof(TestState))
+    state.length = n
+    state.x = 1
+    if extra_args == NULL:
+        state.string = u'default'
+    else:
+        state.string = <Py_UNICODE*>extra_args
+    return state
+
+
+cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
+    state = <TestState*>state
+    mem.free(state)
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width",
+    [
+        (2, 3),
+        (3, 6),
+        (4, 20),
+    ]
+)
+def test_init(nr_class, beam_width):
+    b = Beam(nr_class, beam_width)
+    assert b.size == 1
+    assert b.width == beam_width
+    assert b.nr_class == nr_class
+
+
+@cytest
+def test_init_violn():
+    MaxViolation()
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (2, 3, 3),
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_initialize(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length, s.length
+        assert s.string == 'default'
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length,extra",
+    [
+        (2, 3, 4, None),
+        (3, 6, 15, u"test beam 1"),
+    ]
+)
+def test_initialize_extra(nr_class, beam_width, length, extra):
+    b = Beam(nr_class, beam_width)
+    if extra is None:
+        b.initialize(initialize, destroy, length, NULL)
+    else:
+        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_transition(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    b.set_cell(0, 2, 30, True, 0)
+    b.set_cell(0, 1, 42, False, 0)
+    b.advance(transition, NULL, NULL)
+    assert b.size == 1, b.size
+    assert b.score == 30, b.score
+    s = <TestState*>b.at(0)
+    assert s.x == 3
+    assert b._states[0].score == 30, b._states[0].score
+    b.set_cell(0, 1, 10, True, 0)
+    b.set_cell(0, 2, 20, True, 0)
+    b.advance(transition, NULL, NULL)
+    assert b._states[0].score == 50, b._states[0].score
+    assert b._states[1].score == 40
+    s = <TestState*>b.at(0)
+    assert s.x == 5
@@ -13,6 +13,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.training import Example, iob_to_biluo, split_bilu_label
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
+from thinc.api import fix_random_seed
import logging

from ..util import make_tempdir

@@ -412,7 +413,7 @@ def test_train_empty():
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
-    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
    for itn in range(2):
        losses = {}
        batches = util.minibatch(train_examples, size=8)

@@ -539,11 +540,11 @@ def test_block_ner():
    assert [token.ent_type_ for token in doc] == expected_types


-@pytest.mark.parametrize("use_upper", [True, False])
-def test_overfitting_IO(use_upper):
+def test_overfitting_IO():
    fix_random_seed(1)
    # Simple test to try and quickly overfit the NER component
    nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
+    ner = nlp.add_pipe("ner", config={"model": {}})
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))

@@ -575,7 +576,6 @@ def test_overfitting_IO(use_upper):
    assert ents2[0].label_ == "LOC"
    # Ensure that the predictions are still the same, even after adding a new label
    ner2 = nlp2.get_pipe("ner")
-    assert ner2.model.attrs["has_upper"] == use_upper
    ner2.add_label("RANDOM_NEW_LABEL")
    doc3 = nlp2(test_text)
    ents3 = doc3.ents

@@ -617,6 +617,52 @@ def test_overfitting_IO(use_upper):
    assert ents[1].kb_id == 0


+def test_is_distillable():
+    nlp = English()
+    ner = nlp.add_pipe("ner")
+    assert ner.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_ner = teacher.add_pipe("ner")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            teacher_ner.add_label(ent[2])
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.00001
+
+    student = English()
+    student_ner = student.add_pipe("ner")
+    student_ner.initialize(
+        get_examples=lambda: train_examples, labels=teacher_ner.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(100):
+        losses = {}
+        student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.0001
+
+    # test the trained model
+    test_text = "I like London."
+    doc = student(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
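The same teacher-student pattern repeats below for the parser and the edit-tree lemmatizer, so it is worth seeing in isolation. A minimal sketch of the flow these tests exercise, assuming `teacher` is an English pipeline whose "ner" pipe is already trained (the helper name and loop count are illustrative):

from spacy.lang.en import English
from spacy.training import Example

def distill_ner_student(teacher, raw_texts, n_iter=100):
    teacher_ner = teacher.get_pipe("ner")
    student = English()
    student_ner = student.add_pipe("ner")
    # Examples with empty reference dicts: the student has no gold
    # annotations and learns from the teacher's predictions instead.
    examples = [Example.from_dict(teacher.make_doc(t), {}) for t in raw_texts]
    student_ner.initialize(
        get_examples=lambda: examples, labels=teacher_ner.label_data
    )
    optimizer = student.create_optimizer()
    for _ in range(n_iter):
        losses = {}
        student_ner.distill(teacher_ner, examples, sgd=optimizer, losses=losses)
    return student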


def test_beam_ner_scores():
    # Test that we can get confidence values out of the beam_ner pipe
    beam_width = 16
@@ -1,13 +1,17 @@
+import itertools
import pytest
+import numpy
from numpy.testing import assert_equal
+from thinc.api import Adam

+from spacy import registry, util
from spacy.attrs import DEP, NORM
from spacy.lang.en import English
+from spacy.tokens import Doc
from spacy.training import Example
-from spacy.tokens import Doc
from spacy.vocab import Vocab
-from spacy import util, registry
-from thinc.api import fix_random_seed

from ...pipeline import DependencyParser
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL

@@ -59,6 +63,8 @@ PARTIAL_DATA = [
    ),
]

+PARSERS = ["parser"]  # TODO: Test beam_parser when ready
+
eps = 0.1


@@ -171,6 +177,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
    assert doc[0].dep != 0


+def test_parser_apply_actions(en_vocab, en_parser):
+    words = ["I", "ate", "pizza"]
+    words2 = ["Eat", "more", "pizza", "!"]
+    doc1 = Doc(en_vocab, words=words)
+    doc2 = Doc(en_vocab, words=words2)
+    docs = [doc1, doc2]
+
+    moves = en_parser.moves
+    moves.add_action(0, "")
+    moves.add_action(1, "")
+    moves.add_action(2, "nsubj")
+    moves.add_action(3, "obj")
+    moves.add_action(2, "amod")
+
+    actions = [
+        numpy.array([0, 0], dtype="i"),
+        numpy.array([2, 0], dtype="i"),
+        numpy.array([0, 4], dtype="i"),
+        numpy.array([3, 3], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([0], dtype="i"),
+        numpy.array([1], dtype="i"),
+    ]
+
+    states = moves.init_batch(docs)
+    active_states = states
+
+    for step_actions in actions:
+        active_states = moves.apply_actions(active_states, step_actions)
+
+    assert len(active_states) == 0
+
+    for (state, doc) in zip(states, docs):
+        moves.set_annotations(state, doc)
+
+    assert docs[0][0].head.i == 1
+    assert docs[0][0].dep_ == "nsubj"
+    assert docs[0][1].head.i == 1
+    assert docs[0][1].dep_ == "ROOT"
+    assert docs[0][2].head.i == 1
+    assert docs[0][2].dep_ == "obj"
+
+    assert docs[1][0].head.i == 0
+    assert docs[1][0].dep_ == "ROOT"
+    assert docs[1][1].head.i == 2
+    assert docs[1][1].dep_ == "amod"
+    assert docs[1][2].head.i == 0
+    assert docs[1][2].dep_ == "obj"
+
+
@pytest.mark.skip(
    reason="The step_through API was removed (but should be brought back)"
)

@@ -319,7 +376,7 @@ def test_parser_constructor(en_vocab):
    DependencyParser(en_vocab, model)


-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+@pytest.mark.parametrize("pipe_name", PARSERS)
def test_incomplete_data(pipe_name):
    # Test that the parser works with incomplete information
    nlp = English()

@@ -345,11 +402,15 @@ def test_incomplete_data(pipe_name):
    assert doc[2].head.i == 1


-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
-def test_overfitting_IO(pipe_name):
+@pytest.mark.parametrize(
+    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
+)
+def test_overfitting_IO(pipe_name, max_moves):
    fix_random_seed(0)
    # Simple test to try and quickly overfit the dependency parser (normal or beam)
    nlp = English()
    parser = nlp.add_pipe(pipe_name)
+    parser.cfg["update_with_oracle_cut_size"] = max_moves
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))

@@ -396,16 +457,67 @@ def test_overfitting_IO(pipe_name):
    assert_equal(batch_deps_1, no_batch_deps)


+def test_is_distillable():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+    assert parser.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_parser = teacher.add_pipe("parser")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            teacher_parser.add_label(dep)
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(200):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["parser"] < 0.0001
+
+    student = English()
+    student_parser = student.add_pipe("parser")
+    student_parser.initialize(
+        get_examples=lambda: train_examples, labels=teacher_parser.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(200):
+        losses = {}
+        student_parser.distill(
+            teacher_parser, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["parser"] < 0.0001
+
+    test_text = "I like securities."
+    doc = student(test_text)
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[3].dep_ == "punct"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+    assert doc[3].head.i == 1
+
+
# fmt: off
@pytest.mark.slow
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
@pytest.mark.parametrize(
    "parser_config",
    [
-        # TransitionBasedParser V1
-        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        # TransitionBasedParser V2
+        # TODO: re-enable after we have a spacy-legacy release for v4. See
+        # https://github.com/explosion/spacy-legacy/pull/36
+        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
+        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
    ],
)
# fmt: on
spacy/tests/parser/test_search.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+from ..conftest import register_cython_tests
+
+register_cython_tests("spacy.tests.parser._search", __name__)
@@ -1,3 +1,4 @@
+from typing import cast
import pickle
import pytest
from hypothesis import given

@@ -6,6 +7,7 @@ from spacy import util
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
+from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.training import Example
from spacy.strings import StringStore
from spacy.util import make_tempdir

@@ -193,6 +195,53 @@ def test_overfitting_IO():
    assert doc4[3].lemma_ == "egg"


+def test_is_distillable():
+    nlp = English()
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    assert lemmatizer.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
+    teacher_lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    student = English()
+    student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+    student_lemmatizer.min_tree_freq = 1
+    student_lemmatizer.initialize(
+        get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_lemmatizer.distill(
+            teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    test_text = "She likes blue eggs"
+    doc = student(test_text)
+    assert doc[0].lemma_ == "she"
+    assert doc[1].lemma_ == "like"
+    assert doc[2].lemma_ == "blue"
+    assert doc[3].lemma_ == "egg"
+
+
def test_lemmatizer_requires_labels():
    nlp = English()
    nlp.add_pipe("trainable_lemmatizer")

@@ -313,3 +362,26 @@ def test_empty_strings():
    no_change = trees.add("xyz", "xyz")
    empty = trees.add("", "")
    assert no_change == empty
+
+
+def test_save_activations():
+    nlp = English()
+    lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
+    lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = lemmatizer.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "trainable_lemmatizer" not in doc.activations
+
+    lemmatizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["trainable_lemmatizer"].keys()) == [
+        "probabilities",
+        "tree_ids",
+    ]
+    assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
+    assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
@@ -1,7 +1,8 @@
-from typing import Callable, Iterable, Dict, Any
+from typing import Callable, Iterable, Dict, Any, cast

 import pytest
 from numpy.testing import assert_equal
+from thinc.types import Ragged

 from spacy import registry, util
 from spacy.attrs import ENT_KB_ID

@@ -10,7 +11,7 @@ from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
 from spacy.ml.models.entity_linker import build_span_maker
-from spacy.pipeline import EntityLinker
+from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer

@@ -1203,6 +1204,69 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL


+def test_save_activations():
+    nlp = English()
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = cast(TrainablePipe, nlp.add_pipe("entity_linker", last=True))
+    assert isinstance(entity_linker, EntityLinker)
+    entity_linker.set_kb(create_kb)
+    assert "Q2146908" in entity_linker.vocab.strings
+    assert "Q2146908" in entity_linker.kb.vocab.strings
+
+    # initialize the NEL pipe
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    nO = entity_linker.model.get_dim("nO")
+
+    nlp.add_pipe("sentencizer", first=True)
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    doc = nlp("Russ Cochran was a publisher")
+    assert "entity_linker" not in doc.activations
+
+    entity_linker.save_activations = True
+    doc = nlp("Russ Cochran was a publisher")
+    assert set(doc.activations["entity_linker"].keys()) == {"ents", "scores"}
+    ents = doc.activations["entity_linker"]["ents"]
+    assert isinstance(ents, Ragged)
+    assert ents.data.shape == (2, 1)
+    assert ents.data.dtype == "uint64"
+    assert ents.lengths.shape == (1,)
+    scores = doc.activations["entity_linker"]["scores"]
+    assert isinstance(scores, Ragged)
+    assert scores.data.shape == (2, 1)
+    assert scores.data.dtype == "float32"
+    assert scores.lengths.shape == (1,)
+
+
 def test_span_maker_forward_with_empty():
     """The forward pass of the span maker may have a doc with no entities."""
     nlp = English()
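For the entity linker, the saved activations are `Ragged` arrays rather than plain tensors: each group of mentions contributes a variable number of rows, so `ents` and `scores` carry a flat `data` array plus `lengths`. A rough sketch of unpacking them, not part of the diff (it assumes a `doc` produced by a pipeline set up as in the test above, with `save_activations` enabled):

    # Sketch: walking the Ragged activations stored by the entity linker.
    from thinc.types import Ragged
    from spacy.tokens import Doc

    def unpack_linker_activations(doc: Doc):
        """Yield (entity-id hashes, scores) per group of candidate rows."""
        ents: Ragged = doc.activations["entity_linker"]["ents"]      # uint64 hashes
        scores: Ragged = doc.activations["entity_linker"]["scores"]  # float32
        start = 0
        for length in ents.lengths:
            yield (
                ents.data[start : start + length],
                scores.data[start : start + length],
            )
            start += length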
@@ -4,7 +4,7 @@ from spacy import registry
 from spacy.tokens import Doc, Span
 from spacy.language import Language
 from spacy.lang.en import English
-from spacy.pipeline import EntityRuler, EntityRecognizer, merge_entities
+from spacy.pipeline import EntityRecognizer, merge_entities
 from spacy.pipeline import SpanRuler
 from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError

@@ -12,8 +12,6 @@ from spacy.tests.util import make_tempdir

 from thinc.api import NumpyOps, get_current_ops

-ENTITY_RULERS = ["entity_ruler", "future_entity_ruler"]
-

 @pytest.fixture
 def nlp():

@@ -40,13 +38,12 @@ def add_ent_component(doc):


 @pytest.mark.issue(3345)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue3345(entity_ruler_factory):
+def test_issue3345():
     """Test case where preset entity crosses sentence boundary."""
     nlp = English()
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns([{"label": "GPE", "pattern": "New York"}])
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]

@@ -65,15 +62,14 @@ def test_issue3345(entity_ruler_factory):


 @pytest.mark.issue(4849)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue4849(entity_ruler_factory):
+def test_issue4849():
     nlp = English()
     patterns = [
         {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
         {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
     ]
     ruler = nlp.add_pipe(
-        entity_ruler_factory,
+        "entity_ruler",
         name="entity_ruler",
         config={"phrase_matcher_attr": "LOWER"},
     )

@@ -96,11 +92,10 @@ def test_issue4849(entity_ruler_factory):


 @pytest.mark.issue(5918)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue5918(entity_ruler_factory):
+def test_issue5918():
     # Test edge case when merging entities.
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Digicon Inc"},
         {"label": "ORG", "pattern": "Rotan Mosle Inc's"},

@@ -125,10 +120,9 @@ def test_issue5918(entity_ruler_factory):


 @pytest.mark.issue(8168)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue8168(entity_ruler_factory):
+def test_issue8168():
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Apple"},
         {

@@ -148,12 +142,9 @@ def test_issue8168(entity_ruler_factory):


 @pytest.mark.issue(8216)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_fix8216(nlp, patterns):
     """Test that patterns don't get added excessively."""
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"validate": True}
-    )
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
     assert pattern_count > 0

@@ -162,16 +153,15 @@ def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
     assert after_count == pattern_count


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_init(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     assert "HELLO" in ruler
     assert "BYE" in ruler
     nlp.remove_pipe("entity_ruler")
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     doc = nlp("hello world bye bye")
     assert len(doc.ents) == 2

@@ -179,23 +169,21 @@ def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_no_patterns_warns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_no_patterns_warns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
     nlp.remove_pipe("entity_ruler")
-    nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    nlp.add_pipe("entity_ruler")
     assert nlp.pipe_names == ["entity_ruler"]
     with pytest.warns(UserWarning):
         doc = nlp("hello world bye bye")
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_patterns(nlp, patterns):
     # initialize with patterns
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
     assert len(ruler.labels) == 4

@@ -207,7 +195,7 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     nlp.config["initialize"]["components"]["entity_ruler"] = {
         "patterns": {"@misc": "entity_ruler_patterns"}
     }
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     nlp.initialize()
     assert len(ruler.labels) == 4

@@ -216,20 +204,18 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     ruler.initialize(lambda: [])
     assert len(ruler.labels) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     doc = nlp("hello world")

@@ -241,9 +227,8 @@ def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_existing(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")

@@ -252,11 +237,8 @@ def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_overwrite(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")

@@ -266,11 +248,8 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_complex(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("foo foo bye bye")

@@ -281,11 +260,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents[1]) == 2


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_entity_id(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
     assert len(doc.ents) == 1

@@ -293,26 +269,23 @@ def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
     assert doc.ents[0].ent_id_ == "a1"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_cfg_ent_id_sep(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_cfg_ent_id_sep(nlp, patterns):
     config = {"overwrite_ents": True, "ent_id_sep": "**"}
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler", config=config)
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
-    if isinstance(ruler, EntityRuler):
-        assert "TECH_ORG**a1" in ruler.phrase_patterns
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "TECH_ORG"
     assert doc.ents[0].ent_id_ == "a1"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns)
+def test_entity_ruler_serialize_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe("entity_ruler", name="new_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
     new_ruler = new_ruler.from_bytes(ruler_bytes)

@@ -324,28 +297,27 @@ def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
     assert sorted(new_ruler.labels) == sorted(ruler.labels)


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_phrase_matcher_attr_bytes(
-    nlp, patterns, entity_ruler_factory
-):
-    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", patterns=patterns)
+def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"phrase_matcher_attr": "LOWER"}
+    )
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
-    assert new_ruler.phrase_matcher_attr is None
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(patterns)
     assert len(new_ruler.labels) == 4
     assert new_ruler.phrase_matcher_attr == "LOWER"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_validate(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
-    validated_ruler = EntityRuler(nlp, validate=True)
+def test_entity_ruler_validate(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
+    validated_ruler = nlp.add_pipe(
+        "entity_ruler", name="validated_ruler", config={"validate": True}
+    )

     valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
     invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}

@@ -362,16 +334,15 @@ def test_entity_ruler_validate(nlp, entity_ruler_factory):
         validated_ruler.add_patterns([invalid_pattern])


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_properties(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+def test_entity_ruler_properties(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
-    assert sorted(ruler.ent_ids) == ["a1", "a2"]
+    assert sorted(ruler.ids) == ["a1", "a2"]


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_overlapping_spans(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "FOOBAR", "pattern": "foo bar"},
         {"label": "BARBAZ", "pattern": "bar baz"},

@@ -382,9 +353,8 @@ def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "FOOBAR"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_fuzzy_pipe(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")

@@ -392,9 +362,8 @@ def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_fuzzy(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")

@@ -402,15 +371,13 @@ def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):
+def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():
         return lambda x, y, z: False

     ruler = nlp.add_pipe(
-        entity_ruler_factory,
-        name="entity_ruler",
+        "entity_ruler",
         config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
     )
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]

@@ -420,14 +387,13 @@ def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):


 @pytest.mark.parametrize("n_process", [1, 2])
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
+def test_entity_ruler_multiprocessing(nlp, n_process):
     if isinstance(get_current_ops, NumpyOps) or n_process < 2:
         texts = ["I enjoy eating Pizza Hut pizza."]

         patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]

-        ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+        ruler = nlp.add_pipe("entity_ruler")
         ruler.add_patterns(patterns)

         for doc in nlp.pipe(texts, n_process=2):

@@ -435,9 +401,8 @@ def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
             assert ent.ent_id_ == "1234"


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_jsonl(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler.jsonl")

@@ -446,9 +411,8 @@ def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
            ruler.from_disk(d / "non_existing.jsonl")  # read from a bad jsonl file


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_dir(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler")

@@ -457,9 +421,8 @@ def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
            ruler.from_disk(d / "non_existing_dir")  # read from a bad directory


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_basic(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -469,24 +432,16 @@ def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
     doc = nlp("Dina went to school")
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
     assert doc.ents[0].label_ == "PERSON"
     assert doc.ents[0].text == "Dina"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
     assert len(ruler.patterns) == 2


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_same_id_multiple_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "DinaCorp", "id": "dina"},

@@ -495,25 +450,15 @@ def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory):
     ruler.add_patterns(patterns)
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
-        assert "ORG||dina" in ruler.phrase_matcher
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
-        assert "ORG||dina" not in ruler.phrase_matcher
     assert len(doc.ents) == 1


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_nonexisting_pattern(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -528,9 +473,8 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
         ruler.remove_by_id("nepattern")


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_several_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -544,27 +488,20 @@ def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "Dina"
     assert doc.ents[1].label_ == "ORG"
     assert doc.ents[1].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 2
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "ORG"
     assert doc.ents[0].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 1
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_patterns_in_a_row(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -580,21 +517,15 @@ def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
     assert doc.ents[1].text == "ACME"
     assert doc.ents[2].label_ == "DATE"
     assert doc.ents[2].text == "her birthday"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-        ruler.remove("acme")
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("dina")
-        ruler.remove_by_id("acme")
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("dina")
+    ruler.remove_by_id("acme")
+    ruler.remove_by_id("bday")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_all_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},

@@ -602,29 +533,19 @@ def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
     ]
     ruler.add_patterns(patterns)
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     assert len(ruler.patterns) == 2
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("bday")
     assert len(ruler.patterns) == 0
     with pytest.warns(UserWarning):
         doc = nlp("Dina founded her company ACME on her birthday")
     assert len(doc.ents) == 0


-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_and_add(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "DATE", "pattern": "last time"}]
     ruler.add_patterns(patterns)
     doc = ruler(

@@ -645,10 +566,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "last time"
     assert doc.ents[1].label_ == "DATE"
     assert doc.ents[1].text == "this time"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc("I saw him last time we met, this time he brought some flowers")
     )

@@ -671,10 +589,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     )
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc(
             "I saw him last time we met, this time he brought some flowers, another time some chocolate."
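Taken together, the entity ruler changes drop the dual-implementation test matrix: the `ENTITY_RULERS` parametrization ("entity_ruler" vs. "future_entity_ruler") disappears, every `isinstance(ruler, EntityRuler)` branch goes away, and the SpanRuler-style spellings win out: `remove_by_id` instead of `remove`, `ids` instead of `ent_ids`, and construction only through `nlp.add_pipe`. A small sketch of the removal API exercised above (illustrative, reusing a pattern id from these tests):

    # Sketch: pattern removal after this change, on this dev branch.
    from spacy.lang.en import English

    nlp = English()
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([{"label": "PERSON", "pattern": "Dina", "id": "dina"}])

    # Old EntityRuler spelling (removed here): ruler.remove("dina")
    ruler.remove_by_id("dina")  # the single supported spelling
    assert len(ruler.patterns) == 0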
@@ -9,7 +9,7 @@ from thinc.types import Array2d, Ragged

 from spacy.lang.en import English
 from spacy.ml import FeatureExtractor, StaticVectors
-from spacy.ml._character_embed import CharacterEmbed
+from spacy.ml.character_embed import CharacterEmbed
 from spacy.tokens import Doc


@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_equal

@@ -7,6 +8,7 @@ from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir
 from spacy.morphology import Morphology
+from spacy.pipeline import TrainablePipe
 from spacy.attrs import MORPH
 from spacy.tokens import Doc

@@ -48,6 +50,12 @@ def test_implicit_label():
     nlp.initialize(get_examples=lambda: train_examples)


+def test_is_distillable():
+    nlp = English()
+    morphologizer = nlp.add_pipe("morphologizer")
+    assert morphologizer.is_distillable
+
+
 def test_no_resize():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")

@@ -197,3 +205,25 @@ def test_overfitting_IO():
     gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
     assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
+
+
+def test_save_activations():
+    nlp = English()
+    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+    train_examples = []
+    for inst in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert "morphologizer" not in doc.activations
+
+    morphologizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {
+        "label_ids",
+        "probabilities",
+    }
+    assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
+    assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
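`test_is_distillable` marks another theme of this branch: pipes advertise whether they can act as a student in teacher-student distillation via an `is_distillable` attribute. Only the attribute access is shown in the diff; the filtering pattern below is an illustrative sketch of how one might use it:

    # Sketch: list which components could be distilled into a student
    # pipeline. Rule-based pipes like entity_ruler are expected to opt out.
    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe("morphologizer")
    nlp.add_pipe("entity_ruler")

    distillable = [
        name for name, pipe in nlp.pipeline if getattr(pipe, "is_distillable", False)
    ]
    print(distillable)  # expected: ["morphologizer"]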
@@ -529,17 +529,6 @@ def test_pipe_label_data_no_labels(pipe):
     assert "labels" not in get_arg_names(initialize)


-def test_warning_pipe_begin_training():
-    with pytest.warns(UserWarning, match="begin_training"):
-
-        class IncompatPipe(TrainablePipe):
-            def __init__(self):
-                ...
-
-            def begin_training(*args, **kwargs):
-                ...
-
-
 def test_pipe_methods_initialize():
     """Test that the [initialize] config reflects the components correctly."""
     nlp = Language()
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_equal
 from spacy.attrs import SENT_START

@@ -6,9 +7,16 @@ from spacy import util
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.language import Language
+from spacy.pipeline import TrainablePipe
 from spacy.tests.util import make_tempdir


+def test_is_distillable():
+    nlp = English()
+    senter = nlp.add_pipe("senter")
+    assert senter.is_distillable
+
+
 def test_label_types():
     nlp = Language()
     senter = nlp.add_pipe("senter")

@@ -101,3 +109,26 @@ def test_overfitting_IO():
     # test internal pipe labels vs. Language.pipe_labels with hidden labels
     assert nlp.get_pipe("senter").labels == ("I", "S")
     assert "senter" not in nlp.pipe_labels
+
+
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    senter = cast(TrainablePipe, nlp.add_pipe("senter"))
+
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = senter.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "senter" not in doc.activations
+
+    senter.save_activations = True
+    doc = nlp("This is a test.")
+    assert "senter" in doc.activations
+    assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
+    assert doc.activations["senter"]["probabilities"].shape == (5, nO)
+    assert doc.activations["senter"]["label_ids"].shape == (5,)
@@ -1,15 +1,15 @@
 import pytest
 import numpy
 from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged
+from thinc.api import get_current_ops, Ragged, fix_random_seed

 from spacy import util
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tokens import SpanGroup
-from spacy.tokens._dict_proxies import SpanGroups
+from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import fix_random_seed, registry, make_tempdir
+from spacy.util import registry, make_tempdir

 OPS = get_current_ops()

@@ -444,3 +444,23 @@ def test_set_candidates():
     assert len(docs[0].spans["candidates"]) == 9
     assert docs[0].spans["candidates"][0].text == "Just"
     assert docs[0].spans["candidates"][4].text == "Just a"
+
+
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    train_examples = make_examples(nlp)
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = spancat.model.get_dim("nO")
+    assert nO == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    doc = nlp("This is a test.")
+    assert "spancat" not in doc.activations
+
+    spancat.save_activations = True
+    doc = nlp("This is a test.")
+    assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
+    assert doc.activations["spancat"]["indices"].shape == (12, 2)
+    assert doc.activations["spancat"]["scores"].shape == (12, nO)
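The hard-coded 12 in the spancat assertions falls out of the suggester: assuming the default n-gram suggester over sizes [1, 2, 3] (the suggester config is not shown in this hunk), a 5-token doc like "This is a test." yields 5 + 4 + 3 = 12 candidate spans, each stored as a (start, end) row in `indices` and scored against the two labels in `scores`. A tiny sketch of that arithmetic:

    # Sketch: why the test expects 12 candidate spans (assumes an ngram
    # suggester with sizes [1, 2, 3]).
    def n_ngram_spans(n_tokens: int, sizes=(1, 2, 3)) -> int:
        return sum(max(0, n_tokens - size + 1) for size in sizes)

    assert n_ngram_spans(5) == 5 + 4 + 3 == 12  # "This is a test." -> 5 tokens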
Some files were not shown because too many files have changed in this diff.