Merge branch 'v4' into cleanup/move-legacy-entity-linker

Paul O'Leary McCann 2023-01-23 18:31:35 +09:00
commit 5a5891608c
316 changed files with 30057 additions and 35893 deletions

View File

@@ -5,7 +5,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=9.0.0.dev1,<9.1.0",
"thinc>=9.0.0.dev2,<9.1.0",
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"

View File

@@ -3,7 +3,7 @@ spacy-legacy>=3.0.12,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=9.0.0.dev1,<9.1.0
thinc>=9.0.0.dev2,<9.1.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0

View File

@@ -22,6 +22,7 @@ classifiers =
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
@@ -38,7 +39,7 @@ install_requires =
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=9.0.0.dev1,<9.1.0
thinc>=9.0.0.dev2,<9.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
@@ -65,7 +66,7 @@ console_scripts =
lookups =
spacy_lookups_data>=1.0.3,<1.1.0
transformers =
spacy_transformers>=1.1.2,<1.2.0
spacy_transformers>=1.1.2,<1.3.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =

View File

@@ -33,12 +33,10 @@ MOD_NAMES = [
"spacy.kb.candidate",
"spacy.kb.kb",
"spacy.kb.kb_in_memory",
"spacy.ml.parser_model",
"spacy.ml.tb_framework",
"spacy.morphology",
"spacy.pipeline.dep_parser",
"spacy.pipeline._edit_tree_internals.edit_trees",
"spacy.pipeline.morphologizer",
"spacy.pipeline.ner",
"spacy.pipeline.pipe",
"spacy.pipeline.trainable_pipe",
"spacy.pipeline.sentencizer",
@@ -46,6 +44,7 @@ MOD_NAMES = [
"spacy.pipeline.tagger",
"spacy.pipeline.transition_parser",
"spacy.pipeline._parser_internals.arc_eager",
"spacy.pipeline._parser_internals.batch",
"spacy.pipeline._parser_internals.ner",
"spacy.pipeline._parser_internals.nonproj",
"spacy.pipeline._parser_internals.search",
@@ -53,6 +52,7 @@ MOD_NAMES = [
"spacy.pipeline._parser_internals.stateclass",
"spacy.pipeline._parser_internals.transition_system",
"spacy.pipeline._parser_internals._beam_utils",
"spacy.pipeline._parser_internals._parser_utils",
"spacy.tokenizer",
"spacy.training.align",
"spacy.training.gold_io",

View File

@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.5.0"
__version__ = "4.0.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@@ -583,6 +583,10 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
"""Given a directory and a suffix, recursively find all files matching the suffix.
Directories or files with names beginning with a . are ignored, but hidden flags on
filesystems are not checked.
If the suffix is `None`, no suffix-based filtering is applied."""
if not path.is_dir():
return [path]
paths = [path]
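
A brief usage sketch (not part of the diff), assuming the helper is imported from spacy.cli._util as defined in this file:

from pathlib import Path
from spacy.cli._util import walk_directory

# Recursively collect the .spacy files under ./corpus; names starting with a
# dot are skipped, and passing suffix=None disables suffix-based filtering.
paths = walk_directory(Path("corpus"), suffix=".spacy")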

View File

@@ -53,9 +53,7 @@ def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
"""
for entry in srsly.read_jsonl(path):
if field not in entry:
msg.fail(
f"{path} does not contain the required '{field}' field.", exits=1
)
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
else:
yield entry[field]
@@ -118,8 +116,10 @@ def apply(
paths = walk_directory(data_path)
if len(paths) == 0:
docbin.to_disk(output_file)
msg.warn("Did not find data to process,"
f" {data_path} seems to be an empty directory.")
msg.warn(
"Did not find data to process,"
f" {data_path} seems to be an empty directory."
)
return
nlp = load_model(model)
msg.good(f"Loaded model {model}")

View File

@@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
"json": json_to_docs,
}
AUTO = "auto"
# File types that can be written to stdout
FILE_TYPES_STDOUT = ("json",)
@@ -49,7 +51,7 @@ def convert_cli(
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
@@ -70,8 +72,8 @@ def convert_cli(
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
silent = output_dir == "-"
msg = Printer(no_print=silent)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
converter = _get_converter(msg, converter, input_path)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
convert(
input_path,
output_dir,
@@ -100,7 +102,7 @@ def convert(
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: str = "auto",
converter: str,
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
concatenate: bool = False,
@@ -212,18 +214,22 @@ def verify_cli_args(
input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if converter == "auto" and len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
if converter != "auto" and converter not in CONVERTERS:
if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
def _get_converter(msg, converter, input_path: Path):
if input_path.is_dir():
input_path = walk_directory(input_path, converter)[0]
if converter == "auto":
if converter == AUTO:
input_locs = walk_directory(input_path, suffix=None)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
input_path = input_locs[0]
else:
input_path = walk_directory(input_path, suffix=converter)[0]
if converter == AUTO:
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
with input_path.open(encoding="utf8") as file_:

View File

@@ -87,12 +87,11 @@ grad_factor = 1.0
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = false
nO = null
[components.parser.model.tok2vec]
@@ -108,12 +107,11 @@ grad_factor = 1.0
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null
[components.ner.model.tok2vec]
@@ -314,12 +312,11 @@ width = ${components.tok2vec.model.encode.width}
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null
[components.parser.model.tok2vec]
@@ -332,12 +329,11 @@ width = ${components.tok2vec.model.encode.width}
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]

View File

@@ -11,6 +11,7 @@ from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings
from ..util import is_in_jupyter
from ..util import find_available_port
_html = {}
@@ -36,7 +37,7 @@ def render(
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
@@ -82,6 +83,7 @@ def serve(
manual: bool = False,
port: int = 5000,
host: str = "0.0.0.0",
auto_select_port: bool = False,
) -> None:
"""Serve displaCy visualisation.
@@ -93,15 +95,20 @@
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation.
host (str): Host to serve visualisation.
auto_select_port (bool): Automatically select a port if the specified port is in use.
DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers
"""
from wsgiref import simple_server
port = find_available_port(port, host, auto_select_port)
if is_in_jupyter():
warnings.warn(Warnings.W011)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
render(
docs, style=style, page=page, minify=minify, options=options, manual=manual
)
httpd = simple_server.make_server(host, port, app)
print(f"\nUsing the '{style}' visualizer")
print(f"Serving on http://{host}:{port} ...\n")

View File

@@ -94,7 +94,7 @@ class SpanRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
@@ -510,7 +510,7 @@ class EntityRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):

View File

@@ -207,6 +207,9 @@ class Warnings(metaclass=ErrorsWithCodes):
"is a Cython extension type.")
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
class Errors(metaclass=ErrorsWithCodes):
@@ -944,11 +947,20 @@ class Errors(metaclass=ErrorsWithCodes):
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
"knowledge base, use `InMemoryLookupKB`.")
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
"with `displacy.serve(doc, port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port)` "
"or use `auto_switch_port=True` to pick an available port automatically.")
# v4 error strings
E4000 = ("Expected a Doc as input, but got: '{type}'")
E4001 = ("Expected input to be one of the following types: ({expected_types}), "
"but got '{received_type}'")
E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
E4003 = ("Training examples for distillation must have the exact same tokens in the "
"reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.")
# fmt: on

View File

@@ -5,7 +5,6 @@ from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab

View File

@@ -4,6 +4,8 @@ from libc.stdint cimport int64_t
from typing import Optional
from ..util import registry
cdef extern from "polyleven.c":
int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
@@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
if k is None:
k = -1
return polyleven(<PyObject*>a, <PyObject*>b, k)
cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
if fuzzy >= 0:
max_edits = fuzzy
else:
# allow at least two edits (to allow at least one transposition) and up
# to 30% of the pattern string length
max_edits = max(2, round(0.3 * len(pattern_text)))
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
@registry.misc("spacy.levenshtein_compare.v1")
def make_levenshtein_compare():
return levenshtein_compare
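
A small sketch (not part of the diff) of the default edit budget levenshtein_compare applies when no explicit fuzziness is given:

from spacy.matcher.levenshtein import levenshtein_compare

# With fuzzy=-1 the budget is max(2, round(0.3 * len(pattern_text))), so a
# 10-character pattern like "definitely" tolerates up to 3 edits.
print(levenshtein_compare("definately", "definitely"))  # True (1 edit)
print(levenshtein_compare("xyz", "definitely"))         # False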

View File

@@ -77,3 +77,4 @@ cdef class Matcher:
cdef public object _extensions
cdef public object _extra_predicates
cdef public object _seen_attrs
cdef public object _fuzzy_compare

View File

@@ -5,7 +5,8 @@ from ..vocab import Vocab
from ..tokens import Doc, Span
class Matcher:
def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ...
def __init__(self, vocab: Vocab, validate: bool = ...,
fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
def __contains__(self, key: str) -> bool: ...

View File

@@ -1,4 +1,4 @@
# cython: infer_types=True, profile=True
# cython: binding=True, infer_types=True, profile=True
from typing import List, Iterable
from libcpp.vector cimport vector
@@ -20,10 +20,12 @@ from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from .levenshtein import levenshtein_compare
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
from ..strings cimport get_string_id
from ..attrs import IDS
from ..util import registry
DEF PADDING = 5
@@ -36,11 +38,13 @@ cdef class Matcher:
USAGE: https://spacy.io/usage/rule-based-matching
"""
def __init__(self, vocab, validate=True):
def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
"""Create the Matcher.
vocab (Vocab): The vocabulary object, which must be shared with the
documents the matcher will operate on.
validate (bool): Validate all patterns added to this matcher.
fuzzy_compare (Callable[[str, str, int], bool]): The comparison method
for the FUZZY operators.
"""
self._extra_predicates = []
self._patterns = {}
@@ -51,9 +55,10 @@ cdef class Matcher:
self.vocab = vocab
self.mem = Pool()
self.validate = validate
self._fuzzy_compare = fuzzy_compare
def __reduce__(self):
data = (self.vocab, self._patterns, self._callbacks)
data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare)
return (unpickle_matcher, data, None, None)
def __len__(self):
@@ -128,7 +133,7 @@ cdef class Matcher:
for pattern in patterns:
try:
specs = _preprocess_pattern(pattern, self.vocab,
self._extensions, self._extra_predicates)
self._extensions, self._extra_predicates, self._fuzzy_compare)
self.patterns.push_back(init_pattern(self.mem, key, specs))
for spec in specs:
for attr, _ in spec[1]:
@@ -327,8 +332,8 @@ cdef class Matcher:
return key
def unpickle_matcher(vocab, patterns, callbacks):
matcher = Matcher(vocab)
def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare):
matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare)
for key, pattern in patterns.items():
callback = callbacks.get(key, None)
matcher.add(key, pattern, on_match=callback)
@@ -755,7 +760,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
return id_attr.value
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare):
"""This function interprets the pattern, converting the various bits of
syntactic sugar before we compile it into a struct with init_pattern.
@@ -782,7 +787,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
ops = _get_operators(spec)
attr_values = _get_attr_values(spec, string_store)
extensions = _get_extensions(spec, string_store, extensions_table)
predicates = _get_extra_predicates(spec, extra_predicates, vocab)
predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare)
for op in ops:
tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
return tokens
@@ -827,16 +832,45 @@ def _get_attr_values(spec, string_store):
# These predicate helper classes are used to match the REGEX, IN, >= etc
# extensions to the matcher introduced in #3173.
class _FuzzyPredicate:
operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5",
"FUZZY6", "FUZZY7", "FUZZY8", "FUZZY9")
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i
self.attr = attr
self.value = value
self.predicate = predicate
self.is_extension = is_extension
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
fuzz = self.predicate[len("FUZZY"):] # number after prefix
self.fuzzy = int(fuzz) if fuzz else -1
self.fuzzy_compare = fuzzy_compare
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
def __call__(self, Token token):
if self.is_extension:
value = token._.get(self.attr)
else:
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
if self.value == value:
return True
return self.fuzzy_compare(value, self.value, self.fuzzy)
class _RegexPredicate:
operators = ("REGEX",)
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i
self.attr = attr
self.value = re.compile(value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@@ -851,18 +885,28 @@ class _RegexPredicate:
class _SetPredicate:
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i
self.attr = attr
self.vocab = vocab
self.regex = regex
self.fuzzy = fuzzy
self.fuzzy_compare = fuzzy_compare
if self.attr == MORPH:
# normalize morph strings
self.value = set(self.vocab.morphology.add(v) for v in value)
else:
self.value = set(get_string_id(v) for v in value)
if self.regex:
self.value = set(re.compile(v) for v in value)
elif self.fuzzy is not None:
# add to string store
self.value = set(self.vocab.strings.add(v) for v in value)
else:
self.value = set(get_string_id(v) for v in value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@@ -890,9 +934,29 @@ class _SetPredicate:
return False
if self.predicate == "IN":
return value in self.value
if self.regex:
value = self.vocab.strings[value]
return any(bool(v.search(value)) for v in self.value)
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
elif value in self.value:
return True
else:
return False
elif self.predicate == "NOT_IN":
return value not in self.value
if self.regex:
value = self.vocab.strings[value]
return not any(bool(v.search(value)) for v in self.value)
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
elif value in self.value:
return False
else:
return True
elif self.predicate == "IS_SUBSET":
return value <= self.value
elif self.predicate == "IS_SUPERSET":
@@ -907,13 +971,14 @@ class _SetPredicate:
class _ComparisonPredicate:
operators = ("==", "!=", ">=", "<=", ">", "<")
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i
self.attr = attr
self.value = value
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@@ -936,7 +1001,7 @@ class _ComparisonPredicate:
return value < self.value
def _get_extra_predicates(spec, extra_predicates, vocab):
def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare):
predicate_types = {
"REGEX": _RegexPredicate,
"IN": _SetPredicate,
@@ -950,6 +1015,16 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
"<=": _ComparisonPredicate,
">": _ComparisonPredicate,
"<": _ComparisonPredicate,
"FUZZY": _FuzzyPredicate,
"FUZZY1": _FuzzyPredicate,
"FUZZY2": _FuzzyPredicate,
"FUZZY3": _FuzzyPredicate,
"FUZZY4": _FuzzyPredicate,
"FUZZY5": _FuzzyPredicate,
"FUZZY6": _FuzzyPredicate,
"FUZZY7": _FuzzyPredicate,
"FUZZY8": _FuzzyPredicate,
"FUZZY9": _FuzzyPredicate,
}
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = []
@@ -967,22 +1042,47 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
attr = "ORTH"
attr = IDS.get(attr.upper())
if isinstance(value, dict):
processed = False
value_with_upper_keys = {k.upper(): v for k, v in value.items()}
for type_, cls in predicate_types.items():
if type_ in value_with_upper_keys:
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
# Don't create a redundant predicates.
# This helps with efficiency, as we're caching the results.
if predicate.key in seen_predicates:
output.append(seen_predicates[predicate.key])
else:
extra_predicates.append(predicate)
output.append(predicate.i)
seen_predicates[predicate.key] = predicate.i
processed = True
if not processed:
warnings.warn(Warnings.W035.format(pattern=value))
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare))
return output
def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None):
output = []
for type_, value in value_dict.items():
type_ = type_.upper()
cls = predicate_types.get(type_)
if cls is None:
warnings.warn(Warnings.W035.format(pattern=value_dict))
# ignore unrecognized predicate type
continue
elif cls == _RegexPredicate:
if isinstance(value, dict):
# add predicates inside regex operator
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
extra_predicates, seen_predicates,
regex=True))
continue
elif cls == _FuzzyPredicate:
if isinstance(value, dict):
# add predicates inside fuzzy operator
fuzz = type_[len("FUZZY"):] # number after prefix
fuzzy_val = int(fuzz) if fuzz else -1
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
extra_predicates, seen_predicates,
fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare))
continue
predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare)
# Don't create redundant predicates.
# This helps with efficiency, as we're caching the results.
if predicate.key in seen_predicates:
output.append(seen_predicates[predicate.key])
else:
extra_predicates.append(predicate)
output.append(predicate.i)
seen_predicates[predicate.key] = predicate.i
return output
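
For reference (not part of the diff), a sketch of how the new fuzzy_compare keyword and the FUZZY operators handled by the predicates above fit together; the FUZZY-wrapping-IN pattern relies on the nested handling in _get_extra_predicates_dict:

import spacy
from spacy.matcher import Matcher
from spacy.matcher.levenshtein import levenshtein_compare

nlp = spacy.blank("en")
# FUZZY uses the default edit budget; FUZZY1..FUZZY9 cap the number of edits.
patterns = [
    [{"LOWER": {"FUZZY": "definitely"}}],
    [{"LOWER": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}],
]
matcher = Matcher(nlp.vocab, validate=True, fuzzy_compare=levenshtein_compare)
matcher.add("FUZZY_EXAMPLES", patterns)
doc = nlp("I definately think this is awsome")
print(matcher(doc))  # list of (match_id, start, end) tuples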

View File

@@ -1,164 +0,0 @@
from thinc.api import Model, normal_init
from ..util import registry
@registry.layers("spacy.PrecomputableAffine.v1")
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
model = Model(
"precomputable_affine",
forward,
init=init,
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
params={"W": None, "b": None, "pad": None},
attrs={"dropout_rate": dropout},
)
return model
def forward(model, X, is_train):
nF = model.get_dim("nF")
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.get_param("W")
# Preallocate array for layer output, including padding.
Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
# Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
# change its shape to (nF, nO, nP) without breaking existing models. So
# we'll squeeze the first dimension here.
Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
def backward(dY_ids):
# This backprop is particularly tricky, because we get back a different
# thing from what we put out. We put out an array of shape:
# (nB, nF, nO, nP), and get back:
# (nB, nO, nP) and ids (nB, nF)
# The ids tell us the values of nF, so we would have:
#
# dYf = zeros((nB, nF, nO, nP))
# for b in range(nB):
# for f in range(nF):
# dYf[b, ids[b, f]] += dY[b]
#
# However, we avoid building that array for efficiency -- and just pass
# in the indices.
dY, ids = dY_ids
assert dY.ndim == 3
assert dY.shape[1] == nO, dY.shape
assert dY.shape[2] == nP, dY.shape
# nB = dY.shape[0]
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
Xf = X[ids]
Xf = Xf.reshape((Xf.shape[0], nF * nI))
model.inc_grad("b", dY.sum(axis=0))
dY = dY.reshape((dY.shape[0], nO * nP))
Wopfi = W.transpose((1, 2, 0, 3))
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
# (o, p, f, i) --> (f, o, p, i)
dWopfi = dWopfi.transpose((2, 0, 1, 3))
model.inc_grad("W", dWopfi)
return dXf.reshape((dXf.shape[0], nF, nI))
return Yf, backward
def _backprop_precomputable_affine_padding(model, dY, ids):
nB = dY.shape[0]
nF = model.get_dim("nF")
nP = model.get_dim("nP")
nO = model.get_dim("nO")
# Backprop the "padding", used as a filler for missing values.
# Values that are missing are set to -1, and each state vector could
# have multiple missing values. The padding has different values for
# different missing features. The gradient of the padding vector is:
#
# for b in range(nB):
# for f in range(nF):
# if ids[b, f] < 0:
# d_pad[f] += dY[b]
#
# Which can be rewritten as:
#
# (ids < 0).T @ dY
mask = model.ops.asarray(ids < 0, dtype="f")
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
return d_pad.reshape((1, nF, nO, nP))
def init(model, X=None, Y=None):
"""This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
"""
if model.has_param("W") and model.get_param("W").any():
return
nF = model.get_dim("nF")
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.ops.alloc4f(nF, nO, nP, nI)
b = model.ops.alloc2f(nO, nP)
pad = model.ops.alloc4f(1, nF, nO, nP)
ops = model.ops
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
pad = normal_init(ops, pad.shape, mean=1.0)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
ids = ops.alloc((5000, nF), dtype="f")
ids += ops.xp.random.uniform(0, 1000, ids.shape)
ids = ops.asarray(ids, dtype="i")
tokvecs = ops.alloc((5000, nI), dtype="f")
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
tokvecs.shape
)
def predict(ids, tokvecs):
# nS ids. nW tokvecs. Exclude the padding array.
hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p)
vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
# need nS vectors
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
vectors = vectors.reshape((vectors.shape[0], nO, nP))
vectors += b
vectors = model.ops.asarray(vectors)
if nP >= 2:
return model.ops.maxout(vectors)[0]
else:
return vectors * (vectors >= 0)
tol_var = 0.01
tol_mean = 0.01
t_max = 10
W = model.get_param("W").copy()
b = model.get_param("b").copy()
for t_i in range(t_max):
acts1 = predict(ids, tokvecs)
var = model.ops.xp.var(acts1)
mean = model.ops.xp.mean(acts1)
if abs(var - 1.0) >= tol_var:
W /= model.ops.xp.sqrt(var)
model.set_param("W", W)
elif abs(mean) >= tol_mean:
b -= mean
model.set_param("b", b)
else:
break

View File

@@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
"update",
"rehearse",
"get_loss",
"get_teacher_student_loss",
"initialize",
"begin_update",
"finish_update",

View File

@@ -1,17 +1,20 @@
from typing import Optional, List, cast
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from typing import Optional, List, Tuple, Any
from thinc.types import Floats2d
from thinc.api import Model
import warnings
from ...errors import Errors
from ...errors import Errors, Warnings
from ...compat import Literal
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
from ...tokens import Doc
from ...tokens.doc import Doc
TransitionSystem = Any # TODO
State = Any # TODO
@registry.architectures("spacy.TransitionBasedParser.v2")
def build_tb_parser_model(
@registry.architectures.register("spacy.TransitionBasedParser.v2")
def transition_parser_v2(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],
extra_state_tokens: bool,
@@ -19,6 +22,46 @@ def build_tb_parser_model(
maxout_pieces: int,
use_upper: bool,
nO: Optional[int] = None,
) -> Model:
if not use_upper:
warnings.warn(Warnings.W400)
return build_tb_parser_model(
tok2vec,
state_type,
extra_state_tokens,
hidden_width,
maxout_pieces,
nO=nO,
)
@registry.architectures.register("spacy.TransitionBasedParser.v3")
def transition_parser_v3(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],
extra_state_tokens: bool,
hidden_width: int,
maxout_pieces: int,
nO: Optional[int] = None,
) -> Model:
return build_tb_parser_model(
tok2vec,
state_type,
extra_state_tokens,
hidden_width,
maxout_pieces,
nO=nO,
)
def build_tb_parser_model(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],
extra_state_tokens: bool,
hidden_width: int,
maxout_pieces: int,
nO: Optional[int] = None,
) -> Model:
"""
Build a transition-based parser model. Can apply to NER or dependency-parsing.
@@ -51,14 +94,7 @@ build_tb_parser_model(
feature sets (for the NER) or 13 (for the parser).
hidden_width (int): The width of the hidden layer.
maxout_pieces (int): How many pieces to use in the state prediction layer.
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
is replaced with a ReLu non-linearity if use_upper=True, and no
non-linearity if use_upper=False.
use_upper (bool): Whether to use an additional hidden layer after the state
vector in order to predict the action scores. It is recommended to set
this to False for large pretrained models such as transformers, and True
for smaller networks. The upper layer is computed on CPU, which becomes
a bottleneck on larger GPU-based models, where it's also less necessary.
Recommended values are 1, 2 or 3.
nO (int or None): The number of actions the model will predict between.
Usually inferred from data at the beginning of training, or loaded from
disk.
@@ -69,106 +105,11 @@ def build_tb_parser_model(
nr_feature_tokens = 6 if extra_state_tokens else 3
else:
raise ValueError(Errors.E917.format(value=state_type))
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(
tok2vec,
list2array(),
Linear(hidden_width, t2v_width),
return TransitionModel(
tok2vec=tok2vec,
state_tokens=nr_feature_tokens,
hidden_width=hidden_width,
maxout_pieces=maxout_pieces,
nO=nO,
unseen_classes=set(),
)
tok2vec.set_dim("nO", hidden_width)
lower = _define_lower(
nO=hidden_width if use_upper else nO,
nF=nr_feature_tokens,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces,
)
upper = None
if use_upper:
with use_ops("cpu"):
# Initialize weights at zero, as it's a classification layer.
upper = _define_upper(nO=nO, nI=None)
return TransitionModel(tok2vec, lower, upper, resize_output)
def _define_upper(nO, nI):
return Linear(nO=nO, nI=nI, init_W=zero_init)
def _define_lower(nO, nF, nI, nP):
return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
def resize_output(model, new_nO):
if model.attrs["has_upper"]:
return _resize_upper(model, new_nO)
return _resize_lower(model, new_nO)
def _resize_upper(model, new_nO):
upper = model.get_ref("upper")
if upper.has_dim("nO") is None:
upper.set_dim("nO", new_nO)
return model
elif new_nO == upper.get_dim("nO"):
return model
smaller = upper
nI = smaller.maybe_get_dim("nI")
with use_ops("cpu"):
larger = _define_upper(nO=new_nO, nI=nI)
# it could be that the model is not initialized yet, then skip this bit
if smaller.has_param("W"):
larger_W = larger.ops.alloc2f(new_nO, nI)
larger_b = larger.ops.alloc1f(new_nO)
smaller_W = smaller.get_param("W")
smaller_b = smaller.get_param("b")
# Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here.
if smaller.has_dim("nO"):
old_nO = smaller.get_dim("nO")
larger_W[:old_nO] = smaller_W
larger_b[:old_nO] = smaller_b
for i in range(old_nO, new_nO):
model.attrs["unseen_classes"].add(i)
larger.set_param("W", larger_W)
larger.set_param("b", larger_b)
model._layers[-1] = larger
model.set_ref("upper", larger)
return model
def _resize_lower(model, new_nO):
lower = model.get_ref("lower")
if lower.has_dim("nO") is None:
lower.set_dim("nO", new_nO)
return model
smaller = lower
nI = smaller.maybe_get_dim("nI")
nF = smaller.maybe_get_dim("nF")
nP = smaller.maybe_get_dim("nP")
larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
# it could be that the model is not initialized yet, then skip this bit
if smaller.has_param("W"):
larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
larger_b = larger.ops.alloc2f(new_nO, nP)
larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
smaller_W = smaller.get_param("W")
smaller_b = smaller.get_param("b")
smaller_pad = smaller.get_param("pad")
# Copy the old weights and padding into the new layer
if smaller.has_dim("nO"):
old_nO = smaller.get_dim("nO")
larger_W[:, 0:old_nO, :, :] = smaller_W
larger_pad[:, :, 0:old_nO, :] = smaller_pad
larger_b[0:old_nO, :] = smaller_b
for i in range(old_nO, new_nO):
model.attrs["unseen_classes"].add(i)
larger.set_param("W", larger_W)
larger.set_param("b", larger_b)
larger.set_param("pad", larger_pad)
model._layers[1] = larger
model.set_ref("lower", larger)
return model
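
A short sketch (not part of the diff) of how the two registered names now relate; both resolve through the spaCy registry used in this module:

from spacy.util import registry

# v3 is the new entry point without use_upper; the v2 wrapper keeps its old
# signature for config compatibility, warns (W400) when use_upper=False, and
# forwards to the same build_tb_parser_model shown above.
build_v3 = registry.architectures.get("spacy.TransitionBasedParser.v3")
build_v2 = registry.architectures.get("spacy.TransitionBasedParser.v2")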

View File

@@ -1,49 +0,0 @@
from libc.string cimport memset, memcpy
from thinc.backends.cblas cimport CBlas
from ..typedefs cimport weight_t, hash_t
from ..pipeline._parser_internals._state cimport StateC
cdef struct SizesC:
int states
int classes
int hiddens
int pieces
int feats
int embed_width
cdef struct WeightsC:
const float* feat_weights
const float* feat_bias
const float* hidden_bias
const float* hidden_weights
const float* seen_classes
cdef struct ActivationsC:
int* token_ids
float* unmaxed
float* scores
float* hiddens
int* is_valid
int _curr_size
int _max_size
cdef WeightsC get_c_weights(model) except *
cdef SizesC get_c_sizes(model, int batch_size) except *
cdef ActivationsC alloc_activations(SizesC n) nogil
cdef void free_activations(const ActivationsC* A) nogil
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
cdef void cpu_log_loss(float* d_scores,
const float* costs, const int* is_valid, const float* scores, int O) nogil

View File

@@ -1,500 +0,0 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
cimport numpy as np
from libc.math cimport exp
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from thinc.backends.cblas cimport saxpy, sgemm
import numpy
import numpy.random
from thinc.api import Model, CupyOps, NumpyOps, get_ops
from .. import util
from ..errors import Errors
from ..typedefs cimport weight_t, class_t, hash_t
from ..pipeline._parser_internals.stateclass cimport StateClass
cdef WeightsC get_c_weights(model) except *:
cdef WeightsC output
cdef precompute_hiddens state2vec = model.state2vec
output.feat_weights = state2vec.get_feat_weights()
output.feat_bias = <const float*>state2vec.bias.data
cdef np.ndarray vec2scores_W
cdef np.ndarray vec2scores_b
if model.vec2scores is None:
output.hidden_weights = NULL
output.hidden_bias = NULL
else:
vec2scores_W = model.vec2scores.get_param("W")
vec2scores_b = model.vec2scores.get_param("b")
output.hidden_weights = <const float*>vec2scores_W.data
output.hidden_bias = <const float*>vec2scores_b.data
cdef np.ndarray class_mask = model._class_mask
output.seen_classes = <const float*>class_mask.data
return output
cdef SizesC get_c_sizes(model, int batch_size) except *:
cdef SizesC output
output.states = batch_size
if model.vec2scores is None:
output.classes = model.state2vec.get_dim("nO")
else:
output.classes = model.vec2scores.get_dim("nO")
output.hiddens = model.state2vec.get_dim("nO")
output.pieces = model.state2vec.get_dim("nP")
output.feats = model.state2vec.get_dim("nF")
output.embed_width = model.tokvecs.shape[1]
return output
cdef ActivationsC alloc_activations(SizesC n) nogil:
cdef ActivationsC A
memset(&A, 0, sizeof(A))
resize_activations(&A, n)
return A
cdef void free_activations(const ActivationsC* A) nogil:
free(A.token_ids)
free(A.scores)
free(A.unmaxed)
free(A.hiddens)
free(A.is_valid)
cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
if n.states <= A._max_size:
A._curr_size = n.states
return
if A._max_size == 0:
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
A._max_size = n.states
else:
A.token_ids = <int*>realloc(A.token_ids,
n.states * n.feats * sizeof(A.token_ids[0]))
A.scores = <float*>realloc(A.scores,
n.states * n.classes * sizeof(A.scores[0]))
A.unmaxed = <float*>realloc(A.unmaxed,
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
A.hiddens = <float*>realloc(A.hiddens,
n.states * n.hiddens * sizeof(A.hiddens[0]))
A.is_valid = <int*>realloc(A.is_valid,
n.states * n.classes * sizeof(A.is_valid[0]))
A._max_size = n.states
A._curr_size = n.states
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil:
cdef double one = 1.0
resize_activations(A, n)
for i in range(n.states):
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
sum_state_features(cblas, A.unmaxed,
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
for i in range(n.states):
saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
for j in range(n.hiddens):
index = i * n.hiddens * n.pieces + j * n.pieces
which = _arg_max(&A.unmaxed[index], n.pieces)
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
memset(A.scores, 0, n.states * n.classes * sizeof(float))
if W.hidden_weights == NULL:
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
else:
# Compute hidden-to-output
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
1.0, <const float *>A.hiddens, n.hiddens,
<const float *>W.hidden_weights, n.hiddens,
0.0, A.scores, n.classes)
# Add bias
for i in range(n.states):
saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
# Set unseen classes to minimum value
i = 0
min_ = A.scores[0]
for i in range(1, n.states * n.classes):
if A.scores[i] < min_:
min_ = A.scores[i]
for i in range(n.states):
for j in range(n.classes):
if not W.seen_classes[j]:
A.scores[i*n.classes+j] = min_
cdef void sum_state_features(CBlas cblas, float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i
cdef const float* feature
padding = cached
cached += F * O
cdef int id_stride = F*O
cdef float one = 1.
for b in range(B):
for f in range(F):
if token_ids[f] < 0:
feature = &padding[f*O]
else:
idx = token_ids[f] * id_stride + f*O
feature = &cached[idx]
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
token_ids += F
cdef void cpu_log_loss(float* d_scores,
const float* costs, const int* is_valid, const float* scores,
int O) nogil:
"""Do multi-label log loss"""
cdef double max_, gmax, Z, gZ
best = arg_max_if_gold(scores, costs, is_valid, O)
guess = _arg_max(scores, O)
if best == -1 or guess == -1:
# These shouldn't happen, but if they do, we want to make sure we don't
# cause an OOB access.
return
Z = 1e-10
gZ = 1e-10
max_ = scores[guess]
gmax = scores[best]
for i in range(O):
Z += exp(scores[i] - max_)
if costs[i] <= costs[best]:
gZ += exp(scores[i] - gmax)
for i in range(O):
if costs[i] <= costs[best]:
d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
else:
d_scores[i] = exp(scores[i]-max_) / Z
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
const int* is_valid, int n) nogil:
# Find minimum cost
cdef float cost = 1
for i in range(n):
if is_valid[i] and costs[i] < cost:
cost = costs[i]
# Now find best-scoring with that cost
cdef int best = -1
for i in range(n):
if costs[i] <= cost and is_valid[i]:
if best == -1 or scores[i] > scores[best]:
best = i
return best
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
cdef int best = -1
for i in range(n):
if is_valid[i] >= 1:
if best == -1 or scores[i] > scores[best]:
best = i
return best
class ParserStepModel(Model):
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
dropout=0.1):
Model.__init__(self, name="parser_step_model", forward=step_forward)
self.attrs["has_upper"] = has_upper
self.attrs["dropout_rate"] = dropout
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
if layers[1].get_dim("nP") >= 2:
activation = "maxout"
elif has_upper:
activation = None
else:
activation = "relu"
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
activation=activation, train=train)
if has_upper:
self.vec2scores = layers[-1]
else:
self.vec2scores = None
self.cuda_stream = util.get_cuda_stream(non_blocking=True)
self.backprops = []
self._class_mask = numpy.zeros((self.nO,), dtype='f')
self._class_mask.fill(1)
if unseen_classes is not None:
for class_ in unseen_classes:
self._class_mask[class_] = 0.
def clear_memory(self):
del self.tokvecs
del self.bp_tokvecs
del self.state2vec
del self.backprops
del self._class_mask
@property
def nO(self):
if self.attrs["has_upper"]:
return self.vec2scores.get_dim("nO")
else:
return self.state2vec.get_dim("nO")
def class_is_unseen(self, class_):
return self._class_mask[class_]
def mark_class_unseen(self, class_):
self._class_mask[class_] = 0
def mark_class_seen(self, class_):
self._class_mask[class_] = 1
def get_token_ids(self, states):
cdef StateClass state
states = [state for state in states if not state.is_final()]
cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
dtype='i', order='C')
ids.fill(-1)
c_ids = <int*>ids.data
for state in states:
state.c.set_context_tokens(c_ids, ids.shape[1])
c_ids += ids.shape[1]
return ids
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
if isinstance(self.state2vec.ops, CupyOps) \
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
self.backprops.append((
util.get_async(self.cuda_stream, token_ids),
util.get_async(self.cuda_stream, d_vector),
get_d_tokvecs
))
else:
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
def finish_steps(self, golds):
# Add a padding vector to the d_tokvecs gradient, so that missing
# values don't affect the real gradient.
d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
# Tells CUDA to block, so our async copies complete.
if self.cuda_stream is not None:
self.cuda_stream.synchronize()
for ids, d_vector, bp_vector in self.backprops:
d_state_features = bp_vector((d_vector, ids))
ids = ids.flatten()
d_state_features = d_state_features.reshape(
(ids.size, d_state_features.shape[2]))
self.ops.scatter_add(d_tokvecs, ids,
d_state_features)
# Padded -- see update()
self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
NUMPY_OPS = NumpyOps()
def step_forward(model: ParserStepModel, states, is_train):
token_ids = model.get_token_ids(states)
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
mask = None
if model.attrs["has_upper"]:
dropout_rate = model.attrs["dropout_rate"]
if is_train and dropout_rate > 0:
mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
vector *= mask
scores, get_d_vector = model.vec2scores(vector, is_train)
else:
scores = NumpyOps().asarray(vector)
get_d_vector = lambda d_scores: d_scores
# If the class is unseen, make sure its score is minimum
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
def backprop_parser_step(d_scores):
# Zero vectors for unseen classes
d_scores *= model._class_mask
d_vector = get_d_vector(d_scores)
if mask is not None:
d_vector *= mask
model.backprop_step(token_ids, d_vector, get_d_tokvecs)
return None
return scores, backprop_parser_step
cdef class precompute_hiddens:
"""Allow a model to be "primed" by pre-computing input features in bulk.
This is used for the parser, where we want to take a batch of documents,
and compute vectors for each (token, position) pair. These vectors can then
be reused, especially for beam-search.
Let's say we're using 12 features for each state, e.g. word at start of
buffer, three words on stack, their children, etc. In the normal arc-eager
system, a document of length N is processed in 2*N states. This means we'll
create 2*N*12 feature vectors --- but if we pre-compute, we only need
N*12 vector computations. The saving for beam-search is much better:
if we have a beam of k, we'll normally make 2*N*12*K computations --
so we can save the factor k. This also gives a nice CPU/GPU division:
we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU.
"""
cdef readonly int nF, nO, nP
cdef bint _is_synchronized
cdef public object ops
cdef public object numpy_ops
cdef public object _cpu_ops
cdef np.ndarray _features
cdef np.ndarray _cached
cdef np.ndarray bias
cdef object _cuda_stream
cdef object _bp_hiddens
cdef object activation
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
activation="maxout", train=False):
gpu_cached, bp_features = lower_model(tokvecs, train)
cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray):
# Note the passing of cuda_stream here: it lets
# cupy make the copy asynchronously.
# We then have to block before first use.
cached = gpu_cached.get(stream=cuda_stream)
else:
cached = gpu_cached
if not isinstance(lower_model.get_param("b"), numpy.ndarray):
self.bias = lower_model.get_param("b").get(stream=cuda_stream)
else:
self.bias = lower_model.get_param("b")
self.nF = cached.shape[1]
if lower_model.has_dim("nP"):
self.nP = lower_model.get_dim("nP")
else:
self.nP = 1
self.nO = cached.shape[2]
self.ops = lower_model.ops
self.numpy_ops = NumpyOps()
self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
assert activation in (None, "relu", "maxout")
self.activation = activation
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
self._bp_hiddens = bp_features
cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized and self._cuda_stream is not None:
self._cuda_stream.synchronize()
self._is_synchronized = True
return <float*>self._cached.data
def has_dim(self, name):
if name == "nF":
return self.nF if self.nF is not None else True
elif name == "nP":
return self.nP if self.nP is not None else True
elif name == "nO":
return self.nO if self.nO is not None else True
else:
return False
def get_dim(self, name):
if name == "nF":
return self.nF
elif name == "nP":
return self.nP
elif name == "nO":
return self.nO
else:
raise ValueError(Errors.E1033.format(name=name))
def set_dim(self, name, value):
if name == "nF":
self.nF = value
elif name == "nP":
self.nP = value
elif name == "nO":
self.nO = value
else:
raise ValueError(Errors.E1033.format(name=name))
def __call__(self, X, bint is_train):
if is_train:
return self.begin_update(X)
else:
return self.predict(X), lambda X: X
def predict(self, X):
return self.begin_update(X)[0]
def begin_update(self, token_ids):
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO, self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
# - Input to forward on CPU
# - Output from forward on CPU
# - Input to backward on GPU!
# - Output from backward on GPU
bp_hiddens = self._bp_hiddens
cdef CBlas cblas = self._cpu_ops.cblas()
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
sum_state_features(cblas, <float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector += self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector_ids):
d_state_vector, token_ids = d_state_vector_ids
d_state_vector = bp_nonlinearity(d_state_vector)
d_tokens = bp_hiddens((d_state_vector, token_ids))
return d_tokens
return state_vector, backward
def _nonlinearity(self, state_vector):
if self.activation == "maxout":
return self._maxout_nonlinearity(state_vector)
else:
return self._relu_nonlinearity(state_vector)
def _maxout_nonlinearity(self, state_vector):
state_vector, mask = self.numpy_ops.maxout(state_vector)
# We're outputting to CPU, but we need this variable on GPU for the
# backward pass.
mask = self.ops.asarray(mask)
def backprop_maxout(d_best):
return self.ops.backprop_maxout(d_best, mask, self.nP)
return state_vector, backprop_maxout
def _relu_nonlinearity(self, state_vector):
state_vector = state_vector.reshape((state_vector.shape[0], -1))
mask = state_vector >= 0.
state_vector *= mask
# We're outputting to CPU, but we need this variable on GPU for the
# backward pass.
mask = self.ops.asarray(mask)
def backprop_relu(d_best):
d_best *= mask
return d_best.reshape((d_best.shape + (1,)))
return state_vector, backprop_relu
cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
if n_classes == 2:
return 0 if scores[0] > scores[1] else 1
cdef int i
cdef int best = 0
cdef float mode = scores[0]
for i in range(1, n_classes):
if scores[i] > mode:
mode = scores[i]
best = i
return best

spacy/ml/tb_framework.pxd Normal file (28 lines)
View File

@@ -0,0 +1,28 @@
from libc.stdint cimport int8_t
cdef struct SizesC:
int states
int classes
int hiddens
int pieces
int feats
int embed_width
int tokens
cdef struct WeightsC:
const float* feat_weights
const float* feat_bias
const float* hidden_bias
const float* hidden_weights
const int8_t* seen_mask
cdef struct ActivationsC:
int* token_ids
float* unmaxed
float* hiddens
int* is_valid
int _curr_size
int _max_size

View File

@@ -1,50 +0,0 @@
from thinc.api import Model, noop
from .parser_model import ParserStepModel
from ..util import registry
@registry.layers("spacy.TransitionModel.v1")
def TransitionModel(
tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
):
"""Set up a stepwise transition-based model"""
if upper is None:
has_upper = False
upper = noop()
else:
has_upper = True
# don't define nO for this object, because we can't dynamically change it
return Model(
name="parser_model",
forward=forward,
dims={"nI": tok2vec.maybe_get_dim("nI")},
layers=[tok2vec, lower, upper],
refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
init=init,
attrs={
"has_upper": has_upper,
"unseen_classes": set(unseen_classes),
"resize_output": resize_output,
},
)
def forward(model, X, is_train):
step_model = ParserStepModel(
X,
model.layers,
unseen_classes=model.attrs["unseen_classes"],
train=is_train,
has_upper=model.attrs["has_upper"],
)
return step_model, step_model.finish_steps
def init(model, X=None, Y=None):
model.get_ref("tok2vec").initialize(X=X)
lower = model.get_ref("lower")
lower.initialize()
if model.attrs["has_upper"]:
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
model.get_ref("upper").initialize(X=statevecs)

spacy/ml/tb_framework.pyx Normal file (621 lines)
View File

@@ -0,0 +1,621 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
from typing import List, Tuple, Any, Optional, TypeVar, cast
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from libcpp.vector cimport vector
import numpy
cimport numpy as np
from thinc.api import Model, normal_init, chain, list2array, Linear
from thinc.api import uniform_init, glorot_uniform_init, zero_init
from thinc.api import NumpyOps
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
from thinc.types import Ints1d, Ints2d
from ..errors import Errors
from ..pipeline._parser_internals import _beam_utils
from ..pipeline._parser_internals.batch import GreedyBatch
from ..pipeline._parser_internals._parser_utils cimport arg_max
from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
from ..pipeline._parser_internals.transition_system cimport TransitionSystem
from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
from ..tokens.doc import Doc
from ..util import registry
State = Any # TODO
@registry.layers("spacy.TransitionModel.v2")
def TransitionModel(
*,
tok2vec: Model[List[Doc], List[Floats2d]],
beam_width: int = 1,
beam_density: float = 0.0,
state_tokens: int,
hidden_width: int,
maxout_pieces: int,
nO: Optional[int] = None,
unseen_classes=set(),
) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
"""Set up a transition-based parsing model, using a maxout hidden
layer and a linear output layer.
"""
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) # type: ignore
tok2vec_projected.set_dim("nO", hidden_width)
# FIXME: we use `output` as a container for the output layer's
# weights and biases. Thinc optimizers cannot handle resizing
# of parameters. So, when the parser model is resized, we
# construct a new `output` layer, which has a different key in
# the optimizer. Once the optimizer supports parameter resizing,
# we can replace the `output` layer by `output_W` and `output_b`
# parameters in this model.
output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
return Model(
name="parser_model",
forward=forward,
init=init,
layers=[tok2vec_projected, output],
refs={
"tok2vec": tok2vec_projected,
"output": output,
},
params={
"hidden_W": None, # Floats2d W for the hidden layer
"hidden_b": None, # Floats1d bias for the hidden layer
"hidden_pad": None, # Floats1d padding for the hidden layer
},
dims={
"nO": None, # Output size
"nP": maxout_pieces,
"nH": hidden_width,
"nI": tok2vec_projected.maybe_get_dim("nO"),
"nF": state_tokens,
},
attrs={
"beam_width": beam_width,
"beam_density": beam_density,
"unseen_classes": set(unseen_classes),
"resize_output": resize_output,
},
)
def resize_output(model: Model, new_nO: int) -> Model:
old_nO = model.maybe_get_dim("nO")
output = model.get_ref("output")
if old_nO is None:
model.set_dim("nO", new_nO)
output.set_dim("nO", new_nO)
output.initialize()
return model
elif new_nO <= old_nO:
return model
elif output.has_param("W"):
nH = model.get_dim("nH")
new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
new_output.initialize()
new_W = new_output.get_param("W")
new_b = new_output.get_param("b")
old_W = output.get_param("W")
old_b = output.get_param("b")
new_W[:old_nO] = old_W # type: ignore
new_b[:old_nO] = old_b # type: ignore
for i in range(old_nO, new_nO):
model.attrs["unseen_classes"].add(i)
model.layers[-1] = new_output
model.set_ref("output", new_output)
# TODO: Avoid this private intrusion
model._dims["nO"] = new_nO
return model
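A small numpy sketch of the copy-and-grow logic above (illustrative only, not part of the module): the enlarged, zero-initialized output matrix keeps the learned rows, and the newly added class indices start out as unseen.

import numpy as np

old_nO, new_nO, nH = 3, 5, 8
old_W = np.random.rand(old_nO, nH).astype("f")
old_b = np.random.rand(old_nO).astype("f")
new_W = np.zeros((new_nO, nH), dtype="f")     # zero_init for the enlarged layer
new_b = np.zeros(new_nO, dtype="f")
new_W[:old_nO] = old_W                        # keep the previously learned rows
new_b[:old_nO] = old_b
unseen_classes = set(range(old_nO, new_nO))   # the added classes start out "unseen"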
def init(
model,
X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
):
if X is not None:
docs, moves = X
model.get_ref("tok2vec").initialize(X=docs)
else:
model.get_ref("tok2vec").initialize()
inferred_nO = _infer_nO(Y)
if inferred_nO is not None:
current_nO = model.maybe_get_dim("nO")
if current_nO is None or current_nO != inferred_nO:
model.attrs["resize_output"](model, inferred_nO)
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nH = model.get_dim("nH")
nI = model.get_dim("nI")
nF = model.get_dim("nF")
ops = model.ops
Wl = ops.alloc2f(nH * nP, nF * nI)
bl = ops.alloc1f(nH * nP)
padl = ops.alloc1f(nI)
# Wl = zero_init(ops, Wl.shape)
Wl = glorot_uniform_init(ops, Wl.shape)
padl = uniform_init(ops, padl.shape) # type: ignore
# TODO: Experiment with whether better to initialize output_W
model.set_param("hidden_W", Wl)
model.set_param("hidden_b", bl)
model.set_param("hidden_pad", padl)
# model = _lsuv_init(model)
return model
class TransitionModelInputs:
"""
Input to transition model.
"""
# dataclass annotation is not yet supported in Cython 0.29.x,
# so, we'll do something close to it.
actions: Optional[List[Ints1d]]
docs: List[Doc]
max_moves: int
moves: TransitionSystem
states: Optional[List[State]]
__slots__ = [
"actions",
"docs",
"max_moves",
"moves",
"states",
]
def __init__(
self,
docs: List[Doc],
moves: TransitionSystem,
actions: Optional[List[Ints1d]]=None,
max_moves: int=0,
states: Optional[List[State]]=None):
"""
actions (Optional[List[Ints1d]]): actions to apply for each Doc.
docs (List[Doc]): Docs to predict transition sequences for.
max_moves (int): the maximum number of moves to apply; values less
than 1 will apply moves to states until they are final states.
moves (TransitionSystem): the transition system to use when predicting
the transition sequences.
states (Optional[List[State]]): the initial states to predict the
transition sequences for. When absent, the initial states are
initialized from the provided Docs.
"""
self.actions = actions
self.docs = docs
self.moves = moves
self.max_moves = max_moves
self.states = states
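A hedged usage sketch (mirroring Parser.predict later in this commit; `parser` and `docs` are assumed to exist and are not defined in this file):

inputs = TransitionModelInputs(docs=docs, moves=parser.moves)
states, _ = parser.model.predict(inputs)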
def forward(model, inputs: TransitionModelInputs, is_train: bool):
docs = inputs.docs
moves = inputs.moves
actions = inputs.actions
beam_width = model.attrs["beam_width"]
hidden_pad = model.get_param("hidden_pad")
tok2vec = model.get_ref("tok2vec")
states = moves.init_batch(docs) if inputs.states is None else inputs.states
tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
seen_mask = _get_seen_mask(model)
if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
# Note: max_moves is only used during training, so we don't need to
# pass it to the greedy inference path.
return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
else:
return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
feats, backprop_feats, seen_mask, is_train, actions=actions,
max_moves=inputs.max_moves)
def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
cdef vector[StateC*] c_states
cdef StateClass state
for state in states:
if not state.is_final():
c_states.push_back(state.c)
weights = _get_c_weights(model, <float*>feats.data, seen_mask)
# Precomputed features have rows for each token, plus one for padding.
cdef int n_tokens = feats.shape[0] - 1
sizes = _get_c_sizes(model, c_states.size(), n_tokens)
cdef CBlas cblas = model.ops.cblas()
scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
def backprop(dY):
raise ValueError(Errors.E4004)
return (states, scores), backprop
cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
cdef int i, j
cdef vector[StateC *] unfinished
cdef ActivationsC activations = _alloc_activations(sizes)
cdef np.ndarray step_scores
cdef np.ndarray step_actions
scores = []
while sizes.states >= 1:
step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
step_actions = actions[0] if actions is not None else None
with nogil:
_predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
if actions is None:
# Validate actions, argmax, take action.
c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
sizes.states)
else:
c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
for i in range(sizes.states):
if not states[i].is_final():
unfinished.push_back(states[i])
for i in range(unfinished.size()):
states[i] = unfinished[i]
sizes.states = unfinished.size()
scores.append(step_scores)
unfinished.clear()
actions = actions[1:] if actions is not None else None
_free_activations(&activations)
return scores
def _forward_fallback(
model: Model,
moves: TransitionSystem,
states: List[StateClass],
tokvecs, backprop_tok2vec,
feats,
backprop_feats,
seen_mask,
is_train: bool,
actions: Optional[List[Ints1d]]=None,
max_moves: int=0):
nF = model.get_dim("nF")
output = model.get_ref("output")
hidden_b = model.get_param("hidden_b")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
beam_width = model.attrs["beam_width"]
beam_density = model.attrs["beam_density"]
ops = model.ops
all_ids = []
all_which = []
all_statevecs = []
all_scores = []
if beam_width == 1:
batch = GreedyBatch(moves, states, None)
else:
batch = _beam_utils.BeamBatch(
moves, states, None, width=beam_width, density=beam_density
)
arange = ops.xp.arange(nF)
n_moves = 0
while not batch.is_done:
ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
for i, state in enumerate(batch.get_unfinished_states()):
state.set_context_tokens(ids, i, nF)
# Sum the state features, add the bias and apply the activation (maxout)
# to create the state vectors.
preacts2f = feats[ids, arange].sum(axis=1) # type: ignore
preacts2f += hidden_b
preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
statevecs, which = ops.maxout(preacts)
# We don't use output's backprop, since we want to backprop for
# all states at once, rather than a single state.
scores = output.predict(statevecs)
scores[:, seen_mask] = ops.xp.nanmin(scores)
# Transition the states, filtering out any that are finished.
cpu_scores = ops.to_numpy(scores)
if actions is None:
batch.advance(cpu_scores)
else:
batch.advance_with_actions(actions[0])
actions = actions[1:]
all_scores.append(scores)
if is_train:
# Remember intermediate results for the backprop.
all_ids.append(ids)
all_statevecs.append(statevecs)
all_which.append(which)
if n_moves >= max_moves >= 1:
break
n_moves += 1
def backprop_parser(d_states_d_scores):
ids = ops.xp.vstack(all_ids)
which = ops.xp.vstack(all_which)
statevecs = ops.xp.vstack(all_statevecs)
_, d_scores = d_states_d_scores
if model.attrs.get("unseen_classes"):
# If we have a negative gradient (i.e. the probability should
# increase) on any classes we filtered out as unseen, mark
# them as seen.
for clas in set(model.attrs["unseen_classes"]):
if (d_scores[:, clas] < 0).any():
model.attrs["unseen_classes"].remove(clas)
d_scores *= seen_mask == False
# Calculate the gradients for the parameters of the output layer.
# The weight gemm is (nS, nO) @ (nS, nH).T
output.inc_grad("b", d_scores.sum(axis=0))
output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
# Now calculate d_statevecs, by backproping through the output linear layer.
# This gemm is (nS, nO) @ (nO, nH)
output_W = output.get_param("W")
d_statevecs = ops.gemm(d_scores, output_W)
# Backprop through the maxout activation
d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
# We don't need to backprop the summation, because we pass back the IDs instead
d_state_features = backprop_feats((d_preacts2f, ids))
d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
ops.scatter_add(d_tokvecs, ids, d_state_features)
model.inc_grad("hidden_pad", d_tokvecs[-1])
return (backprop_tok2vec(d_tokvecs[:-1]), None)
return (list(batch), all_scores), backprop_parser
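A numpy sketch of the scatter_add used in backprop_parser above (illustrative only): per-state feature gradients are accumulated back into per-token rows.

import numpy as np

n_tokens, nF, width = 6, 3, 4
d_tokvecs = np.zeros((n_tokens, width), dtype="f")
ids = np.array([[0, 2, 5], [1, 2, 3]])                       # (nB, nF) token indices
d_state_features = np.random.rand(2, nF, width).astype("f")  # gradient per feature slot
np.add.at(d_tokvecs, ids, d_state_features)                  # same effect as ops.scatter_add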
def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
for class_ in model.attrs.get("unseen_classes", set()):
mask[class_] = True
return mask
def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
W: Floats2d = model.get_param("hidden_W")
nF = model.get_dim("nF")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
# The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH * nP, nI).
W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
W3f = W3f.transpose((1, 0, 2))
W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
assert X.shape == (X.shape[0], nI), X.shape
Yf_ = model.ops.gemm(X, W2f, trans2=True)
Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
def backward(dY_ids: Tuple[Floats3d, Ints2d]):
# This backprop is particularly tricky, because we get back a different
# thing from what we put out. We put out an array of shape:
# (nB, nF, nH, nP), and get back:
# (nB, nH, nP) and ids (nB, nF)
# The ids tell us the values of nF, so we would have:
#
# dYf = zeros((nB, nF, nH, nP))
# for b in range(nB):
# for f in range(nF):
# dYf[b, ids[b, f]] += dY[b]
#
# However, we avoid building that array for efficiency -- and just pass
# in the indices.
dY, ids = dY_ids
dXf = model.ops.gemm(dY, W)
Xf = X[ids].reshape((ids.shape[0], -1))
dW = model.ops.gemm(dY, Xf, trans1=True)
model.inc_grad("hidden_W", dW)
return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
return Yf, backward
def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
if Y is None:
return None
_, scores = Y
if len(scores) == 0:
return None
assert scores[0].shape[0] >= 1
assert len(scores[0].shape) == 2
return scores[0].shape[1]
def _lsuv_init(model: Model):
"""This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
"""
W = model.maybe_get_param("hidden_W")
if W is not None and W.any():
return
nF = model.get_dim("nF")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.ops.alloc4f(nF, nH, nP, nI)
b = model.ops.alloc2f(nH, nP)
pad = model.ops.alloc4f(1, nF, nH, nP)
ops = model.ops
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
pad = normal_init(ops, pad.shape, mean=1.0)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
ids = ops.alloc_f((5000, nF), dtype="f")
ids += ops.xp.random.uniform(0, 1000, ids.shape)
ids = ops.asarray(ids, dtype="i")
tokvecs = ops.alloc_f((5000, nI), dtype="f")
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
tokvecs.shape
)
def predict(ids, tokvecs):
# nS ids. nW tokvecs. Exclude the padding array.
hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
# need nS vectors
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
vectors3f += b
return model.ops.maxout(vectors3f)[0]
tol_var = 0.01
tol_mean = 0.01
t_max = 10
W = cast(Floats4d, model.get_param("hidden_W").copy())
b = cast(Floats2d, model.get_param("hidden_b").copy())
for t_i in range(t_max):
acts1 = predict(ids, tokvecs)
var = model.ops.xp.var(acts1)
mean = model.ops.xp.mean(acts1)
if abs(var - 1.0) >= tol_var:
W /= model.ops.xp.sqrt(var)
model.set_param("hidden_W", W)
elif abs(mean) >= tol_mean:
b -= mean
model.set_param("hidden_b", b)
else:
break
return model
cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
output = model.get_ref("output")
cdef np.ndarray hidden_b = model.get_param("hidden_b")
cdef np.ndarray output_W = output.get_param("W")
cdef np.ndarray output_b = output.get_param("b")
cdef WeightsC weights
weights.feat_weights = feats
weights.feat_bias = <const float*>hidden_b.data
weights.hidden_weights = <const float *> output_W.data
weights.hidden_bias = <const float *> output_b.data
weights.seen_mask = <const int8_t*> seen_mask.data
return weights
cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
cdef SizesC sizes
sizes.states = batch_size
sizes.classes = model.get_dim("nO")
sizes.hiddens = model.get_dim("nH")
sizes.pieces = model.get_dim("nP")
sizes.feats = model.get_dim("nF")
sizes.embed_width = model.get_dim("nI")
sizes.tokens = tokens
return sizes
cdef ActivationsC _alloc_activations(SizesC n) nogil:
cdef ActivationsC A
memset(&A, 0, sizeof(A))
_resize_activations(&A, n)
return A
cdef void _free_activations(const ActivationsC* A) nogil:
free(A.token_ids)
free(A.unmaxed)
free(A.hiddens)
free(A.is_valid)
cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
if n.states <= A._max_size:
A._curr_size = n.states
return
if A._max_size == 0:
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
A._max_size = n.states
else:
A.token_ids = <int*>realloc(A.token_ids,
n.states * n.feats * sizeof(A.token_ids[0]))
A.unmaxed = <float*>realloc(A.unmaxed,
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
A.hiddens = <float*>realloc(A.hiddens,
n.states * n.hiddens * sizeof(A.hiddens[0]))
A.is_valid = <int*>realloc(A.is_valid,
n.states * n.classes * sizeof(A.is_valid[0]))
A._max_size = n.states
A._curr_size = n.states
cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
_resize_activations(A, n)
for i in range(n.states):
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
_sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
for i in range(n.states):
saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
for j in range(n.hiddens):
index = i * n.hiddens * n.pieces + j * n.pieces
which = arg_max(&A.unmaxed[index], n.pieces)
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
if W.hidden_weights == NULL:
memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
else:
# Compute hidden-to-output
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
1.0, <const float *>A.hiddens, n.hiddens,
<const float *>W.hidden_weights, n.hiddens,
0.0, scores, n.classes)
# Add bias
for i in range(n.states):
saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
# Set unseen classes to minimum value
i = 0
min_ = scores[0]
for i in range(1, n.states * n.classes):
if scores[i] < min_:
min_ = scores[i]
for i in range(n.states):
for j in range(n.classes):
if W.seen_mask[j]:
scores[i*n.classes+j] = min_
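A numpy sketch of the per-state maxout step performed in _predict_states above (illustrative only): each hidden unit keeps the largest of its nP pieces.

import numpy as np

n_states, nH, nP = 4, 8, 2
unmaxed = np.random.rand(n_states, nH, nP).astype("f")  # summed features plus bias
hiddens = unmaxed.max(axis=-1)                          # best piece per hidden unit, (n_states, nH)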
cdef void _sum_state_features(CBlas cblas, float* output,
const float* cached, const int* token_ids, SizesC n) nogil:
cdef int idx, b, f, i
cdef const float* feature
cdef int B = n.states
cdef int O = n.hiddens * n.pieces
cdef int F = n.feats
cdef int T = n.tokens
padding = cached + (T * F * O)
cdef int id_stride = F*O
cdef float one = 1.
for b in range(B):
for f in range(F):
if token_ids[f] < 0:
feature = &padding[f*O]
else:
idx = token_ids[f] * id_stride + f*O
feature = &cached[idx]
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
token_ids += F

View File

@ -7,6 +7,7 @@ from cpython.ref cimport PyObject, Py_XDECREF
from ...typedefs cimport hash_t, class_t
from .transition_system cimport TransitionSystem, Transition
from ...errors import Errors
from .batch cimport Batch
from .search cimport Beam, MaxViolation
from .search import MaxViolation
from .stateclass cimport StateC, StateClass
@ -26,7 +27,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
return state.is_final()
cdef class BeamBatch(object):
cdef class BeamBatch(Batch):
cdef public TransitionSystem moves
cdef public object states
cdef public object docs

View File

@ -0,0 +1,2 @@
cdef int arg_max(const float* scores, const int n_classes) nogil
cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil

View File

@ -0,0 +1,22 @@
# cython: infer_types=True
cdef inline int arg_max(const float* scores, const int n_classes) nogil:
if n_classes == 2:
return 0 if scores[0] > scores[1] else 1
cdef int i
cdef int best = 0
cdef float mode = scores[0]
for i in range(1, n_classes):
if scores[i] > mode:
mode = scores[i]
best = i
return best
cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
cdef int best = -1
for i in range(n):
if is_valid[i] >= 1:
if best == -1 or scores[i] > scores[best]:
best = i
return best
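Pure-Python sketches of the two helpers above, for reference only (the Cython versions above are the ones used at parse time):

def py_arg_max(scores):
    best, mode = 0, scores[0]
    for i, s in enumerate(scores[1:], start=1):
        if s > mode:
            mode, best = s, i
    return best

def py_arg_max_if_valid(scores, is_valid):
    best = -1
    for i, (s, ok) in enumerate(zip(scores, is_valid)):
        if ok and (best == -1 or s > scores[best]):
            best = i
    return best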

View File

@ -6,7 +6,6 @@ cimport libcpp
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.set cimport set
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from murmurhash.mrmr cimport hash64
from ...vocab cimport EMPTY_LEXEME
@ -26,7 +25,7 @@ cdef struct ArcC:
cdef cppclass StateC:
int* _heads
vector[int] _heads
const TokenC* _sent
vector[int] _stack
vector[int] _rebuffer
@ -34,31 +33,34 @@ cdef cppclass StateC:
unordered_map[int, vector[ArcC]] _left_arcs
unordered_map[int, vector[ArcC]] _right_arcs
vector[libcpp.bool] _unshiftable
vector[int] history
set[int] _sent_starts
TokenC _empty_token
int length
int offset
int _b_i
__init__(const TokenC* sent, int length) nogil:
__init__(const TokenC* sent, int length) nogil except +:
this._heads.resize(length, -1)
this._unshiftable.resize(length, False)
# Reserve memory ahead of time to minimize allocations during parsing.
# The initial capacity set here ideally reflects the expected average-case/majority usage.
cdef int init_capacity = 32
this._stack.reserve(init_capacity)
this._rebuffer.reserve(init_capacity)
this._ents.reserve(init_capacity)
this._left_arcs.reserve(init_capacity)
this._right_arcs.reserve(init_capacity)
this.history.reserve(init_capacity)
this._sent = sent
this._heads = <int*>calloc(length, sizeof(int))
if not (this._sent and this._heads):
with gil:
PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals()
this.offset = 0
this.length = length
this._b_i = 0
for i in range(length):
this._heads[i] = -1
this._unshiftable.push_back(0)
memset(&this._empty_token, 0, sizeof(TokenC))
this._empty_token.lex = &EMPTY_LEXEME
__dealloc__():
free(this._heads)
void set_context_tokens(int* ids, int n) nogil:
cdef int i, j
if n == 1:
@ -131,19 +133,20 @@ cdef cppclass StateC:
ids[i] = -1
int S(int i) nogil const:
if i >= this._stack.size():
cdef int stack_size = this._stack.size()
if i >= stack_size or i < 0:
return -1
elif i < 0:
return -1
return this._stack.at(this._stack.size() - (i+1))
else:
return this._stack[stack_size - (i+1)]
int B(int i) nogil const:
cdef int buf_size = this._rebuffer.size()
if i < 0:
return -1
elif i < this._rebuffer.size():
return this._rebuffer.at(this._rebuffer.size() - (i+1))
elif i < buf_size:
return this._rebuffer[buf_size - (i+1)]
else:
b_i = this._b_i + (i - this._rebuffer.size())
b_i = this._b_i + (i - buf_size)
if b_i >= this.length:
return -1
else:
@ -242,7 +245,7 @@ cdef cppclass StateC:
return 0
elif this._sent[word].sent_start == 1:
return 1
elif this._sent_starts.count(word) >= 1:
elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
return 1
else:
return 0
@ -327,7 +330,7 @@ cdef cppclass StateC:
if item >= this._unshiftable.size():
return 0
else:
return this._unshiftable.at(item)
return this._unshiftable[item]
void set_reshiftable(int item) nogil:
if item < this._unshiftable.size():
@ -347,6 +350,9 @@ cdef cppclass StateC:
this._heads[child] = head
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
cdef vector[ArcC]* arcs
cdef ArcC* arc
arcs_it = heads_arcs.find(h_i)
if arcs_it == heads_arcs.end():
return
@ -355,12 +361,12 @@ cdef cppclass StateC:
if arcs.size() == 0:
return
arc = arcs.back()
arc = &arcs.back()
if arc.head == h_i and arc.child == c_i:
arcs.pop_back()
else:
for i in range(arcs.size()-1):
arc = arcs.at(i)
arc = &deref(arcs)[i]
if arc.head == h_i and arc.child == c_i:
arc.head = -1
arc.child = -1
@ -400,10 +406,11 @@ cdef cppclass StateC:
this._rebuffer = src._rebuffer
this._sent_starts = src._sent_starts
this._unshiftable = src._unshiftable
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
this._heads = src._heads
this._ents = src._ents
this._left_arcs = src._left_arcs
this._right_arcs = src._right_arcs
this._b_i = src._b_i
this.offset = src.offset
this._empty_token = src._empty_token
this.history = src.history

View File

@ -773,6 +773,8 @@ cdef class ArcEager(TransitionSystem):
return list(arcs)
def has_gold(self, Example eg, start=0, end=None):
if end is not None and end < 0:
end = None
for word in eg.y[start:end]:
if word.dep != 0:
return True
@ -858,6 +860,7 @@ cdef class ArcEager(TransitionSystem):
state.print_state()
)))
action.do(state.c, action.label)
state.c.history.push_back(i)
break
else:
failed = False

View File

@ -0,0 +1,2 @@
cdef class Batch:
pass

View File

@ -0,0 +1,52 @@
from typing import Any
TransitionSystem = Any # TODO
cdef class Batch:
def advance(self, scores):
raise NotImplementedError
def get_states(self):
raise NotImplementedError
@property
def is_done(self):
raise NotImplementedError
def get_unfinished_states(self):
raise NotImplementedError
def __getitem__(self, i):
raise NotImplementedError
def __len__(self):
raise NotImplementedError
class GreedyBatch(Batch):
def __init__(self, moves: TransitionSystem, states, golds):
self._moves = moves
self._states = states
self._next_states = [s for s in states if not s.is_final()]
def advance(self, scores):
self._next_states = self._moves.transition_states(self._next_states, scores)
def advance_with_actions(self, actions):
self._next_states = self._moves.apply_actions(self._next_states, actions)
def get_states(self):
return self._states
@property
def is_done(self):
return all(s.is_final() for s in self._states)
def get_unfinished_states(self):
return [st for st in self._states if not st.is_final()]
def __getitem__(self, i):
return self._states[i]
def __len__(self):
return len(self._states)
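A hedged sketch of how the greedy loop drives a GreedyBatch (mirrors _forward_fallback in tb_framework.pyx; `moves`, `states`, and `score_states` are assumptions for illustration):

batch = GreedyBatch(moves, states, None)
while not batch.is_done:
    unfinished = batch.get_unfinished_states()
    scores = score_states(unfinished)    # float32 scores, shape (n_states, n_classes)
    batch.advance(scores)
final_states = list(batch)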

View File

@ -156,7 +156,7 @@ cdef class BiluoPushDown(TransitionSystem):
if token.ent_type:
labels.add(token.ent_type_)
return labels
def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'
@ -306,6 +306,8 @@ cdef class BiluoPushDown(TransitionSystem):
for span in eg.y.spans.get(neg_key, []):
if span.start >= start and span.end <= end:
return True
if end is not None and end < 0:
end = None
for word in eg.y[start:end]:
if word.ent_iob != 0:
return True
@ -646,7 +648,7 @@ cdef class Unit:
cost += 1
break
return cost
cdef class Out:

View File

@ -20,6 +20,10 @@ cdef class StateClass:
if self._borrowed != 1:
del self.c
@property
def history(self):
return list(self.c.history)
@property
def stack(self):
return [self.S(i) for i in range(self.c.stack_depth())]
@ -176,3 +180,6 @@ cdef class StateClass:
def clone(self, StateClass src):
self.c.clone(src.c)
def set_context_tokens(self, int[:, :] output, int row, int n_feats):
self.c.set_context_tokens(&output[row, 0], n_feats)

View File

@ -53,3 +53,10 @@ cdef class TransitionSystem:
cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
int batch_size) nogil
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
int nr_class, int batch_size) nogil

View File

@ -1,6 +1,8 @@
# cython: infer_types=True
from __future__ import print_function
from cymem.cymem cimport Pool
from libc.stdlib cimport calloc, free
from libcpp.vector cimport vector
from collections import Counter
import srsly
@ -10,6 +12,7 @@ from ...typedefs cimport weight_t, attr_t
from ...tokens.doc cimport Doc
from ...structs cimport TokenC
from .stateclass cimport StateClass
from ._parser_utils cimport arg_max_if_valid
from ...errors import Errors
from ... import util
@ -73,7 +76,18 @@ cdef class TransitionSystem:
offset += len(doc)
return states
def follow_history(self, doc, history):
cdef int clas
cdef StateClass state = StateClass(doc)
for clas in history:
action = self.c[clas]
action.do(state.c, action.label)
state.c.history.push_back(clas)
return state
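A hedged usage sketch for follow_history (`moves`, `doc`, and `parsed_state` are assumptions; the history could come from the StateClass.history property added in this commit):

history = list(parsed_state.history)         # transition ids recorded during parsing
state = moves.follow_history(doc, history)   # replay them on a fresh state for `doc`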
def get_oracle_sequence(self, Example example, _debug=False):
if not self.has_gold(example):
return []
states, golds, _ = self.init_gold_batch([example])
if not states:
return []
@ -85,6 +99,8 @@ cdef class TransitionSystem:
return self.get_oracle_sequence_from_state(state, gold)
def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
if state.is_final():
return []
cdef Pool mem = Pool()
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.n_moves > 0
@ -110,6 +126,7 @@ cdef class TransitionSystem:
"S0 head?", str(state.has_head(state.S(0))),
)))
action.do(state.c, action.label)
state.c.history.push_back(i)
break
else:
if _debug:
@ -137,6 +154,28 @@ cdef class TransitionSystem:
raise ValueError(Errors.E170.format(name=name))
action = self.lookup_transition(name)
action.do(state.c, action.label)
state.c.history.push_back(action.clas)
def apply_actions(self, states, const int[::1] actions):
assert len(states) == actions.shape[0]
cdef StateClass state
cdef vector[StateC*] c_states
c_states.resize(len(states))
cdef int i
for (i, state) in enumerate(states):
c_states[i] = state.c
c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
return [state for state in states if not state.c.is_final()]
def transition_states(self, states, float[:, ::1] scores):
assert len(states) == scores.shape[0]
cdef StateClass state
cdef float* c_scores = &scores[0, 0]
cdef vector[StateC*] c_states
for state in states:
c_states.push_back(state.c)
c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
return [state for state in states if not state.c.is_final()]
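A hedged sketch of the two batch helpers above (`moves`, `docs`, `score_states`, and `step_actions` are assumptions for illustration):

states = moves.init_batch(docs)
# Score-driven stepping (greedy decoding):
scores = score_states(states)                        # float32, shape (n_states, n_moves)
states = moves.transition_states(states, scores)
# Or force one known action per state (e.g. replaying a student's choices):
states = moves.apply_actions(states, step_actions)   # int32 array, one action id per state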
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
@ -250,3 +289,35 @@ cdef class TransitionSystem:
self.cfg.update(msg['cfg'])
self.initialize_actions(labels)
return self
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
int batch_size) nogil:
cdef int i
cdef Transition action
cdef StateC* state
for i in range(batch_size):
state = states[i]
action = moves.c[actions[i]]
action.do(state, action.label)
state.history.push_back(action.clas)
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
int nr_class, int batch_size) nogil:
is_valid = <int*>calloc(moves.n_moves, sizeof(int))
cdef int i, guess
cdef Transition action
for i in range(batch_size):
moves.set_valid(is_valid, states[i])
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
if guess == -1:
# This shouldn't happen, but it's hard to raise an error here,
# and we don't want an infinite loop. So, force the state to be final.
states[i].force_final()
else:
action = moves.c[guess]
action.do(states[i], action.label)
states[i].history.push_back(guess)
free(is_valid)

View File

@ -4,8 +4,8 @@ from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.arc_eager cimport ArcEager
from .transition_parser import Parser
from ._parser_internals.arc_eager import ArcEager
from .functions import merge_subtokens
from ..language import Language
@ -18,12 +18,11 @@ from ..util import registry
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
@ -123,6 +122,7 @@ def make_parser(
scorer=scorer,
)
@Language.factory(
"beam_parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@ -228,6 +228,7 @@ def parser_score(examples, **kwargs):
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
@ -235,8 +236,11 @@ def parser_score(examples, **kwargs):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
results.update(
Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
)
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
@ -249,11 +253,12 @@ def make_parser_scorer():
return parser_score
cdef class DependencyParser(Parser):
class DependencyParser(Parser):
"""Pipeline component for dependency parsing.
DOCS: https://spacy.io/api/dependencyparser
"""
TransitionSystem = ArcEager
def __init__(
@ -273,8 +278,7 @@ cdef class DependencyParser(Parser):
incorrect_spans_key=None,
scorer=parser_score,
):
"""Create a DependencyParser.
"""
"""Create a DependencyParser."""
super().__init__(
vocab,
model,

View File

@ -155,6 +155,25 @@ class EditTreeLemmatizer(TrainablePipe):
return float(loss), d_scores
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
"""
loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):

View File

View File

@ -4,22 +4,22 @@ from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown
from .transition_parser import Parser
from ._parser_internals.ner import BiluoPushDown
from ..language import Language
from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples
from ..util import registry
from ..training import remove_bilu_prefix
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
@ -44,8 +44,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)
def make_ner(
nlp: Language,
@ -98,6 +102,7 @@ def make_ner(
scorer=scorer,
)
@Language.factory(
"beam_ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@ -111,7 +116,12 @@ def make_ner(
"incorrect_spans_key": None,
"scorer": None,
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)
def make_beam_ner(
nlp: Language,
@ -185,11 +195,12 @@ def make_ner_scorer():
return ner_score
cdef class EntityRecognizer(Parser):
class EntityRecognizer(Parser):
"""Pipeline component for named entity recognition.
DOCS: https://spacy.io/api/entityrecognizer
"""
TransitionSystem = BiluoPushDown
def __init__(
@ -207,15 +218,14 @@ cdef class EntityRecognizer(Parser):
incorrect_spans_key=None,
scorer=ner_score,
):
"""Create an EntityRecognizer.
"""
"""Create an EntityRecognizer."""
super().__init__(
vocab,
model,
name,
moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
min_action_freq=1, # not relevant for NER
min_action_freq=1, # not relevant for NER
learn_tokens=False, # not relevant for NER
beam_width=beam_width,
beam_density=beam_density,

View File

@ -87,6 +87,10 @@ cdef class Pipe:
return self.scorer(examples, **scorer_kwargs)
return {}
@property
def is_distillable(self) -> bool:
return False
@property
def is_trainable(self) -> bool:
return False

View File

@ -13,6 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..scorer import Scorer, get_ner_prf
from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from .. import util
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
@ -28,6 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
"overwrite_ents": False,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
"ent_id_sep": "__unused__",
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
},
default_score_weights={
"ents_f": 1.0,
@ -40,6 +42,7 @@ def make_entity_ruler(
nlp: Language,
name: str,
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite_ents: bool,
scorer: Optional[Callable],
@ -57,6 +60,7 @@ def make_entity_ruler(
annotate_ents=True,
ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite=False,
scorer=scorer,
@ -81,6 +85,7 @@ def make_entity_ruler_scorer():
"annotate_ents": False,
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite": True,
"scorer": {
@ -103,6 +108,7 @@ def make_span_ruler(
annotate_ents: bool,
ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite: bool,
scorer: Optional[Callable],
@ -115,6 +121,7 @@ def make_span_ruler(
annotate_ents=annotate_ents,
ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite=overwrite,
scorer=scorer,
@ -179,7 +186,7 @@ def prioritize_existing_ents_filter(
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
def make_preverse_existing_ents_filter():
def make_preserve_existing_ents_filter():
return prioritize_existing_ents_filter
@ -225,6 +232,7 @@ class SpanRuler(Pipe):
[Iterable[Span], Iterable[Span]], Iterable[Span]
] = util.filter_chain_spans,
phrase_matcher_attr: Optional[Union[int, str]] = None,
matcher_fuzzy_compare: Callable = levenshtein_compare,
validate: bool = False,
overwrite: bool = False,
scorer: Optional[Callable] = partial(
@ -255,6 +263,9 @@ class SpanRuler(Pipe):
phrase_matcher_attr (Optional[Union[int, str]]): Token attribute to
match on, passed to the internal PhraseMatcher as `attr`. Defaults
to `None`.
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
internal Matcher. Defaults to
spacy.matcher.levenshtein.levenshtein_compare.
validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate`.
overwrite (bool): Whether to remove any existing spans under this spans
@ -275,6 +286,7 @@ class SpanRuler(Pipe):
self.spans_filter = spans_filter
self.ents_filter = ents_filter
self.scorer = scorer
self.matcher_fuzzy_compare = matcher_fuzzy_compare
self._match_label_id_map: Dict[int, Dict[str, str]] = {}
self.clear()
@ -460,7 +472,11 @@ class SpanRuler(Pipe):
DOCS: https://spacy.io/api/spanruler#clear
"""
self._patterns: List[PatternType] = []
self.matcher: Matcher = Matcher(self.nlp.vocab, validate=self.validate)
self.matcher: Matcher = Matcher(
self.nlp.vocab,
validate=self.validate,
fuzzy_compare=self.matcher_fuzzy_compare,
)
self.phrase_matcher: PhraseMatcher = PhraseMatcher(
self.nlp.vocab,
attr=self.phrase_matcher_attr,

View File

@ -1,5 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Callable, Dict, Iterable, List, Optional, Union
from typing import Tuple
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, Config
@ -245,7 +246,6 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger#rehearse
"""
loss_func = LegacySequenceCategoricalCrossentropy()
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
@ -259,12 +259,32 @@ class Tagger(TrainablePipe):
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update(docs)
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
grads, loss = loss_func(tag_scores, tutor_tag_scores)
loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
bp_tag_scores(grads)
self.finish_update(sgd)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
"""
loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.

View File

@ -77,7 +77,7 @@ subword_features = true
default_config={
"threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
"save_activations": False,
},
default_score_weights={
@ -130,7 +130,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
)
@registry.scorers("spacy.textcat_scorer.v1")
@registry.scorers("spacy.textcat_scorer.v2")
def make_textcat_scorer():
return textcat_score

View File

@ -74,7 +74,7 @@ subword_features = true
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
"save_activations": False,
},
default_score_weights={
@ -127,7 +127,7 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str,
)
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
@registry.scorers("spacy.textcat_multilabel_scorer.v2")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score

View File

@ -6,7 +6,7 @@ import warnings
from ..tokens.doc cimport Doc
from ..training import validate_examples
from ..training import validate_examples, validate_distillation_examples
from ..errors import Errors, Warnings
from .pipe import Pipe, deserialize_config
from .. import util
@ -56,6 +56,53 @@ cdef class TrainablePipe(Pipe):
except Exception as e:
error_handler(self.name, self, [doc], e)
def distill(self,
teacher_pipe: Optional["TrainablePipe"],
examples: Iterable["Example"],
*,
drop: float=0.0,
sgd: Optional[Optimizer]=None,
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is typically trained on the probability
distribution of the teacher, but details may differ per pipe.
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
from.
examples (Iterable[Example]): Distillation examples. The reference
and predicted docs must have the same number of tokens and the
same orthography.
drop (float): dropout rate.
sgd (Optional[Optimizer]): An optimizer. Will be created via
create_optimizer if not set.
losses (Optional[Dict[str, float]]): Optional record of loss during
distillation.
RETURNS: The updated losses dictionary.
DOCS: https://spacy.io/api/pipe#distill
"""
# By default we require a teacher pipe, but there are downstream
# implementations that don't require a pipe.
if teacher_pipe is None:
raise ValueError(Errors.E4002.format(name=self.name))
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_distillation_examples(examples, "TrainablePipe.distill")
set_dropout_rate(self.model, drop)
for node in teacher_pipe.model.walk():
if node.name == "softmax":
node.attrs["softmax_normalize"] = True
teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
bp_student_scores(d_scores)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
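A hedged usage sketch for the distill API above (none of the names below come from this diff; the pipelines, the optimizer, and `distill_examples` are assumptions):

import spacy
from spacy.util import minibatch

teacher_nlp = spacy.load("teacher_model")        # assumed teacher pipeline
student_nlp = spacy.load("student_model")        # assumed student pipeline
teacher_tagger = teacher_nlp.get_pipe("tagger")
student_tagger = student_nlp.get_pipe("tagger")

optimizer = student_tagger.create_optimizer()
losses = {}
for batch in minibatch(distill_examples, size=32):   # distill_examples: List[Example], assumed
    student_tagger.distill(teacher_tagger, batch, sgd=optimizer, losses=losses)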
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
@ -169,6 +216,19 @@ cdef class TrainablePipe(Pipe):
"""
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
def get_teacher_student_loss(self, teacher_scores, student_scores):
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
"""
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
def create_optimizer(self) -> Optimizer:
"""Create an optimizer for the pipeline component.
@ -205,6 +265,14 @@ cdef class TrainablePipe(Pipe):
"""
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
@property
def is_distillable(self) -> bool:
# Normally a pipe overrides `get_teacher_student_loss` to implement
# distillation. In more exceptional cases, a pipe can provide its
# own `distill` implementation. If neither of these methods is
# overridden, the pipe does not implement distillation.
return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
@property
def is_trainable(self) -> bool:
return True

View File

@ -1,21 +0,0 @@
from cymem.cymem cimport Pool
from thinc.backends.cblas cimport CBlas
from ..vocab cimport Vocab
from .trainable_pipe cimport TrainablePipe
from ._parser_internals.transition_system cimport Transition, TransitionSystem
from ._parser_internals._state cimport StateC
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
cdef class Parser(TrainablePipe):
cdef public object _rehearsal_model
cdef readonly TransitionSystem moves
cdef public object _multitasks
cdef object _cpu_ops
cdef void _parseC(self, CBlas cblas, StateC** states,
WeightsC weights, SizesC sizes) nogil
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil

View File

@ -1,5 +1,6 @@
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
from __future__ import print_function
from typing import Dict, Iterable, List, Optional, Tuple
from cymem.cymem cimport Pool
cimport numpy as np
from itertools import islice
@ -7,25 +8,30 @@ from libcpp.vector cimport vector
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free
import random
import contextlib
import srsly
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
from thinc.api import chain, softmax_activation, use_ops, get_array_module
from thinc.legacy import LegacySequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d
import numpy.random
import numpy
import warnings
from ._parser_internals.stateclass cimport StateClass
from ..ml.tb_framework import TransitionModelInputs
from ._parser_internals.stateclass cimport StateC, StateClass
from ._parser_internals.search cimport Beam
from ..ml.parser_model cimport alloc_activations, free_activations
from ..ml.parser_model cimport predict_states, arg_max_if_valid
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ..ml.parser_model cimport get_c_weights, get_c_sizes
from ..tokens.doc cimport Doc
from .trainable_pipe import TrainablePipe
from .trainable_pipe cimport TrainablePipe
from ._parser_internals cimport _beam_utils
from ._parser_internals import _beam_utils
from ..vocab cimport Vocab
from ._parser_internals.transition_system cimport Transition, TransitionSystem
from ..typedefs cimport weight_t
from ..training import validate_examples, validate_get_examples
from ..training import validate_distillation_examples
from ..errors import Errors, Warnings
from .. import util
@ -33,7 +39,7 @@ from .. import util
NUMPY_OPS = NumpyOps()
cdef class Parser(TrainablePipe):
class Parser(TrainablePipe):
"""
Base class of the DependencyParser and EntityRecognizer.
"""
@ -133,8 +139,9 @@ cdef class Parser(TrainablePipe):
@property
def move_names(self):
names = []
cdef TransitionSystem moves = self.moves
for i in range(self.moves.n_moves):
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
# Explicitly removing the internal "U-" token used for blocking entities
if name != "U-":
names.append(name)
@ -203,6 +210,118 @@ cdef class Parser(TrainablePipe):
# Defined in subclasses, to avoid circular import
raise NotImplementedError
def distill(self,
teacher_pipe: Optional[TrainablePipe],
examples: Iterable["Example"],
*,
drop: float=0.0,
sgd: Optional[Optimizer]=None,
losses: Optional[Dict[str, float]]=None):
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is trained on the transition probabilities
of the teacher.
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
from.
examples (Iterable[Example]): Distillation examples. The reference
and predicted docs must have the same number of tokens and the
same orthography.
drop (float): dropout rate.
sgd (Optional[Optimizer]): An optimizer. Will be created via
create_optimizer if not set.
losses (Optional[Dict[str, float]]): Optional record of loss during
distillation.
RETURNS: The updated losses dictionary.
DOCS: https://spacy.io/api/dependencyparser#distill
"""
if teacher_pipe is None:
raise ValueError(Errors.E4002.format(name=self.name))
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_distillation_examples(examples, "TransitionParser.distill")
set_dropout_rate(self.model, drop)
student_docs = [eg.predicted for eg in examples]
max_moves = self.cfg["update_with_oracle_cut_size"]
if max_moves >= 1:
# Chop sequences into lengths of this many words, to make the
# batch uniform length. Since we do not have a gold standard
# sequence, we use the teacher's predictions as the gold
# standard.
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
states = self._init_batch(teacher_pipe, student_docs, max_moves)
else:
states = self.moves.init_batch(student_docs)
# We distill as follows: (1) we first let the student predict transition
# sequences (and the corresponding transition probabilities); (2) we
# let the teacher follow the student's predicted transition sequences
# to obtain the teacher's transition probabilities; (3) we compute the
# gradients of the student's transition distributions relative to the
# teacher's distributions.
student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
max_moves=max_moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = states2actions(student_states)
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
moves=self.moves, actions=actions)
(_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
backprop_scores((student_states, d_scores))
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
normalize: bool=False,
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
"""
# We can't easily hook up a softmax layer in the parsing model, since
# the get_loss does additional masking. So, we could apply softmax
# manually here and use Thinc's cross-entropy loss. But it's a bit
# suboptimal, since we can have a lot of states that would result in
many kernel launches. Furthermore, the parsing model's backprop expects
an XP array, so we'd have to concat the softmaxes anyway. So, like
# the get_loss implementation, we'll compute the loss and gradients
# ourselves.
teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
axis=-1, inplace=True)
student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
axis=-1, inplace=True)
assert teacher_scores.shape == student_scores.shape
d_scores = student_scores - teacher_scores
if normalize:
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum() / d_scores.size
return float(loss), d_scores
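A numpy sketch of the gradient computed above (illustrative only; the shapes are toy values):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

teacher = softmax(np.random.rand(4, 7).astype("f"))  # stacked teacher transition scores
student = softmax(np.random.rand(4, 7).astype("f"))  # stacked student transition scores
d_scores = student - teacher                         # gradient w.r.t. the student scores
loss = (d_scores ** 2).sum() / d_scores.size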
def init_multitask_objectives(self, get_examples, pipeline, **cfg):
"""Setup models for secondary objectives, to benefit from multi-task
learning. This method is intended to be overridden by subclasses.
@ -223,9 +342,6 @@ cdef class Parser(TrainablePipe):
stream: The sequence of documents to process.
batch_size (int): Number of documents to accumulate into a working set.
error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
deals with a failing batch of documents. The default function just reraises
the exception.
YIELDS (Doc): Documents, in order.
"""
@ -247,78 +363,29 @@ cdef class Parser(TrainablePipe):
def predict(self, docs):
if isinstance(docs, Doc):
docs = [docs]
self._ensure_labels_are_added(docs)
if not any(len(doc) for doc in docs):
result = self.moves.init_batch(docs)
return result
if self.cfg["beam_width"] == 1:
return self.greedy_parse(docs, drop=0.0)
else:
return self.beam_parse(
docs,
drop=0.0,
beam_width=self.cfg["beam_width"],
beam_density=self.cfg["beam_density"]
)
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
states_or_beams, _ = self.model.predict(inputs)
return states_or_beams
def greedy_parse(self, docs, drop=0.):
cdef vector[StateC*] states
cdef StateClass state
cdef CBlas cblas = self._cpu_ops.cblas()
self._resize()
self._ensure_labels_are_added(docs)
set_dropout_rate(self.model, drop)
batch = self.moves.init_batch(docs)
model = self.model.predict(docs)
weights = get_c_weights(model)
for state in batch:
if not state.is_final():
states.push_back(state.c)
sizes = get_c_sizes(model, states.size())
with nogil:
self._parseC(cblas, &states[0], weights, sizes)
model.clear_memory()
del model
return batch
with _change_attrs(self.model, beam_width=1):
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
states, _ = self.model.predict(inputs)
return states
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
cdef Beam beam
cdef Doc doc
self._ensure_labels_are_added(docs)
batch = _beam_utils.BeamBatch(
self.moves,
self.moves.init_batch(docs),
None,
beam_width,
density=beam_density
)
model = self.model.predict(docs)
while not batch.is_done:
states = batch.get_unfinished_states()
if not states:
break
scores = model.predict(states)
batch.advance(scores)
model.clear_memory()
del model
return list(batch)
cdef void _parseC(self, CBlas cblas, StateC** states,
WeightsC weights, SizesC sizes) nogil:
cdef int i, j
cdef vector[StateC*] unfinished
cdef ActivationsC activations = alloc_activations(sizes)
while sizes.states >= 1:
predict_states(cblas, &activations, states, &weights, sizes)
# Validate actions, argmax, take action.
self.c_transition_batch(states,
activations.scores, sizes.classes, sizes.states)
for i in range(sizes.states):
if not states[i].is_final():
unfinished.push_back(states[i])
for i in range(unfinished.size()):
states[i] = unfinished[i]
sizes.states = unfinished.size()
unfinished.clear()
free_activations(&activations)
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
beams, _ = self.model.predict(inputs)
return beams
def set_annotations(self, docs, states_or_beams):
cdef StateClass state
@ -330,35 +397,6 @@ cdef class Parser(TrainablePipe):
for hook in self.postprocesses:
hook(doc)
def transition_states(self, states, float[:, ::1] scores):
cdef StateClass state
cdef float* c_scores = &scores[0, 0]
cdef vector[StateC*] c_states
for state in states:
c_states.push_back(state.c)
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
return [state for state in states if not state.c.is_final()]
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil:
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
with gil:
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
cdef int i, guess
cdef Transition action
for i in range(batch_size):
self.moves.set_valid(is_valid, states[i])
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
if guess == -1:
# This shouldn't happen, but it's hard to raise an error here,
# and we don't want to infinite loop. So, force to end state.
states[i].force_final()
else:
action = self.moves.c[guess]
action.do(states[i], action.label)
free(is_valid)
def update(self, examples, *, drop=0., sgd=None, losses=None):
cdef StateClass state
if losses is None:
@ -370,67 +408,99 @@ cdef class Parser(TrainablePipe):
)
for multitask in self._multitasks:
multitask.update(examples, drop=drop, sgd=sgd)
# We need to take care to act on the whole batch, because we might be
# getting vectors via a listener.
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
if n_examples == 0:
return losses
set_dropout_rate(self.model, drop)
# The probability we use beam update, instead of falling back to
# a greedy update
beam_update_prob = self.cfg["beam_update_prob"]
if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
return self.update_beam(
examples,
beam_width=self.cfg["beam_width"],
sgd=sgd,
losses=losses,
beam_density=self.cfg["beam_density"]
)
docs = [eg.x for eg in examples if len(eg.x)]
max_moves = self.cfg["update_with_oracle_cut_size"]
if max_moves >= 1:
# Chop sequences into lengths of this many words, to make the
# batch uniform length.
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
states, golds, _ = self._init_gold_batch(
max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
init_states, gold_states, _ = self._init_gold_batch(
examples,
max_length=max_moves
)
else:
states, golds, _ = self.moves.init_gold_batch(examples)
if not states:
return losses
model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
all_states = list(states)
states_golds = list(zip(states, golds))
n_moves = 0
while states_golds:
states, golds = zip(*states_golds)
scores, backprop = model.begin_update(states)
d_scores = self.get_batch_loss(states, golds, scores, losses)
# Note that the gradient isn't normalized by the batch size
# here, because our "samples" are really the states...But we
# can't normalize by the number of states either, as then we'd
# be getting smaller gradients for states in long sequences.
backprop(d_scores)
# Follow the predicted action
self.transition_states(states, scores)
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
if max_moves >= 1 and n_moves >= max_moves:
break
n_moves += 1
init_states, gold_states, _ = self.moves.init_gold_batch(examples)
backprop_tok2vec(golds)
inputs = TransitionModelInputs(docs=docs, moves=self.moves,
max_moves=max_moves, states=[state.copy() for state in init_states])
(pred_states, scores), backprop_scores = self.model.begin_update(inputs)
if sum(s.shape[0] for s in scores) == 0:
return losses
d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
examples, max_moves)
backprop_scores((pred_states, d_scores))
if sgd not in (None, False):
self.finish_update(sgd)
losses[self.name] += float((d_scores**2).sum())
# Ugh, this is annoying. If we're working on GPU, we want to free the
# memory ASAP. It seems that Python doesn't necessarily get around to
# removing these in time if we don't explicitly delete? It's confusing.
del backprop
del backprop_tok2vec
model.clear_memory()
del model
del backprop_scores
return losses
def get_loss(self, states_scores, examples, max_moves):
gold_states, init_states, pred_states, scores = states_scores
scores = self.model.ops.xp.vstack(scores)
costs = self._get_costs_from_histories(
examples,
gold_states,
init_states,
[list(state.history) for state in pred_states],
max_moves
)
xp = get_array_module(scores)
best_costs = costs.min(axis=1, keepdims=True)
gscores = scores.copy()
min_score = scores.min() - 1000
assert costs.shape == scores.shape, (costs.shape, scores.shape)
gscores[costs > best_costs] = min_score
max_ = scores.max(axis=1, keepdims=True)
gmax = gscores.max(axis=1, keepdims=True)
exp_scores = xp.exp(scores - max_)
exp_gscores = xp.exp(gscores - gmax)
Z = exp_scores.sum(axis=1, keepdims=True)
gZ = exp_gscores.sum(axis=1, keepdims=True)
d_scores = exp_scores / Z
d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
return d_scores
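# Self-contained NumPy sketch (illustration only) of the gradient computed in
# get_loss above: the softmax over all actions minus a softmax restricted to
# the minimum-cost ("gold") actions of each state.
import numpy as np

scores = np.array([[2.0, 1.0, 0.5], [0.2, 0.7, 1.5]], dtype="f")  # per-state action scores
costs = np.array([[0.0, 1.0, 1.0], [2.0, 0.0, 0.0]], dtype="f")   # oracle cost of each action
best_costs = costs.min(axis=1, keepdims=True)
gscores = scores.copy()
gscores[costs > best_costs] = scores.min() - 1000                 # mask out non-gold actions
exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
exp_gscores = np.exp(gscores - gscores.max(axis=1, keepdims=True))
d_scores = exp_scores / exp_scores.sum(axis=1, keepdims=True)
d_scores -= (costs <= best_costs) * (exp_gscores / exp_gscores.sum(axis=1, keepdims=True))
print(d_scores)  # each row sums to ~0: probability mass is pushed onto the min-cost actions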
def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
cdef TransitionSystem moves = self.moves
cdef StateClass state
cdef int clas
cdef int nF = self.model.get_dim("nF")
cdef int nO = moves.n_moves
cdef int nS = sum([len(history) for history in histories])
cdef Pool mem = Pool()
cdef np.ndarray costs_i
is_valid = <int*>mem.alloc(nO, sizeof(int))
batch = list(zip(init_states, histories, gold_states))
n_moves = 0
output = []
while batch:
costs = numpy.zeros((len(batch), nO), dtype="f")
for i, (state, history, gold) in enumerate(batch):
costs_i = costs[i]
clas = history.pop(0)
moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
action = moves.c[clas]
action.do(state.c, action.label)
state.c.history.push_back(clas)
output.append(costs)
batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
if n_moves >= max_moves >= 1:
break
n_moves += 1
return self.model.ops.xp.vstack(output)
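# Simplified pure-Python sketch (illustration only) of the replay loop in
# _get_costs_from_histories: step through each predicted history, emit one cost
# row per surviving state per step, and drop states whose history is exhausted.
# toy_costs is a hypothetical stand-in for moves.set_costs().
import numpy as np

def toy_costs(n_actions):
    # hypothetical oracle: action 0 is always the zero-cost choice
    return np.array([0.0] + [1.0] * (n_actions - 1), dtype="f")

n_actions = 3
histories = [[0, 2, 1], [1, 0]]     # predicted action sequences, one per state
batch = [list(h) for h in histories]
output = []
while batch:
    costs = np.zeros((len(batch), n_actions), dtype="f")
    for i, history in enumerate(batch):
        costs[i] = toy_costs(n_actions)
        history.pop(0)              # follow the student's predicted action
    output.append(costs)
    batch = [h for h in batch if h]
print(np.vstack(output).shape)      # (5, 3): one row per (state, step), as in the real method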
def rehearse(self, examples, sgd=None, losses=None, **cfg):
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
if losses is None:
@ -440,10 +510,9 @@ cdef class Parser(TrainablePipe):
multitask.rehearse(examples, losses=losses, sgd=sgd)
if self._rehearsal_model is None:
return None
losses.setdefault(self.name, 0.)
losses.setdefault(self.name, 0.0)
validate_examples(examples, "Parser.rehearse")
docs = [eg.predicted for eg in examples]
states = self.moves.init_batch(docs)
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
@ -451,85 +520,33 @@ cdef class Parser(TrainablePipe):
# Prepare the stepwise model, and get the callback for finishing the batch
set_dropout_rate(self._rehearsal_model, 0.0)
set_dropout_rate(self.model, 0.0)
tutor, _ = self._rehearsal_model.begin_update(docs)
model, backprop_tok2vec = self.model.begin_update(docs)
n_scores = 0.
loss = 0.
while states:
targets, _ = tutor.begin_update(states)
guesses, backprop = model.begin_update(states)
d_scores = (guesses - targets) / targets.shape[0]
# If all weights for an output are 0 in the original model, don't
# supervise that output. This allows us to add classes.
loss += (d_scores**2).sum()
backprop(d_scores)
# Follow the predicted action
self.transition_states(states, guesses)
states = [state for state in states if not state.is_final()]
n_scores += d_scores.size
# Do the backprop
backprop_tok2vec(docs)
student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = states2actions(student_states)
teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
_, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
teacher_scores = self.model.ops.xp.vstack(teacher_scores)
student_scores = self.model.ops.xp.vstack(student_scores)
assert teacher_scores.shape == student_scores.shape
d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
# If all weights for an output are 0 in the original model, don't
# supervise that output. This allows us to add classes.
loss = (d_scores**2).sum() / d_scores.size
backprop_scores((student_states, d_scores))
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss / n_scores
del backprop
del backprop_tok2vec
model.clear_memory()
tutor.clear_memory()
del model
del tutor
losses[self.name] += loss
return losses
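# Toy NumPy sketch (illustration only) of the mean-squared-error style
# rehearsal gradient shown above; the exact normalisation inside
# get_teacher_student_loss may differ, this mirrors the inline variant.
import numpy as np

teacher_scores = np.array([[2.0, 0.5, 0.1], [0.3, 1.2, 0.4]], dtype="f")
student_scores = np.array([[1.5, 0.7, 0.2], [0.5, 1.0, 0.6]], dtype="f")
d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]  # gradient wrt student scores
loss = float((d_scores ** 2).sum() / d_scores.size)
print(loss)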
def update_beam(self, examples, *, beam_width,
drop=0., sgd=None, losses=None, beam_density=0.0):
states, golds, _ = self.moves.init_gold_batch(examples)
if not states:
return losses
# Prepare the stepwise model, and get the callback for finishing the batch
model, backprop_tok2vec = self.model.begin_update(
[eg.predicted for eg in examples])
loss = _beam_utils.update_beam(
self.moves,
states,
golds,
model,
beam_width,
beam_density=beam_density,
)
losses[self.name] += loss
backprop_tok2vec(golds)
if sgd is not None:
self.finish_update(sgd)
def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
cdef StateClass state
cdef Pool mem = Pool()
cdef int i
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
dtype='f', order='C')
c_d_scores = <float*>d_scores.data
unseen_classes = self.model.attrs["unseen_classes"]
for i, (state, gold) in enumerate(zip(states, golds)):
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
memset(costs, 0, self.moves.n_moves * sizeof(float))
self.moves.set_costs(is_valid, costs, state.c, gold)
for j in range(self.moves.n_moves):
if costs[j] <= 0.0 and j in unseen_classes:
unseen_classes.remove(j)
cpu_log_loss(c_d_scores,
costs, is_valid, &scores[i, 0], d_scores.shape[1])
c_d_scores += d_scores.shape[1]
# Note that we don't normalize this. See comment in update() for why.
if losses is not None:
losses.setdefault(self.name, 0.)
losses[self.name] += (d_scores**2).sum()
return d_scores
raise NotImplementedError
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
@ -568,7 +585,7 @@ cdef class Parser(TrainablePipe):
for example in islice(get_examples(), 10):
doc_sample.append(example.predicted)
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(doc_sample)
self.model.initialize((doc_sample, self.moves))
if nlp is not None:
self.init_multitask_objectives(get_examples, nlp.pipeline)
@ -625,28 +642,63 @@ cdef class Parser(TrainablePipe):
raise ValueError(Errors.E149) from None
return self
def _init_gold_batch(self, examples, max_length):
"""Make a square batch, of length equal to the shortest transition
def _init_batch(self, teacher_step_model, docs, max_length):
"""Make a square batch of length equal to the shortest transition
sequence or a cap. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
where N is the shortest doc. We'll make two states, one representing
long_doc[:N], and another representing long_doc[N:]."""
long_doc[:N], and another representing long_doc[N:]. In contrast to
_init_gold_batch, this version uses a teacher model to generate the
cut sequences."""
cdef:
StateClass start_state
StateClass state
Transition action
all_states = self.moves.init_batch([eg.predicted for eg in examples])
all_states = self.moves.init_batch(docs)
states = []
to_cut = []
for state, doc in zip(all_states, docs):
if not state.is_final():
if len(doc) < max_length:
states.append(state)
else:
to_cut.append(state)
while to_cut:
states.extend(state.copy() for state in to_cut)
# Move states forward max_length actions.
length = 0
while to_cut and length < max_length:
teacher_scores = teacher_step_model.predict(to_cut)
self.transition_states(to_cut, teacher_scores)
# States that are completed do not need further cutting.
to_cut = [state for state in to_cut if not state.is_final()]
length += 1
return states
def _init_gold_batch(self, examples, max_length):
"""Make a square batch, of length equal to the shortest transition
sequence or a cap. A long doc will get multiple states. Let's say we
have a doc of length 2*N, where N is the shortest doc. We'll make
two states, one representing long_doc[:N], and another representing
long_doc[N:]."""
cdef:
StateClass start_state
StateClass state
Transition action
TransitionSystem moves = self.moves
all_states = moves.init_batch([eg.predicted for eg in examples])
states = []
golds = []
to_cut = []
for state, eg in zip(all_states, examples):
if self.moves.has_gold(eg) and not state.is_final():
gold = self.moves.init_gold(state, eg)
if moves.has_gold(eg) and not state.is_final():
gold = moves.init_gold(state, eg)
if len(eg.x) < max_length:
states.append(state)
golds.append(gold)
else:
oracle_actions = self.moves.get_oracle_sequence_from_state(
oracle_actions = moves.get_oracle_sequence_from_state(
state.copy(), gold)
to_cut.append((eg, state, gold, oracle_actions))
if not to_cut:
@ -656,13 +708,52 @@ cdef class Parser(TrainablePipe):
for i in range(0, len(oracle_actions), max_length):
start_state = state.copy()
for clas in oracle_actions[i:i+max_length]:
action = self.moves.c[clas]
action = moves.c[clas]
action.do(state.c, action.label)
if state.is_final():
break
if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
if moves.has_gold(eg, start_state.B(0), state.B(0)):
states.append(start_state)
golds.append(gold)
if state.is_final():
break
return states, golds, max_length
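# Toy sketch (not from the diff) of how _init_gold_batch "squares" the batch:
# a long oracle action sequence is cut into segments of at most max_length
# actions, and the state at the start of each segment becomes its own training
# example.
oracle_actions = list(range(13))   # pretend transition sequence for one long doc
max_length = 5
segments = [oracle_actions[i:i + max_length] for i in range(0, len(oracle_actions), max_length)]
print([len(seg) for seg in segments])  # [5, 5, 3] -> three states for a single doc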
@contextlib.contextmanager
def _change_attrs(model, **kwargs):
"""Temporarily modify a thinc model's attributes."""
unset = object()
old_attrs = {}
for key, value in kwargs.items():
old_attrs[key] = model.attrs.get(key, unset)
model.attrs[key] = value
yield model
for key, value in old_attrs.items():
if value is unset:
model.attrs.pop(key)
else:
model.attrs[key] = value
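# Minimal usage sketch of the _change_attrs helper above, using a stand-in
# object with an `attrs` dict instead of a real thinc Model.
class FakeModel:
    def __init__(self):
        self.attrs = {"beam_width": 4}

model = FakeModel()
with _change_attrs(model, beam_width=1, beam_density=0.01):
    assert model.attrs["beam_width"] == 1       # temporarily overridden
    assert model.attrs["beam_density"] == 0.01  # temporarily added
assert model.attrs["beam_width"] == 4           # original value restored
assert "beam_density" not in model.attrs        # added key removed again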
def states2actions(states: List[StateClass]) -> List[Ints1d]:
cdef int step
cdef StateClass state
cdef StateC* c_state
actions = []
while True:
step = len(actions)
step_actions = []
for state in states:
c_state = state.c
if step < c_state.history.size():
step_actions.append(c_state.history[step])
# We are done if we have exhausted all histories.
if len(step_actions) == 0:
break
actions.append(numpy.array(step_actions, dtype="i"))
return actions
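# Pure-Python analogue (illustration only) of states2actions above: transpose
# per-state action histories into one array of actions per time step, stopping
# once every history is exhausted.
import numpy as np

def histories_to_actions(histories):
    actions = []
    step = 0
    while True:
        step_actions = [h[step] for h in histories if step < len(h)]
        if not step_actions:
            break
        actions.append(np.array(step_actions, dtype="i"))
        step += 1
    return actions

print(histories_to_actions([[0, 2, 1], [3, 4]]))  # [array([0, 3]), array([2, 4]), array([1])]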

View File

@ -156,12 +156,22 @@ def validate_token_pattern(obj: list) -> List[str]:
class TokenPatternString(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
REGEX: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="regex")
IN: Optional[List[StrictStr]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
class Config:
extra = "forbid"

View File

@ -174,7 +174,7 @@ class Scorer:
prf_score.score_set(pred_spans, gold_spans)
if len(acc_score) > 0:
return {
"token_acc": acc_score.fscore,
"token_acc": acc_score.precision,
"token_p": prf_score.precision,
"token_r": prf_score.recall,
"token_f": prf_score.fscore,
@ -476,14 +476,12 @@ class Scorer:
f_per_type = {label: PRFScore() for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels)
if labels:
for eg in examples:
labels.update(eg.predicted.cats.keys())
labels.update(eg.reference.cats.keys())
for example in examples:
# Through this loop, None in the gold_cats indicates a missing label.
pred_cats = getter(example.predicted, attr)
pred_cats = {k: v for k, v in pred_cats.items() if k in labels}
gold_cats = getter(example.reference, attr)
gold_cats = {k: v for k, v in gold_cats.items() if k in labels}
for label in labels:
pred_score = pred_cats.get(label, 0.0)

View File

@ -1,5 +1,6 @@
import pytest
from spacy.matcher import levenshtein
from spacy.matcher.levenshtein import levenshtein_compare
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@ -42,3 +43,31 @@ from spacy.matcher import levenshtein
)
def test_levenshtein(dist, a, b):
assert levenshtein(a, b) == dist
@pytest.mark.parametrize(
"a,b,fuzzy,expected",
[
("a", "a", 1, True),
("a", "a", 0, True),
("a", "a", -1, True),
("a", "ab", 1, True),
("a", "ab", 0, False),
("a", "ab", -1, True),
("ab", "ac", 1, True),
("ab", "ac", -1, True),
("abc", "cde", 4, True),
("abc", "cde", -1, False),
("abcdef", "cdefgh", 4, True),
("abcdef", "cdefgh", 3, False),
("abcdef", "cdefgh", -1, False), # default (2 for length 6)
("abcdefgh", "cdefghijk", 5, True),
("abcdefgh", "cdefghijk", 4, False),
("abcdefgh", "cdefghijk", -1, False), # default (2)
("abcdefgh", "cdefghijkl", 6, True),
("abcdefgh", "cdefghijkl", 5, False),
("abcdefgh", "cdefghijkl", -1, False), # default (2)
],
)
def test_levenshtein_compare(a, b, fuzzy, expected):
assert levenshtein_compare(a, b, fuzzy) == expected

View File

@ -115,6 +115,155 @@ def test_matcher_match_multi(matcher):
]
@pytest.mark.parametrize(
"rules,match_locs",
[
(
{
"GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
},
[(2, 4)],
),
(
{
"Java": [[{"LOWER": {"FUZZY": "java"}}]],
},
[(5, 6)],
),
(
{
"JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
"GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
"Java": [[{"LOWER": {"FUZZY": "java"}}]],
},
[(2, 4), (5, 6), (8, 9)],
),
# only the second pattern matches (check that predicate keys used for
# caching don't collide)
(
{
"A": [[{"ORTH": {"FUZZY": "Javascripts"}}]],
"B": [[{"ORTH": {"FUZZY5": "Javascripts"}}]],
},
[(8, 9)],
),
],
)
def test_matcher_match_fuzzy(en_vocab, rules, match_locs):
words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
doc = Doc(en_vocab, words=words)
matcher = Matcher(en_vocab)
for key, patterns in rules.items():
matcher.add(key, patterns)
assert match_locs == [(start, end) for m_id, start, end in matcher(doc)]
@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
def test_matcher_match_fuzzy_set_op_longest(en_vocab, set_op):
rules = {
"GoogleNow": [[{"ORTH": {"FUZZY": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
}
matcher = Matcher(en_vocab)
for key, patterns in rules.items():
matcher.add(key, patterns, greedy="LONGEST")
words = ["They", "like", "Goggle", "Noo"]
doc = Doc(en_vocab, words=words)
assert len(matcher(doc)) == 1
def test_matcher_match_fuzzy_set_multiple(en_vocab):
rules = {
"GoogleNow": [
[
{
"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
"OP": "+",
}
]
]
}
matcher = Matcher(en_vocab)
for key, patterns in rules.items():
matcher.add(key, patterns, greedy="LONGEST")
words = ["They", "like", "Goggle", "Noo"]
doc = Doc(matcher.vocab, words=words)
assert matcher(doc) == [
(doc.vocab.strings["GoogleNow"], 3, 4),
]
@pytest.mark.parametrize("fuzzyn", range(1, 10))
def test_matcher_match_fuzzyn_all_insertions(en_vocab, fuzzyn):
matcher = Matcher(en_vocab)
matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]])
# words with increasing edit distance
words = ["GoogleNow" + "a" * i for i in range(0, 10)]
doc = Doc(en_vocab, words)
assert len(matcher(doc)) == fuzzyn + 1
@pytest.mark.parametrize("fuzzyn", range(1, 6))
def test_matcher_match_fuzzyn_various_edits(en_vocab, fuzzyn):
matcher = Matcher(en_vocab)
matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]])
# words with increasing edit distance, using different types of edits
words = [
"GoogleNow",
"GoogleNuw",
"GoogleNuew",
"GoogleNoweee",
"GiggleNuw3",
"gouggle5New",
]
doc = Doc(en_vocab, words)
assert len(matcher(doc)) == fuzzyn + 1
@pytest.mark.parametrize("greedy", ["FIRST", "LONGEST"])
@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
def test_matcher_match_fuzzyn_set_op_longest(en_vocab, greedy, set_op):
rules = {
"GoogleNow": [[{"ORTH": {"FUZZY2": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
}
matcher = Matcher(en_vocab)
for key, patterns in rules.items():
matcher.add(key, patterns, greedy=greedy)
words = ["They", "like", "Goggle", "Noo"]
doc = Doc(matcher.vocab, words=words)
spans = matcher(doc, as_spans=True)
assert len(spans) == 1
if set_op == "IN":
assert spans[0].text == "Goggle Noo"
else:
assert spans[0].text == "They like"
def test_matcher_match_fuzzyn_set_multiple(en_vocab):
rules = {
"GoogleNow": [
[
{
"ORTH": {"FUZZY1": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
"OP": "+",
}
]
]
}
matcher = Matcher(en_vocab)
for key, patterns in rules.items():
matcher.add(key, patterns, greedy="LONGEST")
words = ["They", "like", "Goggle", "Noo"]
doc = Doc(matcher.vocab, words=words)
assert matcher(doc) == [
(doc.vocab.strings["GoogleNow"], 3, 4),
]
def test_matcher_empty_dict(en_vocab):
"""Test matcher allows empty token specs, meaning match on any token."""
matcher = Matcher(en_vocab)
@ -434,6 +583,30 @@ def test_matcher_regex(en_vocab):
assert len(matches) == 0
def test_matcher_regex_set_in(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"REGEX": {"IN": [r"(?:a)", r"(?:an)"]}}}]
matcher.add("A_OR_AN", [pattern])
doc = Doc(en_vocab, words=["an", "a", "hi"])
matches = matcher(doc)
assert len(matches) == 2
doc = Doc(en_vocab, words=["bye"])
matches = matcher(doc)
assert len(matches) == 0
def test_matcher_regex_set_not_in(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"REGEX": {"NOT_IN": [r"(?:a)", r"(?:an)"]}}}]
matcher.add("A_OR_AN", [pattern])
doc = Doc(en_vocab, words=["an", "a", "hi"])
matches = matcher(doc)
assert len(matches) == 1
doc = Doc(en_vocab, words=["bye"])
matches = matcher(doc)
assert len(matches) == 1
def test_matcher_regex_shape(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]

View File

@ -13,6 +13,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.training import Example, iob_to_biluo, split_bilu_label
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from thinc.api import fix_random_seed
import logging
from ..util import make_tempdir
@ -412,7 +413,7 @@ def test_train_empty():
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
ner = nlp.add_pipe("ner", last=True)
ner.add_label("PERSON")
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
for itn in range(2):
losses = {}
batches = util.minibatch(train_examples, size=8)
@ -539,11 +540,11 @@ def test_block_ner():
assert [token.ent_type_ for token in doc] == expected_types
@pytest.mark.parametrize("use_upper", [True, False])
def test_overfitting_IO(use_upper):
def test_overfitting_IO():
fix_random_seed(1)
# Simple test to try and quickly overfit the NER component
nlp = English()
ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
ner = nlp.add_pipe("ner", config={"model": {}})
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@ -575,7 +576,6 @@ def test_overfitting_IO(use_upper):
assert ents2[0].label_ == "LOC"
# Ensure that the predictions are still the same, even after adding a new label
ner2 = nlp2.get_pipe("ner")
assert ner2.model.attrs["has_upper"] == use_upper
ner2.add_label("RANDOM_NEW_LABEL")
doc3 = nlp2(test_text)
ents3 = doc3.ents
@ -617,6 +617,52 @@ def test_overfitting_IO(use_upper):
assert ents[1].kb_id == 0
def test_is_distillable():
nlp = English()
ner = nlp.add_pipe("ner")
assert ner.is_distillable
def test_distill():
teacher = English()
teacher_ner = teacher.add_pipe("ner")
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
for ent in annotations.get("entities"):
teacher_ner.add_label(ent[2])
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.00001
student = English()
student_ner = student.add_pipe("ner")
student_ner.initialize(
get_examples=lambda: train_examples, labels=teacher_ner.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
]
for i in range(100):
losses = {}
student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.0001
# test the trained model
test_text = "I like London."
doc = student(test_text)
ents = doc.ents
assert len(ents) == 1
assert ents[0].text == "London"
assert ents[0].label_ == "LOC"
def test_beam_ner_scores():
# Test that we can get confidence values out of the beam_ner pipe
beam_width = 16

View File

@ -1,13 +1,17 @@
import itertools
import pytest
import numpy
from numpy.testing import assert_equal
from thinc.api import Adam
from spacy import registry, util
from spacy.attrs import DEP, NORM
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.training import Example
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy import util, registry
from thinc.api import fix_random_seed
from ...pipeline import DependencyParser
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
@ -59,6 +63,8 @@ PARTIAL_DATA = [
),
]
PARSERS = ["parser"] # TODO: Test beam_parser when ready
eps = 0.1
@ -171,6 +177,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
assert doc[0].dep != 0
def test_parser_apply_actions(en_vocab, en_parser):
words = ["I", "ate", "pizza"]
words2 = ["Eat", "more", "pizza", "!"]
doc1 = Doc(en_vocab, words=words)
doc2 = Doc(en_vocab, words=words2)
docs = [doc1, doc2]
moves = en_parser.moves
moves.add_action(0, "")
moves.add_action(1, "")
moves.add_action(2, "nsubj")
moves.add_action(3, "obj")
moves.add_action(2, "amod")
actions = [
numpy.array([0, 0], dtype="i"),
numpy.array([2, 0], dtype="i"),
numpy.array([0, 4], dtype="i"),
numpy.array([3, 3], dtype="i"),
numpy.array([1, 1], dtype="i"),
numpy.array([1, 1], dtype="i"),
numpy.array([0], dtype="i"),
numpy.array([1], dtype="i"),
]
states = moves.init_batch(docs)
active_states = states
for step_actions in actions:
active_states = moves.apply_actions(active_states, step_actions)
assert len(active_states) == 0
for (state, doc) in zip(states, docs):
moves.set_annotations(state, doc)
assert docs[0][0].head.i == 1
assert docs[0][0].dep_ == "nsubj"
assert docs[0][1].head.i == 1
assert docs[0][1].dep_ == "ROOT"
assert docs[0][2].head.i == 1
assert docs[0][2].dep_ == "obj"
assert docs[1][0].head.i == 0
assert docs[1][0].dep_ == "ROOT"
assert docs[1][1].head.i == 2
assert docs[1][1].dep_ == "amod"
assert docs[1][2].head.i == 0
assert docs[1][2].dep_ == "obj"
@pytest.mark.skip(
reason="The step_through API was removed (but should be brought back)"
)
@ -319,7 +376,7 @@ def test_parser_constructor(en_vocab):
DependencyParser(en_vocab, model)
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
@pytest.mark.parametrize("pipe_name", PARSERS)
def test_incomplete_data(pipe_name):
# Test that the parser works with incomplete information
nlp = English()
@ -345,11 +402,15 @@ def test_incomplete_data(pipe_name):
assert doc[2].head.i == 1
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
def test_overfitting_IO(pipe_name):
@pytest.mark.parametrize(
"pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
)
def test_overfitting_IO(pipe_name, max_moves):
fix_random_seed(0)
# Simple test to try and quickly overfit the dependency parser (normal or beam)
nlp = English()
parser = nlp.add_pipe(pipe_name)
parser.cfg["update_with_oracle_cut_size"] = max_moves
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@ -396,16 +457,67 @@ def test_overfitting_IO(pipe_name):
assert_equal(batch_deps_1, no_batch_deps)
def test_is_distillable():
nlp = English()
parser = nlp.add_pipe("parser")
assert parser.is_distillable
def test_distill():
teacher = English()
teacher_parser = teacher.add_pipe("parser")
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
for dep in annotations.get("deps", []):
teacher_parser.add_label(dep)
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(200):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses["parser"] < 0.0001
student = English()
student_parser = student.add_pipe("parser")
student_parser.initialize(
get_examples=lambda: train_examples, labels=teacher_parser.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
]
for i in range(200):
losses = {}
student_parser.distill(
teacher_parser, distill_examples, sgd=optimizer, losses=losses
)
assert losses["parser"] < 0.0001
test_text = "I like securities."
doc = student(test_text)
assert doc[0].dep_ == "nsubj"
assert doc[2].dep_ == "dobj"
assert doc[3].dep_ == "punct"
assert doc[0].head.i == 1
assert doc[2].head.i == 1
assert doc[3].head.i == 1
# fmt: off
@pytest.mark.slow
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
@pytest.mark.parametrize(
"parser_config",
[
# TransitionBasedParser V1
({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
# TransitionBasedParser V2
# TODO: re-enable after we have a spacy-legacy release for v4. See
# https://github.com/explosion/spacy-legacy/pull/36
#({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
],
)
# fmt: on

View File

@ -195,6 +195,53 @@ def test_overfitting_IO():
assert doc4[3].lemma_ == "egg"
def test_is_distillable():
nlp = English()
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
assert lemmatizer.is_distillable
def test_distill():
teacher = English()
teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
teacher_lemmatizer.min_tree_freq = 1
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses["trainable_lemmatizer"] < 0.00001
student = English()
student_lemmatizer = student.add_pipe("trainable_lemmatizer")
student_lemmatizer.min_tree_freq = 1
student_lemmatizer.initialize(
get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
]
for i in range(50):
losses = {}
student_lemmatizer.distill(
teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
)
assert losses["trainable_lemmatizer"] < 0.00001
test_text = "She likes blue eggs"
doc = student(test_text)
assert doc[0].lemma_ == "she"
assert doc[1].lemma_ == "like"
assert doc[2].lemma_ == "blue"
assert doc[3].lemma_ == "egg"
def test_lemmatizer_requires_labels():
nlp = English()
nlp.add_pipe("trainable_lemmatizer")

View File

@ -353,6 +353,39 @@ def test_entity_ruler_overlapping_spans(nlp):
assert doc.ents[0].label_ == "FOOBAR"
def test_entity_ruler_fuzzy_pipe(nlp):
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
ruler.add_patterns(patterns)
doc = nlp("helloo")
assert len(doc.ents) == 1
assert doc.ents[0].label_ == "HELLO"
def test_entity_ruler_fuzzy(nlp):
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
ruler.add_patterns(patterns)
doc = nlp("helloo")
assert len(doc.ents) == 1
assert doc.ents[0].label_ == "HELLO"
def test_entity_ruler_fuzzy_disabled(nlp):
@registry.misc("test_fuzzy_compare_disabled")
def make_test_fuzzy_compare_disabled():
return lambda x, y, z: False
ruler = nlp.add_pipe(
"entity_ruler",
config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
)
patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
ruler.add_patterns(patterns)
doc = nlp("helloo")
assert len(doc.ents) == 0
@pytest.mark.parametrize("n_process", [1, 2])
def test_entity_ruler_multiprocessing(nlp, n_process):
if isinstance(get_current_ops, NumpyOps) or n_process < 2:

View File

@ -50,6 +50,12 @@ def test_implicit_label():
nlp.initialize(get_examples=lambda: train_examples)
def test_is_distillable():
nlp = English()
morphologizer = nlp.add_pipe("morphologizer")
assert morphologizer.is_distillable
def test_no_resize():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")

View File

@ -11,6 +11,12 @@ from spacy.pipeline import TrainablePipe
from spacy.tests.util import make_tempdir
def test_is_distillable():
nlp = English()
senter = nlp.add_pipe("senter")
assert senter.is_distillable
def test_label_types():
nlp = Language()
senter = nlp.add_pipe("senter")

View File

@ -24,7 +24,9 @@ def test_issue4348():
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
batches = util.minibatch(
TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator()
)
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)
@ -213,6 +215,52 @@ def test_overfitting_IO():
assert doc3[0].tag_ != "N"
def test_is_distillable():
nlp = English()
tagger = nlp.add_pipe("tagger")
assert tagger.is_distillable
def test_distill():
teacher = English()
teacher_tagger = teacher.add_pipe("tagger")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses["tagger"] < 0.00001
student = English()
student_tagger = student.add_pipe("tagger")
student_tagger.min_tree_freq = 1
student_tagger.initialize(
get_examples=lambda: train_examples, labels=teacher_tagger.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
]
for i in range(50):
losses = {}
student_tagger.distill(
teacher_tagger, distill_examples, sgd=optimizer, losses=losses
)
assert losses["tagger"] < 0.00001
test_text = "I like blue eggs"
doc = student(test_text)
assert doc[0].tag_ == "N"
assert doc[1].tag_ == "V"
assert doc[2].tag_ == "J"
assert doc[3].tag_ == "N"
def test_save_activations():
# Test if activations are correctly added to Doc when requested.
nlp = English()

View File

@ -91,7 +91,9 @@ def test_issue3611():
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
batches = util.minibatch(
train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
)
for batch in batches:
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@ -128,7 +130,9 @@ def test_issue4030():
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
batches = util.minibatch(
train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
)
for batch in batches:
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@ -565,6 +569,12 @@ def test_initialize_examples(name, get_examples, train_data):
nlp.initialize(get_examples=get_examples())
def test_is_distillable():
nlp = English()
textcat = nlp.add_pipe("textcat")
assert not textcat.is_distillable
def test_overfitting_IO():
# Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
fix_random_seed(0)
@ -934,3 +944,26 @@ def test_save_activations_multi():
doc = nlp("This is a test.")
assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
@pytest.mark.parametrize(
"component_name,scorer",
[
("textcat", "spacy.textcat_scorer.v1"),
("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
],
)
def test_textcat_legacy_scorers(component_name, scorer):
"""Check that legacy scorers are registered and produce the expected score
keys."""
nlp = English()
nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
train_examples = []
for text, annotations in TRAIN_DATA_SINGLE_LABEL:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
nlp.initialize(get_examples=lambda: train_examples)
# score the model (it's not actually trained but that doesn't matter)
scores = nlp.evaluate(train_examples)
assert 0 <= scores["cats_score"] <= 1

View File

@ -382,7 +382,7 @@ cfg_string_multi = """
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"

View File

@ -122,33 +122,11 @@ width = ${components.tok2vec.model.width}
parser_config_string_upper = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
use_upper = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 333
depth = 4
embed_size = 5555
window_size = 1
maxout_pieces = 7
subword_features = false
"""
parser_config_string_no_upper = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
use_upper = false
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@ -179,7 +157,6 @@ def my_parser():
extra_state_tokens=True,
hidden_width=65,
maxout_pieces=5,
use_upper=True,
)
return parser
@ -285,15 +262,16 @@ def test_serialize_custom_nlp():
nlp.to_disk(d)
nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model
model.get_ref("tok2vec")
# check that we have the correct settings, not the default ones
assert model.get_ref("upper").get_dim("nI") == 65
assert model.get_ref("lower").get_dim("nI") == 65
assert model.get_ref("tok2vec") is not None
assert model.has_param("hidden_W")
assert model.has_param("hidden_b")
output = model.get_ref("output")
assert output is not None
assert output.has_param("W")
assert output.has_param("b")
@pytest.mark.parametrize(
"parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
)
@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
def test_serialize_parser(parser_config_string):
"""Create a non-default parser config to check nlp serializes it correctly"""
nlp = English()
@ -306,11 +284,13 @@ def test_serialize_parser(parser_config_string):
nlp.to_disk(d)
nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model
model.get_ref("tok2vec")
# check that we have the correct settings, not the default ones
if model.attrs["has_upper"]:
assert model.get_ref("upper").get_dim("nI") == 66
assert model.get_ref("lower").get_dim("nI") == 66
assert model.get_ref("tok2vec") is not None
assert model.has_param("hidden_W")
assert model.has_param("hidden_b")
output = model.get_ref("output")
assert output is not None
assert output.has_param("b")
assert output.has_param("W")
def test_config_nlp_roundtrip():
@ -457,9 +437,7 @@ def test_config_auto_fill_extra_fields():
load_model_from_config(nlp.config)
@pytest.mark.parametrize(
"parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
)
@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
def test_config_validate_literal(parser_config_string):
nlp = English()
config = Config().from_str(parser_config_string)

View File

@ -4,6 +4,7 @@ from collections import Counter
from typing import Tuple, List, Dict, Any
import pkg_resources
import time
from pathlib import Path
import spacy
import numpy
@ -15,7 +16,7 @@ from thinc.api import Config, ConfigValidationError
from spacy import about
from spacy.cli import info
from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import is_subpath_of, load_project_config, walk_directory
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
@ -1185,3 +1186,26 @@ def test_upload_download_local_file():
download_file(remote_file, local_file)
with local_file.open(mode="r") as file_:
assert file_.read() == content
def test_walk_directory():
with make_tempdir() as d:
files = [
"data1.iob",
"data2.iob",
"data3.json",
"data4.conll",
"data5.conll",
"data6.conll",
"data7.txt",
]
for f in files:
Path(d / f).touch()
assert (len(walk_directory(d))) == 7
assert (len(walk_directory(d, suffix=None))) == 7
assert (len(walk_directory(d, suffix="json"))) == 1
assert (len(walk_directory(d, suffix="iob"))) == 2
assert (len(walk_directory(d, suffix="conll"))) == 3
assert (len(walk_directory(d, suffix="pdf"))) == 0

View File

@ -0,0 +1,33 @@
import os
from pathlib import Path
from typer.testing import CliRunner
from spacy.cli._util import app
from .util import make_tempdir
def test_convert_auto():
with make_tempdir() as d_in, make_tempdir() as d_out:
for f in ["data1.iob", "data2.iob", "data3.iob"]:
Path(d_in / f).touch()
# ensure that "automatic" suffix detection works
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
assert "Generated output file" in result.stdout
out_files = os.listdir(d_out)
assert len(out_files) == 3
assert "data1.spacy" in out_files
assert "data2.spacy" in out_files
assert "data3.spacy" in out_files
def test_convert_auto_conflict():
with make_tempdir() as d_in, make_tempdir() as d_out:
for f in ["data1.iob", "data2.iob", "data3.json"]:
Path(d_in / f).touch()
# ensure that "automatic" suffix detection warns when there are different file types
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
assert "All input files must be same type" in result.stdout
out_files = os.listdir(d_out)
assert len(out_files) == 0

View File

@ -3,6 +3,7 @@ import logging
from unittest import mock
import pytest
from spacy.language import Language
from spacy.scorer import Scorer
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.training import Example
@ -126,6 +127,112 @@ def test_evaluate_no_pipe(nlp):
nlp.evaluate([Example.from_dict(doc, annots)])
def test_evaluate_textcat_multilabel(en_vocab):
"""Test that evaluate works with a multilabel textcat pipe."""
nlp = Language(en_vocab)
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
textcat_multilabel.add_label(label)
nlp.initialize()
annots = {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}}
doc = nlp.make_doc("hello world")
example = Example.from_dict(doc, annots)
scores = nlp.evaluate([example])
labels = nlp.get_pipe("textcat_multilabel").labels
for label in labels:
assert scores["cats_f_per_type"].get(label) is not None
for key in example.reference.cats.keys():
if key not in labels:
assert scores["cats_f_per_type"].get(key) is None
def test_evaluate_multiple_textcat_final(en_vocab):
"""Test that evaluate evaluates the final textcat component in a pipeline
with more than one textcat or textcat_multilabel."""
nlp = Language(en_vocab)
textcat = nlp.add_pipe("textcat")
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
textcat_multilabel.add_label(label)
nlp.initialize()
annots = {
"cats": {
"POSITIVE": 1.0,
"NEGATIVE": 0.0,
"FEATURE": 1.0,
"QUESTION": 1.0,
"POSITIVE": 1.0,
"NEGATIVE": 0.0,
}
}
doc = nlp.make_doc("hello world")
example = Example.from_dict(doc, annots)
scores = nlp.evaluate([example])
# get the labels from the final pipe
labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
for label in labels:
assert scores["cats_f_per_type"].get(label) is not None
for key in example.reference.cats.keys():
if key not in labels:
assert scores["cats_f_per_type"].get(key) is None
def test_evaluate_multiple_textcat_separate(en_vocab):
"""Test that evaluate can evaluate multiple textcat components separately
with custom scorers."""
def custom_textcat_score(examples, **kwargs):
scores = Scorer.score_cats(
examples,
"cats",
multi_label=False,
**kwargs,
)
return {f"custom_{k}": v for k, v in scores.items()}
@spacy.registry.scorers("test_custom_textcat_scorer")
def make_custom_textcat_scorer():
return custom_textcat_score
nlp = Language(en_vocab)
textcat = nlp.add_pipe(
"textcat",
config={"scorer": {"@scorers": "test_custom_textcat_scorer"}},
)
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
textcat_multilabel.add_label(label)
nlp.initialize()
annots = {
"cats": {
"POSITIVE": 1.0,
"NEGATIVE": 0.0,
"FEATURE": 1.0,
"QUESTION": 1.0,
"POSITIVE": 1.0,
"NEGATIVE": 0.0,
}
}
doc = nlp.make_doc("hello world")
example = Example.from_dict(doc, annots)
scores = nlp.evaluate([example])
# check custom scores for the textcat pipe
assert "custom_cats_f_per_type" in scores
labels = nlp.get_pipe("textcat").labels
assert set(scores["custom_cats_f_per_type"].keys()) == set(labels)
# check default scores for the textcat_multilabel pipe
assert "cats_f_per_type" in scores
labels = nlp.get_pipe("textcat_multilabel").labels
assert set(scores["cats_f_per_type"].keys()) == set(labels)
def vector_modification_pipe(doc):
doc.vector += 1
return doc

View File

@ -5,10 +5,8 @@ from pathlib import Path
from spacy.about import __version__ as spacy_version
from spacy import util
from spacy import prefer_gpu, require_gpu, require_cpu
from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from spacy.util import dot_to_object, SimpleFrozenList, import_file
from spacy.util import to_ternary_int
from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
from spacy.util import find_available_port
from thinc.api import Config, Optimizer, ConfigValidationError
from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
@ -81,34 +79,6 @@ def test_util_get_package_path(package):
assert isinstance(path, Path)
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
assert model.get_param("W").shape == (nF, nO, nP, nI)
tensor = model.ops.alloc((10, nI))
Y, get_dX = model.begin_update(tensor)
assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
dY = model.ops.alloc((15, nO, nP))
ids = model.ops.alloc((15, nF))
ids[1, 2] = -1
dY[1] = 1
assert not model.has_grad("pad")
d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
assert d_pad[0, 2, 0, 0] == 1.0
ids.fill(0.0)
dY.fill(0.0)
dY[0] = 0
ids[1, 2] = 0
ids[1, 1] = -1
ids[1, 0] = -1
dY[1] = 1
ids[2, 0] = -1
dY[2] = 5
d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
assert d_pad[0, 0, 0, 0] == 6
assert d_pad[0, 1, 0, 0] == 1
assert d_pad[0, 2, 0, 0] == 0
def test_prefer_gpu():
current_ops = get_current_ops()
if has_cupy_gpu:
@ -434,3 +404,16 @@ def test_to_ternary_int():
assert to_ternary_int(-10) == -1
assert to_ternary_int("string") == -1
assert to_ternary_int([0, "string"]) == -1
def test_find_available_port():
host = "0.0.0.0"
port = 5000
assert find_available_port(port, host) == port, "Port 5000 isn't free"
from wsgiref.simple_server import make_server, demo_app
with make_server(host, port, demo_app) as httpd:
with pytest.warns(UserWarning, match="already in use"):
found_port = find_available_port(port, host, auto_select=True)
assert found_port == port + 1, "Didn't find next port"

View File

@ -110,7 +110,7 @@ def test_tokenization(sented_doc):
)
example.predicted[1].is_sent_start = False
scores = scorer.score([example])
assert scores["token_acc"] == approx(0.66666666)
assert scores["token_acc"] == 0.5
assert scores["token_p"] == 0.5
assert scores["token_r"] == approx(0.33333333)
assert scores["token_f"] == 0.4

View File

@ -8,7 +8,7 @@ from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
from spacy.training import offsets_to_biluo_tags
from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
from spacy.training.alignment_array import AlignmentArray
from spacy.training.align import get_alignments
from spacy.training.converters import json_to_docs
@ -365,6 +365,19 @@ def test_example_from_dict_some_ner(en_vocab):
assert ner_tags == ["U-LOC", None, None, None]
def test_validate_distillation_examples(en_vocab):
words = ["a", "b", "c", "d"]
spaces = [True, True, False, True]
predicted = Doc(en_vocab, words=words, spaces=spaces)
example = Example.from_dict(predicted, {})
validate_distillation_examples([example], "test_validate_distillation_examples")
example = Example.from_dict(predicted, {"words": words + ["e"]})
with pytest.raises(ValueError, match=r"distillation"):
validate_distillation_examples([example], "test_validate_distillation_examples")
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_json_to_docs_no_ner(en_vocab):
data = [
@ -905,7 +918,9 @@ def _train_tuples(train_data):
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
batches = minibatch(
train_examples, size=compounding(4.0, 32.0, 1.001).to_generator()
)
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)

View File

@ -4,7 +4,6 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, SpanC, TokenC
from .strings cimport StringStore
from .tokens.doc cimport Doc
from .vocab cimport Vocab, LexemesOrTokens, _Cached
from .matcher.phrasematcher cimport PhraseMatcher

View File

@ -1,5 +1,6 @@
from .corpus import Corpus, JsonlCorpus # noqa: F401
from .example import Example, validate_examples, validate_get_examples # noqa: F401
from .example import validate_distillation_examples # noqa: F401
from .alignment import Alignment # noqa: F401
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401

View File

@ -2,12 +2,13 @@ from typing import Union, Iterable, Sequence, TypeVar, List, Callable, Iterator
from typing import Optional, Any
from functools import partial
import itertools
from thinc.schedules import Schedule, constant as constant_schedule
from thinc.schedules import Schedule
from ..util import registry, minibatch
Sizing = Union[Sequence[int], int, Schedule[int]]
SizingSchedule = Union[Iterable[int], int, Schedule]
Sizing = Union[Iterable[int], int]
ItemT = TypeVar("ItemT")
BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
@ -15,7 +16,7 @@ BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
@registry.batchers("spacy.batch_by_padded.v1")
def configure_minibatch_by_padded_size(
*,
size: Sizing,
size: SizingSchedule,
buffer: int,
discard_oversize: bool,
get_length: Optional[Callable[[ItemT], int]] = None
@ -25,8 +26,8 @@ def configure_minibatch_by_padded_size(
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
size (int or Sequence[int]): The largest padded size to batch sequences into.
Can be a single integer, or a sequence, allowing for variable batch sizes.
size (int, Iterable[int] or Schedule): The largest padded size to batch sequences
into. Can be a single integer, or a sequence, allowing for variable batch sizes.
buffer (int): The number of sequences to accumulate before sorting by length.
A larger buffer will result in more even sizing, but if the buffer is
very large, the iteration order will be less random, which can result
@ -40,7 +41,7 @@ def configure_minibatch_by_padded_size(
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
minibatch_by_padded_size,
size=size,
size=_schedule_to_sizing(size),
buffer=buffer,
discard_oversize=discard_oversize,
**optionals
@ -50,14 +51,14 @@ def configure_minibatch_by_padded_size(
@registry.batchers("spacy.batch_by_words.v1")
def configure_minibatch_by_words(
*,
size: Sizing,
size: SizingSchedule,
tolerance: float,
discard_oversize: bool,
get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
"""Create a batcher that uses the "minibatch by words" strategy.
size (int or Sequence[int]): The target number of words per batch.
size (int, Iterable[int] or Schedule): The target number of words per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
@ -68,7 +69,7 @@ def configure_minibatch_by_words(
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
minibatch_by_words,
size=size,
size=_schedule_to_sizing(size),
tolerance=tolerance,
discard_oversize=discard_oversize,
**optionals
@ -77,15 +78,15 @@ def configure_minibatch_by_words(
@registry.batchers("spacy.batch_by_sequence.v1")
def configure_minibatch(
size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
size: SizingSchedule, get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
"""Create a batcher that creates batches of the specified size.
size (int or Sequence[int]): The target number of items per batch.
size (int, Iterable[int] or Schedule): The target number of items per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
"""
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(minibatch, size=size, **optionals)
return partial(minibatch, size=_schedule_to_sizing(size), **optionals)
def minibatch_by_padded_size(
@ -101,7 +102,7 @@ def minibatch_by_padded_size(
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
size (int or Sequence[int]): The largest padded size to batch sequences into.
size (int or Iterable[int]): The largest padded size to batch sequences into.
buffer (int): The number of sequences to accumulate before sorting by length.
A larger buffer will result in more even sizing, but if the buffer is
very large, the iteration order will be less random, which can result
@ -112,13 +113,12 @@ def minibatch_by_padded_size(
The `len` function is used by default.
"""
if isinstance(size, int):
size_ = constant_schedule(size)
size_: Iterator[int] = itertools.repeat(size)
else:
assert isinstance(size, Schedule)
size_ = size
for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
size_ = iter(size)
for outer_batch in minibatch(seqs, size=buffer):
outer_batch = list(outer_batch)
target_size = size_(step)
target_size = next(size_)
for indices in _batch_by_length(outer_batch, target_size, get_length):
subbatch = [outer_batch[i] for i in indices]
padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
@ -140,7 +140,7 @@ def minibatch_by_words(
themselves, or be discarded if discard_oversize=True.
seqs (Iterable[Sequence]): The sequences to minibatch.
size (int or Sequence[int]): The target number of words per batch.
size (int or Iterable[int]): The target number of words per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
@ -149,12 +149,10 @@ def minibatch_by_words(
item. The `len` function is used by default.
"""
if isinstance(size, int):
size_ = constant_schedule(size)
size_: Iterator[int] = itertools.repeat(size)
else:
assert isinstance(size, Schedule)
size_ = size
step = 0
target_size = size_(step)
size_ = iter(size)
target_size = next(size_)
tol_size = target_size * tolerance
batch = []
overflow = []
@ -179,8 +177,7 @@ def minibatch_by_words(
else:
if batch:
yield batch
step += 1
target_size = size_(step)
target_size = next(size_)
tol_size = target_size * tolerance
batch = overflow
batch_size = overflow_size
@ -198,8 +195,7 @@ def minibatch_by_words(
else:
if batch:
yield batch
step += 1
target_size = size_(step)
target_size = next(size_)
tol_size = target_size * tolerance
batch = [seq]
batch_size = n_words
@ -236,3 +232,9 @@ def _batch_by_length(
batches = [list(sorted(batch)) for batch in batches]
batches.reverse()
return batches
def _schedule_to_sizing(size: SizingSchedule) -> Sizing:
if isinstance(size, Schedule):
return size.to_generator()
return size
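The same sizing pattern recurs in every batcher touched by this diff: a plain `int` becomes an infinite `itertools.repeat` stream, any other iterable is consumed with `next()` once per batch, and `_schedule_to_sizing` unwraps a Thinc `Schedule` into a generator first. A minimal, self-contained sketch of that pattern (the helper name `as_size_iterator` is illustrative and not part of spaCy):

```python
# Minimal sketch of the fixed-vs-variable batch size handling used above.
import itertools
from typing import Iterable, Iterator, Union

def as_size_iterator(size: Union[int, Iterable[int]]) -> Iterator[int]:
    # An int yields the same size forever; any other iterable is consumed
    # one value per batch via next().
    if isinstance(size, int):
        return itertools.repeat(size)
    return iter(size)

sizes = as_size_iterator([4, 8, 16])
assert [next(sizes), next(sizes), next(sizes)] == [4, 8, 16]
```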

View File

@ -1,5 +1,4 @@
from collections.abc import Iterable as IterableInstance
import warnings
import numpy
from murmurhash.mrmr cimport hash64
@ -47,6 +46,13 @@ def validate_examples(examples, method):
raise TypeError(err)
def validate_distillation_examples(examples, method):
validate_examples(examples, method)
for eg in examples:
if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:
raise ValueError(Errors.E4003)
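A hedged sketch of the invariant `validate_distillation_examples` enforces: for distillation, the reference (teacher) and predicted (student) docs must tokenize to exactly the same texts. The snippet below only illustrates the check itself and assumes a blank English pipeline:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("Distillation requires matching tokenization")
example = Example(doc.copy(), doc)  # predicted and reference share one tokenization
assert [t.text for t in example.reference] == [t.text for t in example.predicted]
```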
def validate_get_examples(get_examples, method):
"""Check that a generator of a batch of examples received during processing is valid:
the callable produces a non-empty list of Example objects.

View File

@ -26,6 +26,8 @@ def setup_table(
return final_cols, final_widths, ["r" for _ in final_widths]
# We cannot rename this method as it's directly imported
# and used by external packages such as spacy-loggers.
@registry.loggers("spacy.ConsoleLogger.v2")
def console_logger(
progress_bar: bool = False,
@ -33,7 +35,27 @@ def console_logger(
output_file: Optional[Union[str, Path]] = None,
):
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
progress_bar (bool): Whether the logger should print the progress bar.
progress_bar (bool): Whether the logger should print a progress bar tracking the steps till the next evaluation pass.
console_output (bool): Whether the logger should print the logs on the console.
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
"""
return console_logger_v3(
progress_bar=None if progress_bar is False else "eval",
console_output=console_output,
output_file=output_file,
)
@registry.loggers("spacy.ConsoleLogger.v3")
def console_logger_v3(
progress_bar: Optional[str] = None,
console_output: bool = True,
output_file: Optional[Union[str, Path]] = None,
):
"""The ConsoleLogger.v3 prints out training logs in the console and/or saves them to a jsonl file.
progress_bar (Optional[str]): Type of progress bar to show in the console. Allowed values:
train - Tracks the number of steps from the beginning of training until the full training run is complete (training.max_steps is reached).
eval - Tracks the number of steps between the previous and next evaluation (training.eval_frequency is reached).
console_output (bool): Whether the logger should print the logs on the console.
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
"""
@ -70,6 +92,7 @@ def console_logger(
for name, proc in nlp.pipeline
if hasattr(proc, "is_trainable") and proc.is_trainable
]
max_steps = nlp.config["training"]["max_steps"]
eval_frequency = nlp.config["training"]["eval_frequency"]
score_weights = nlp.config["training"]["score_weights"]
score_cols = [col for col, value in score_weights.items() if value is not None]
@ -84,6 +107,13 @@ def console_logger(
write(msg.row(table_header, widths=table_widths, spacing=spacing))
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
progress = None
expected_progress_types = ("train", "eval")
if progress_bar is not None and progress_bar not in expected_progress_types:
raise ValueError(
Errors.E1048.format(
unexpected=progress_bar, expected=expected_progress_types
)
)
def log_step(info: Optional[Dict[str, Any]]) -> None:
nonlocal progress
@ -141,11 +171,23 @@ def console_logger(
)
)
if progress_bar:
if progress_bar == "train":
total = max_steps
desc = f"Last Eval Epoch: {info['epoch']}"
initial = info["step"]
else:
total = eval_frequency
desc = f"Epoch {info['epoch']+1}"
initial = 0
# Set disable=None, so that it disables on non-TTY
progress = tqdm.tqdm(
total=eval_frequency, disable=None, leave=False, file=stderr
total=total,
disable=None,
leave=False,
file=stderr,
initial=initial,
)
progress.set_description(f"Epoch {info['epoch']+1}")
progress.set_description(desc)
def finalize() -> None:
if output_stream:

View File

@ -100,7 +100,7 @@ def train(
stdout.write(
msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
)
stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate(step=0)}") + "\n")
with nlp.select_pipes(disable=frozen_components):
log_step, finalize_logger = train_logger(nlp, stdout, stderr)
try:

View File

@ -31,6 +31,7 @@ import shlex
import inspect
import pkgutil
import logging
import socket
try:
import cupy.random
@ -1582,12 +1583,12 @@ def minibatch(items, size):
so that batch-size can vary on each step.
"""
if isinstance(size, int):
size_ = constant_schedule(size)
size_ = itertools.repeat(size)
else:
size_ = size
size_ = iter(size)
items = iter(items)
for step in itertools.count():
batch_size = size_(step)
while True:
batch_size = next(size_)
batch = list(itertools.islice(items, int(batch_size)))
if len(batch) == 0:
break
@ -1728,3 +1729,50 @@ def all_equal(iterable):
(or if the input is an empty sequence), False otherwise."""
g = itertools.groupby(iterable)
return next(g, True) and not next(g, False)
def _is_port_in_use(port: int, host: str = "localhost") -> bool:
"""Check if 'host:port' is in use. Return True if it is, False otherwise.
port (int): the port to check
host (str): the host to check (default "localhost")
RETURNS (bool): Whether 'host:port' is in use.
"""
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
s.bind((host, port))
return False
except socket.error:
return True
finally:
s.close()
def find_available_port(start: int, host: str, auto_select: bool = False) -> int:
"""Given a starting port and a host, handle finding a port.
If `auto_select` is False, a busy port will raise an error.
If `auto_select` is True, the next free higher port will be used.
start (int): the port to start looking from
host (str): the host to find a port on
auto_select (bool): whether to automatically select a new port if the given port is busy (default False)
RETURNS (int): The port to use.
"""
if not _is_port_in_use(start, host):
return start
port = start
if not auto_select:
raise ValueError(Errors.E1050.format(port=port))
while _is_port_in_use(port, host) and port < 65535:
port += 1
if port == 65535 and _is_port_in_use(port, host):
raise ValueError(Errors.E1049.format(host=host))
# if we get here, the port changed
warnings.warn(Warnings.W124.format(host=host, port=start, serve_port=port))
return port
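A short usage sketch for the new port helpers (hedged: it assumes the functions remain importable from `spacy.util` as added here; host, port number and error behaviour follow the code above):

```python
from spacy.util import find_available_port

# Raise if port 8080 on localhost is busy (auto_select defaults to False):
# port = find_available_port(8080, "localhost")
# Or walk upwards to the next free port and warn instead of raising:
port = find_available_port(8080, "localhost", auto_select=True)
print(f"serving on {port}")
```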

3
website/.eslintrc.json Normal file
View File

@ -0,0 +1,3 @@
{
"extends": "next/core-web-vitals"
}

44
website/.gitignore vendored Normal file
View File

@ -0,0 +1,44 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.js
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# local env files
.env*.local
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts
!.vscode/extensions.json
!public
public/robots.txt
public/sitemap*
public/sw.js*
public/workbox*

1
website/.nvmrc Normal file
View File

@ -0,0 +1 @@
18

1
website/.prettierignore Normal file
View File

@ -0,0 +1 @@
.next

View File

@ -20,12 +20,11 @@
}
},
{
"files": "*.md",
"files": ["package.json", "package-lock.json"],
"options": {
"tabWidth": 2,
"printWidth": 80,
"proseWrap": "always",
"htmlWhitespaceSensitivity": "strict"
"proseWrap": "always"
}
},
{

8
website/.vscode/extensions.json vendored Normal file
View File

@ -0,0 +1,8 @@
{
"recommendations": [
"dbaeumer.vscode-eslint",
"unifiedjs.vscode-mdx",
"esbenp.prettier-vscode",
"syler.sass-indented"
]
}

View File

@ -7,17 +7,16 @@ The styleguide for the spaCy website is available at
## Setup and installation
Before running the setup, make sure your versions of
[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date.
Node v10.15 or later is required.
```bash
# Clone the repository
git clone https://github.com/explosion/spaCy
cd spaCy/website
# Install Gatsby's command-line tool
npm install --global gatsby-cli
# Switch to the correct Node version
#
# If you don't have NVM and don't want to use it, you can manually switch to the Node version
# stated in /.nvmrc and skip this step
nvm use
# Install the dependencies
npm install
@ -36,8 +35,7 @@ file in the root defines the settings used in this codebase.
## Building & developing the site with Docker
Sometimes it's hard to get a local environment working due to rapid updates to
node dependencies, so it may be easier to use docker for building the docs.
While it shouldn't be necessary and is not recommended, you can run this site in a Docker container.
If you'd like to do this, **be sure you do _not_ include your local
`node_modules` folder**, since there are some dependencies that need to be built
@ -76,12 +74,14 @@ bit of time.
```yaml
├── docs # the actual markdown content
├── meta # JSON-formatted site metadata
| ├── dynamicMeta.js # At build time generated meta data
| ├── languages.json # supported languages and statistical models
| ├── sidebars.json # sidebar navigations for different sections
| ├── site.json # general site metadata
| ├── type-annotations.json # Type annotations
| └── universe.json # data for the spaCy universe section
├── public # compiled site
├── pages # Next router pages
├── public # static images and other assets
├── setup # Jinja setup
├── src # source
| ├── components # React components
@ -96,9 +96,11 @@ bit of time.
| | └── universe.js # layout templates for universe
| └── widgets # non-reusable components with content, e.g. changelog
├── .eslintrc.json # ESLint config file
├── .nvmrc # NVM config file
| # (to support "nvm use" to switch to correct Node version)
|
├── .prettierrc # Prettier config file
├── gatsby-browser.js # browser-specific hooks for Gatsby
├── gatsby-config.js # Gatsby configuration
├── gatsby-node.js # Node-specific hooks for Gatsby
└── package.json # package settings and dependencies
├── next.config.mjs # Next config file
├── package.json # package settings and dependencies
└── tsconfig.json # TypeScript config file
```

View File

@ -2,42 +2,52 @@
# spaCy Universe
The [spaCy Universe](https://spacy.io/universe) collects the many great resources developed with or for spaCy. It
includes standalone packages, plugins, extensions, educational materials,
operational utilities and bindings for other languages.
The [spaCy Universe](https://spacy.io/universe) collects the many great
resources developed with or for spaCy. It includes standalone packages, plugins,
extensions, educational materials, operational utilities and bindings for other
languages.
If you have a project that you want the spaCy community to make use of, you can
suggest it by submitting a pull request to this repository. The Universe
database is open-source and collected in a simple JSON file.
Looking for inspiration for your own spaCy plugin or extension? Check out the
[`project ideas`](https://github.com/explosion/spaCy/discussions?discussions_q=category%3A%22New+Features+%26+Project+Ideas%22)
[`project ideas`](https://github.com/explosion/spaCy/discussions?discussions_q=category%3A%22New+Features+%26+Project+Ideas%22)
discussion forum.
## Checklist
### Projects
✅ Libraries and packages should be **open-source** (with a user-friendly license) and at least somewhat **documented** (e.g. a simple `README` with usage instructions).
✅ Libraries and packages should be **open-source** (with a user-friendly
license) and at least somewhat **documented** (e.g. a simple `README` with usage
instructions).
✅ We're happy to include work in progress and prereleases, but we'd like to keep the emphasis on projects that should be useful to the community **right away**.
✅ We're happy to include work in progress and prereleases, but we'd like to
keep the emphasis on projects that should be useful to the community **right
away**.
✅ Demos and visualizers should be available via a **public URL**.
### Educational Materials
✅ Books should be **available for purchase or download** (not just pre-order). Ebooks and self-published books are fine, too, if they include enough substantial content.
✅ Books should be **available for purchase or download** (not just pre-order).
Ebooks and self-published books are fine, too, if they include enough
substantial content.
✅ The `"url"` of book entries should either point to the publisher's website or a reseller of your choice (ideally one that ships worldwide or as close as possible).
✅ The `"url"` of book entries should either point to the publisher's website or
a reseller of your choice (ideally one that ships worldwide or as close as
possible).
✅ If an online course is only available behind a paywall, it should at least have a **free excerpt** or chapter available, so users know what to expect.
✅ If an online course is only available behind a paywall, it should at least
have a **free excerpt** or chapter available, so users know what to expect.
## JSON format
To add a project, fork this repository, edit the [`universe.json`](meta/universe.json)
and add an object of the following format to the list of `"resources"`. Before
you submit your pull request, make sure to use a linter to verify that your
markup is correct.
To add a project, fork this repository, edit the
[`universe.json`](meta/universe.json) and add an object of the following format
to the list of `"resources"`. Before you submit your pull request, make sure to
use a linter to verify that your markup is correct.
```json
{
@ -69,26 +79,26 @@ markup is correct.
}
```
| Field | Type | Description |
| --- | --- | --- |
| `id` | string | Unique ID of the project. |
| `title` | string | Project title. If not set, the `id` will be used as the display title. |
| `slogan` | string | A short description of the project. Displayed in the overview and under the title. |
| `description` | string | A longer description of the project. Markdown is allowed, but should be limited to basic formatting like bold, italics, code or links. |
| `github` | string | Associated GitHub repo in the format `user/repo`. Will be displayed as a link and used for release, license and star badges. |
| `pip` | string | Package name on pip. If available, the installation command will be displayed. |
| `cran` | string | For R packages: package name on CRAN. If available, the installation command will be displayed. |
| `code_example` | array | Short example that shows how to use the project. Formatted as an array with one string per line. |
| `code_language` | string | Defaults to `'python'`. Optional code language used for syntax highlighting with [Prism](http://prismjs.com/). |
| `url` | string | Optional project link to display as button. |
| `thumb` | string | Optional URL to project thumbnail to display in overview and project header. Recommended size is 100x100px. |
| `image` | string | Optional URL to project image to display with description. |
| `author` | string | Name(s) of project author(s). |
| `author_links` | object | Usernames and links to display as icons to author info. Currently supports `twitter` and `github` usernames, as well as `website` link. |
| `category` | list | One or more categories to assign to project. Must be one of the available options. |
| `tags` | list | Still experimental and not used for filtering: one or more tags to assign to project. |
| Field | Type | Description |
| --------------- | ------ | --------------------------------------------------------------------------------------------------------------------------------------- |
| `id` | string | Unique ID of the project. |
| `title` | string | Project title. If not set, the `id` will be used as the display title. |
| `slogan` | string | A short description of the project. Displayed in the overview and under the title. |
| `description` | string | A longer description of the project. Markdown is allowed, but should be limited to basic formatting like bold, italics, code or links. |
| `github` | string | Associated GitHub repo in the format `user/repo`. Will be displayed as a link and used for release, license and star badges. |
| `pip` | string | Package name on pip. If available, the installation command will be displayed. |
| `cran` | string | For R packages: package name on CRAN. If available, the installation command will be displayed. |
| `code_example` | array | Short example that shows how to use the project. Formatted as an array with one string per line. |
| `code_language` | string | Defaults to `'python'`. Optional code language used for syntax highlighting with [Prism](http://prismjs.com/). |
| `url` | string | Optional project link to display as button. |
| `thumb` | string | Optional URL to project thumbnail to display in overview and project header. Recommended size is 100x100px. |
| `image` | string | Optional URL to project image to display with description. |
| `author` | string | Name(s) of project author(s). |
| `author_links` | object | Usernames and links to display as icons to author info. Currently supports `twitter` and `github` usernames, as well as `website` link. |
| `category` | list | One or more categories to assign to project. Must be one of the available options. |
| `tags` | list | Still experimental and not used for filtering: one or more tags to assign to project. |
To separate them from the projects, educational materials also specify
`"type": "education`. Books can also set a `"cover"` field containing a URL
to a cover image. If available, it's used in the overview and displayed on
the individual book page.
`"type": "education"`. Books can also set a `"cover"` field containing a URL to a
cover image. If available, it's used in the overview and displayed on the
individual book page.

View File

@ -26,9 +26,9 @@ part of the [training config](/usage/training#custom-functions). Also see the
usage documentation on
[layers and model architectures](/usage/layers-architectures).
## Tok2Vec architectures {#tok2vec-arch source="spacy/ml/models/tok2vec.py"}
## Tok2Vec architectures {id="tok2vec-arch",source="spacy/ml/models/tok2vec.py"}
### spacy.Tok2Vec.v2 {#Tok2Vec}
### spacy.Tok2Vec.v2 {id="Tok2Vec"}
> #### Example config
>
@ -56,7 +56,7 @@ blog post for background.
| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.HashEmbedCNN.v2 {#HashEmbedCNN}
### spacy.HashEmbedCNN.v2 {id="HashEmbedCNN"}
> #### Example Config
>
@ -89,7 +89,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.Tok2VecListener.v1 {#Tok2VecListener}
### spacy.Tok2VecListener.v1 {id="Tok2VecListener"}
> #### Example config
>
@ -139,7 +139,7 @@ the `Tok2Vec` component.
| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.MultiHashEmbed.v2 {#MultiHashEmbed}
### spacy.MultiHashEmbed.v2 {id="MultiHashEmbed"}
> #### Example config
>
@ -170,7 +170,7 @@ updated).
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.CharacterEmbed.v2 {#CharacterEmbed}
### spacy.CharacterEmbed.v2 {id="CharacterEmbed"}
> #### Example config
>
@ -207,7 +207,7 @@ network to construct a single vector to represent the information.
| `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.MaxoutWindowEncoder.v2 {#MaxoutWindowEncoder}
### spacy.MaxoutWindowEncoder.v2 {id="MaxoutWindowEncoder"}
> #### Example config
>
@ -231,7 +231,7 @@ and residual connections.
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |
### spacy.MishWindowEncoder.v2 {#MishWindowEncoder}
### spacy.MishWindowEncoder.v2 {id="MishWindowEncoder"}
> #### Example config
>
@ -254,7 +254,7 @@ and residual connections.
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |
### spacy.TorchBiLSTMEncoder.v1 {#TorchBiLSTMEncoder}
### spacy.TorchBiLSTMEncoder.v1 {id="TorchBiLSTMEncoder"}
> #### Example config
>
@ -276,7 +276,7 @@ Encode context using bidirectional LSTM layers. Requires
| `dropout` | Creates a Dropout layer on the outputs of each LSTM layer except the last layer. Set to 0.0 to disable this functionality. ~~float~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |
### spacy.StaticVectors.v2 {#StaticVectors}
### spacy.StaticVectors.v2 {id="StaticVectors"}
> #### Example config
>
@ -306,7 +306,7 @@ mapped to a zero vector. See the documentation on
| `key_attr` | Defaults to `"ORTH"`. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |
### spacy.FeatureExtractor.v1 {#FeatureExtractor}
### spacy.FeatureExtractor.v1 {id="FeatureExtractor"}
> #### Example config
>
@ -324,7 +324,7 @@ of feature names to extract, which should refer to token attributes.
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
## Transformer architectures {id="transformers",source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
The following architectures are provided by the package
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). See the
@ -341,7 +341,7 @@ for details and system requirements.
</Infobox>
### spacy-transformers.TransformerModel.v3 {#TransformerModel}
### spacy-transformers.TransformerModel.v3 {id="TransformerModel"}
> #### Example Config
>
@ -390,7 +390,7 @@ in other components, see
| | |
<Infobox title="Mixed precision support" variant="warning">
Mixed-precision support is currently an experimental feature.
Mixed-precision support is currently an experimental feature.
</Infobox>
<Accordion title="Previous versions of spacy-transformers.TransformerModel" spaced>
@ -404,7 +404,7 @@ The other arguments are shared between all versions.
</Accordion>
### spacy-transformers.TransformerListener.v1 {#TransformerListener}
### spacy-transformers.TransformerListener.v1 {id="TransformerListener"}
> #### Example Config
>
@ -434,7 +434,7 @@ a single token vector given zero or more wordpiece vectors.
| `upstream` | A string to identify the "upstream" `Transformer` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Transformer` component. You'll almost never have multiple upstream `Transformer` components, so the wildcard string will almost always be fine. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy-transformers.Tok2VecTransformer.v3 {#Tok2VecTransformer}
### spacy-transformers.Tok2VecTransformer.v3 {id="Tok2VecTransformer"}
> #### Example Config
>
@ -467,7 +467,7 @@ one component.
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
<Infobox title="Mixed precision support" variant="warning">
Mixed-precision support is currently an experimental feature.
Mixed-precision support is currently an experimental feature.
</Infobox>
<Accordion title="Previous versions of spacy-transformers.Tok2VecTransformer" spaced>
@ -481,7 +481,7 @@ The other arguments are shared between all versions.
</Accordion>
## Pretraining architectures {#pretrain source="spacy/ml/models/multi_task.py"}
## Pretraining architectures {id="pretrain",source="spacy/ml/models/multi_task.py"}
The spacy `pretrain` command lets you initialize a `Tok2Vec` layer in your
pipeline with information from raw text. To this end, additional layers are
@ -494,7 +494,7 @@ BERT.
For more information, see the section on
[pretraining](/usage/embeddings-transformers#pretraining).
### spacy.PretrainVectors.v1 {#pretrain_vectors}
### spacy.PretrainVectors.v1 {id="pretrain_vectors"}
> #### Example config
>
@ -525,7 +525,7 @@ vectors.
| `loss` | The loss function can be either "cosine" or "L2". We typically recommend using "cosine". ~~str~~ |
| **CREATES** | A callable function that can create the Model, given the `vocab` of the pipeline and the `tok2vec` layer to pretrain. ~~Callable[[Vocab, Model], Model]~~ |
### spacy.PretrainCharacters.v1 {#pretrain_chars}
### spacy.PretrainCharacters.v1 {id="pretrain_chars"}
> #### Example config
>
@ -551,20 +551,19 @@ for a Tok2Vec layer.
| `n_characters` | The window of characters - e.g. if `n_characters = 2`, the model will try to predict the first two and last two characters of the word. ~~int~~ |
| **CREATES** | A callable function that can create the Model, given the `vocab` of the pipeline and the `tok2vec` layer to pretrain. ~~Callable[[Vocab, Model], Model]~~ |
## Parser & NER architectures {#parser}
## Parser & NER architectures {id="parser"}
### spacy.TransitionBasedParser.v2 {#TransitionBasedParser source="spacy/ml/models/parser.py"}
### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TransitionBasedParser.v2"
> @architectures = "spacy.TransitionBasedParser.v3"
> state_type = "ner"
> extra_state_tokens = false
> hidden_width = 64
> maxout_pieces = 2
> use_upper = true
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v2"
@ -594,27 +593,26 @@ consists of either two or three subnetworks:
state representation. If not present, the output from the lower model is used
as action scores directly.
| Name | Description |
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
| `hidden_width` | The width of the hidden layer. ~~int~~ |
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
| Name | Description |
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
| `hidden_width` | The width of the hidden layer. ~~int~~ |
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~ |
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
<Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
[TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
same signature, but the `use_upper` argument was `True` by default.
</Accordion>
</Accordion>
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
### spacy.Tagger.v2 {#Tagger}
### spacy.Tagger.v2 {id="Tagger"}
> #### Example Config
>
@ -648,7 +646,7 @@ The other arguments are shared between all versions.
</Accordion>
## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
## Text classification architectures {id="textcat",source="spacy/ml/models/textcat.py"}
A text classification architecture needs to take a [`Doc`](/api/doc) as input,
and produce a score for each potential label class. Textcat challenges can be
@ -672,7 +670,7 @@ single-label use-cases where `exclusive_classes = true`, while the
</Infobox>
### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}
### spacy.TextCatEnsemble.v2 {id="TextCatEnsemble"}
> #### Example Config
>
@ -737,7 +735,7 @@ but used an internal `tok2vec` instead of taking it as argument:
</Accordion>
### spacy.TextCatCNN.v2 {#TextCatCNN}
### spacy.TextCatCNN.v2 {id="TextCatCNN"}
> #### Example Config
>
@ -777,7 +775,7 @@ after training.
</Accordion>
### spacy.TextCatBOW.v2 {#TextCatBOW}
### spacy.TextCatBOW.v2 {id="TextCatBOW"}
> #### Example Config
>
@ -809,9 +807,9 @@ after training.
</Accordion>
## Span classification architectures {#spancat source="spacy/ml/models/spancat.py"}
## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"}
### spacy.SpanCategorizer.v1 {#SpanCategorizer}
### spacy.SpanCategorizer.v1 {id="SpanCategorizer"}
> #### Example Config
>
@ -848,7 +846,7 @@ single vector, and a scorer model to map the vectors to probabilities.
| `scorer` | The scorer model. ~~Model[Floats2d, Floats2d]~~ |
| **CREATES** | The model using the architecture. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
### spacy.mean_max_reducer.v1 {#mean_max_reducer}
### spacy.mean_max_reducer.v1 {id="mean_max_reducer"}
Reduce sequences by concatenating their mean and max pooled vectors, and then
combine the concatenated vectors with a hidden layer.
@ -857,7 +855,7 @@ combine the concatenated vectors with a hidden layer.
| ------------- | ------------------------------------- |
| `hidden_size` | The size of the hidden layer. ~~int~~ |
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
## Entity linking architectures {id="entitylinker",source="spacy/ml/models/entity_linker.py"}
An [`EntityLinker`](/api/entitylinker) component disambiguates textual mentions
(tagged as named entities) to unique identifiers, grounding the named entities
@ -870,7 +868,7 @@ into the "real world". This requires 3 main components:
- A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
most plausible ID from the set of candidates.
### spacy.EntityLinker.v2 {#EntityLinker}
### spacy.EntityLinker.v2 {id="EntityLinker"}
> #### Example Config
>
@ -899,7 +897,7 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.EmptyKB.v1 {#EmptyKB}
### spacy.EmptyKB.v1 {id="EmptyKB"}
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created.
@ -908,7 +906,7 @@ instance. This is the default when a new entity linker component is created.
| ---------------------- | ----------------------------------------------------------------------------------- |
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
### spacy.KBFromFile.v1 {#KBFromFile}
### spacy.KBFromFile.v1 {id="KBFromFile"}
A function that reads an existing `KnowledgeBase` from file.
@ -916,7 +914,7 @@ A function that reads an existing `KnowledgeBase` from file.
| --------- | -------------------------------------------------------- |
| `kb_path` | The location of the KB that was stored to file. ~~Path~~ |
### spacy.CandidateGenerator.v1 {#CandidateGenerator}
### spacy.CandidateGenerator.v1 {id="CandidateGenerator"}
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of
@ -924,7 +922,7 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
`CandidateGenerator` uses the text of a mention to find its potential aliases in
the `KnowledgeBase`. Note that this function is case-dependent.
## Coreference {#coref-architectures tag="experimental"}
## Coreference {id="coref-architectures",tag="experimental"}
A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
the same entity. A [`SpanResolver`](/api/span-resolver) component infers spans
@ -932,7 +930,7 @@ from single tokens. Together these components can be used to reproduce
traditional coreference models. You can also omit the `SpanResolver` if working
with only token-level clusters is acceptable.
### spacy-experimental.Coref.v1 {#Coref tag="experimental"}
### spacy-experimental.Coref.v1 {id="Coref",tag="experimental"}
> #### Example Config
>
@ -967,7 +965,7 @@ The `Coref` model architecture is a Thinc `Model`.
| `antecedent_batch_size` | Internal batch size. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy-experimental.SpanResolver.v1 {#SpanResolver tag="experimental"}
### spacy-experimental.SpanResolver.v1 {id="SpanResolver",tag="experimental"}
> #### Example Config
>

View File

@ -2,7 +2,7 @@
title: AttributeRuler
tag: class
source: spacy/pipeline/attribute_ruler.py
new: 3
version: 3
teaser: 'Pipeline component for rule-based token attribute assignment'
api_string_name: attribute_ruler
api_trainable: false
@ -15,7 +15,7 @@ between attributes such as mapping fine-grained POS tags to coarse-grained POS
tags. See the [usage guide](/usage/linguistic-features/#mappings-exceptions) for
examples.
## Config and implementation {#config}
## Config and implementation {id="config"}
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
@ -37,7 +37,7 @@ how the component should be configured. You can override its settings via the
%%GITHUB_SPACY/spacy/pipeline/attribute_ruler.py
```
## AttributeRuler.\_\_init\_\_ {#init tag="method"}
## AttributeRuler.\_\_init\_\_ {id="init",tag="method"}
Initialize the attribute ruler.
@ -56,7 +56,7 @@ Initialize the attribute ruler.
| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag`", `"pos"`, `"morph"` and `"lemma"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
## AttributeRuler.\_\_call\_\_ {id="call",tag="method"}
Apply the attribute ruler to a `Doc`, setting token attributes for tokens
matched by the provided patterns.
@ -66,7 +66,7 @@ matched by the provided patterns.
| `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ |
## AttributeRuler.add {#add tag="method"}
## AttributeRuler.add {id="add",tag="method"}
Add patterns to the attribute ruler. The patterns are a list of `Matcher`
patterns and the attributes are a dict of attributes to set on the matched
@ -89,7 +89,7 @@ may be negative to index from the end of the span.
| `attrs` | The attributes to assign to the target token in the matched span. ~~Dict[str, Any]~~ |
| `index` | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to `0`. ~~int~~ |
## AttributeRuler.add_patterns {#add_patterns tag="method"}
## AttributeRuler.add_patterns {id="add_patterns",tag="method"}
> #### Example
>
@ -116,7 +116,7 @@ keys `"patterns"`, `"attrs"` and `"index"`, which match the arguments of
| ---------- | -------------------------------------------------------------------------- |
| `patterns` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
## AttributeRuler.patterns {#patterns tag="property"}
## AttributeRuler.patterns {id="patterns",tag="property"}
Get all patterns that have been added to the attribute ruler in the
`patterns_dict` format accepted by
@ -126,7 +126,7 @@ Get all patterns that have been added to the attribute ruler in the
| ----------- | -------------------------------------------------------------------------------------------- |
| **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ |
## AttributeRuler.initialize {#initialize tag="method"}
## AttributeRuler.initialize {id="initialize",tag="method"}
Initialize the component with data and used before training to load in rules
from a file. This method is typically called by
@ -160,7 +160,7 @@ config.
| `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]]~~ |
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
## AttributeRuler.load_from_tag_map {id="load_from_tag_map",tag="method"}
Load attribute ruler patterns from a tag map.
@ -168,7 +168,7 @@ Load attribute ruler patterns from a tag map.
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. ~~Dict[str, Dict[Union[int, str], Union[int, str]]]~~ |
## AttributeRuler.load_from_morph_rules {#load_from_morph_rules tag="method"}
## AttributeRuler.load_from_morph_rules {id="load_from_morph_rules",tag="method"}
Load attribute ruler patterns from morph rules.
@ -176,7 +176,7 @@ Load attribute ruler patterns from morph rules.
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
## AttributeRuler.to_disk {#to_disk tag="method"}
## AttributeRuler.to_disk {id="to_disk",tag="method"}
Serialize the pipe to disk.
@ -193,7 +193,7 @@ Serialize the pipe to disk.
| _keyword-only_ | |
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## AttributeRuler.from_disk {#from_disk tag="method"}
## AttributeRuler.from_disk {id="from_disk",tag="method"}
Load the pipe from disk. Modifies the object in place and returns it.
@ -211,7 +211,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `AttributeRuler` object. ~~AttributeRuler~~ |
## AttributeRuler.to_bytes {#to_bytes tag="method"}
## AttributeRuler.to_bytes {id="to_bytes",tag="method"}
> #### Example
>
@ -228,7 +228,7 @@ Serialize the pipe to a bytestring.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The serialized form of the `AttributeRuler` object. ~~bytes~~ |
## AttributeRuler.from_bytes {#from_bytes tag="method"}
## AttributeRuler.from_bytes {id="from_bytes",tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
@ -247,7 +247,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The `AttributeRuler` object. ~~AttributeRuler~~ |
## Serialization fields {#serialization-fields}
## Serialization fields {id="serialization-fields"}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from

View File

@ -41,10 +41,9 @@ from string attribute names to internal attribute IDs is stored in
The corresponding [`Token` object attributes](/api/token#attributes) can be
accessed using the same names in lowercase, e.g. `token.orth` or `token.length`.
For attributes that represent string values, the internal integer ID is
accessed as `Token.attr`, e.g. `token.dep`, while the string value can be
retrieved by appending `_` as in `token.dep_`.
For attributes that represent string values, the internal integer ID is accessed
as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by
appending `_` as in `token.dep_`.
| Attribute | Description |
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |

View File

@ -26,7 +26,7 @@ a list of available commands, you can type `python -m spacy --help`. You can
also add the `--help` flag to any command or subcommand to see the description,
available arguments and usage.
## download {#download tag="command"}
## download {id="download",tag="command"}
Download [trained pipelines](/usage/models) for spaCy. The downloader finds the
best-matching compatible version and uses `pip install` to download the Python
@ -44,7 +44,7 @@ pipeline name to be specified with its version (e.g. `en_core_web_sm-3.0.0`).
> will also allow you to add it as a versioned package dependency to your
> project.
```cli
```bash
$ python -m spacy download [model] [--direct] [--sdist] [pip_args]
```
@ -57,24 +57,24 @@ $ python -m spacy download [model] [--direct] [--sdist] [pip_args]
| pip args | Additional installation options to be passed to `pip install` when installing the pipeline package. For example, `--user` to install to the user home directory or `--no-deps` to not install package dependencies. ~~Any (option/flag)~~ |
| **CREATES** | The installed pipeline package in your `site-packages` directory. |
## info {#info tag="command"}
## info {id="info",tag="command"}
Print information about your spaCy installation, trained pipelines and local
setup, and generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted
markup to copy-paste into
[GitHub issues](https://github.com/explosion/spaCy/issues).
```cli
```bash
$ python -m spacy info [--markdown] [--silent] [--exclude]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy info en_core_web_lg --markdown
> ```
```cli
```bash
$ python -m spacy info [model] [--markdown] [--silent] [--exclude]
```
@ -88,7 +88,7 @@ $ python -m spacy info [model] [--markdown] [--silent] [--exclude]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Information about your spaCy installation. |
## validate {#validate new="2" tag="command"}
## validate {id="validate",version="2",tag="command"}
Find all trained pipeline packages installed in the current environment and
check whether they are compatible with the currently installed version of spaCy.
@ -103,7 +103,7 @@ compatible versions and command for updating are shown.
> suite, to ensure all packages are up to date before proceeding. If
> incompatible packages are found, it will return `1`.
```cli
```bash
$ python -m spacy validate
```
@ -111,12 +111,12 @@ $ python -m spacy validate
| ---------- | -------------------------------------------------------------------- |
| **PRINTS** | Details about the compatibility of your installed pipeline packages. |
## init {#init new="3"}
## init {id="init",version="3"}
The `spacy init` CLI includes helpful commands for initializing training config
files and pipeline directories.
### init config {#init-config new="3" tag="command"}
### init config {id="init-config",version="3",tag="command"}
Initialize and save a [`config.cfg` file](/usage/training#config) using the
**recommended settings** for your use case. It works just like the
@ -128,11 +128,11 @@ customize those settings in your config file later.
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy init config config.cfg --lang en --pipeline ner,textcat --optimize accuracy
> ```
```cli
```bash
$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--gpu] [--pretraining] [--force]
```
@ -148,7 +148,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | The config file for training. |
### init fill-config {#init-fill-config new="3"}
### init fill-config {id="init-fill-config",version="3"}
Auto-fill a partial [.cfg file](/usage/training#config) with **all default
values**, e.g. a config generated with the
@ -162,15 +162,15 @@ validation error with more details.
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy init fill-config base.cfg config.cfg --diff
> ```
>
> #### Example diff
>
> ![Screenshot of visual diff in terminal](../images/cli_init_fill-config_diff.jpg)
> ![Screenshot of visual diff in terminal](/images/cli_init_fill-config_diff.jpg)
```cli
```bash
$ python -m spacy init fill-config [base_path] [output_file] [--diff]
```
@ -184,7 +184,7 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Complete and auto-filled config file for training. |
### init vectors {#init-vectors new="3" tag="command"}
### init vectors {id="init-vectors",version="3",tag="command"}
Convert [word vectors](/usage/linguistic-features#vectors-similarity) for use
with spaCy. Will export an `nlp` object that you can use in the
@ -199,7 +199,7 @@ This functionality was previously available as part of the command `init-model`.
</Infobox>
```cli
```bash
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
```
@ -216,7 +216,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
### init labels {#init-labels new="3" tag="command"}
### init labels {id="init-labels",version="3",tag="command"}
Generate JSON files for the labels in the data. This helps speed up the training
process, since spaCy won't have to preprocess the data to extract the labels.
@ -234,7 +234,7 @@ After generating the labels, you can provide them to components that accept a
> path = "corpus/labels/ner.json"
> ```
```cli
```bash
$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
```
@ -249,7 +249,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The label files. |
## convert {#convert tag="command"}
## convert {id="convert",tag="command"}
Convert files into spaCy's
[binary training data format](/api/data-formats#binary-training), a serialized
@ -257,7 +257,7 @@ Convert files into spaCy's
management functions. The converter can be specified on the command line, or
chosen based on the file extension of the input file.
```cli
```bash
$ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] [--n-sents] [--seg-sents] [--base] [--morphology] [--merge-subtokens] [--ner-map] [--lang]
```
@ -278,7 +278,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |
### Converters {#converters}
### Converters {id="converters"}
| ID | Description |
| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -288,12 +288,12 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
| `ner` / `conll` | NER with IOB/IOB2/BILUO tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the NER tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). |
| `iob` | NER with IOB/IOB2/BILUO tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT`or`word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). |
## debug {#debug new="3"}
## debug {id="debug",version="3"}
The `spacy debug` CLI includes helpful commands for debugging and profiling your
configs, data and implementations.
### debug config {#debug-config new="3" tag="command"}
### debug config {id="debug-config",version="3",tag="command"}
Debug a [`config.cfg` file](/usage/training#config) and show validation errors.
The command will create all objects in the tree and validate them. Note that
@ -303,13 +303,13 @@ errors at once and some issues are only shown once previous errors have been
fixed. To auto-fill a partial config and save the result, you can use the
[`init fill-config`](/api/cli#init-fill-config) command.
```cli
```bash
$ python -m spacy debug config [config_path] [--code] [--show-functions] [--show-variables] [overrides]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy debug config config.cfg
> ```
@ -333,7 +333,7 @@ python -m spacy init fill-config tmp/starter-config_invalid.cfg tmp/starter-conf
<Accordion title="Example output (valid config and all options)" spaced>
```cli
```bash
$ python -m spacy debug config ./config.cfg --show-functions --show-variables
```
@ -361,7 +361,7 @@ Module spacy.language
File /path/to/spacy/language.py (line 64)
[components.ner.model]
Registry @architectures
Name spacy.TransitionBasedParser.v1
Name spacy.TransitionBasedParser.v3
Module spacy.ml.models.parser
File /path/to/spacy/ml/models/parser.py (line 11)
[components.ner.model.tok2vec]
@ -371,7 +371,7 @@ Module spacy.ml.models.tok2vec
File /path/to/spacy/ml/models/tok2vec.py (line 16)
[components.parser.model]
Registry @architectures
Name spacy.TransitionBasedParser.v1
Name spacy.TransitionBasedParser.v3
Module spacy.ml.models.parser
File /path/to/spacy/ml/models/parser.py (line 11)
[components.parser.model.tok2vec]
@ -453,7 +453,7 @@ File /path/to/thinc/thinc/schedules.py (line 91)
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Config validation errors, if available. |
### debug data {#debug-data tag="command"}
### debug data {id="debug-data",tag="command"}
Analyze, debug and validate your training and development data. Get useful
stats, and find problems like invalid entity annotations, cyclic dependencies,
@ -479,13 +479,13 @@ the token distributions. To learn more, you can check out Papay et al.'s work on
</Infobox>
```cli
```bash
$ python -m spacy debug data [config_path] [--code] [--ignore-warnings] [--verbose] [--no-format] [overrides]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy debug data ./config.cfg
> ```
@ -639,7 +639,7 @@ will not be available.
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Debugging information. |
### debug diff-config {#debug-diff tag="command"}
### debug diff-config {id="debug-diff",tag="command"}
Show a diff of a config file with respect to spaCy's defaults or another config
file. If additional settings were used in the creation of the config file, then
@ -647,13 +647,13 @@ you must supply these as extra parameters to the command when comparing to the
default settings. The generated diff can also be used when posting to the
discussion forum to provide more information for the maintainers.
```cli
```bash
$ python -m spacy debug diff-config [config_path] [--compare-to] [--optimize] [--gpu] [--pretraining] [--markdown]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy debug diff-config ./config.cfg
> ```
@ -696,7 +696,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
- hidden_width = 64
@ -719,7 +719,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
update_with_oracle_cut_size = 100
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
@ -868,7 +868,7 @@ after_init = null
| `markdown`, `-md` | Generate Markdown for Github issues. Defaults to `False`. ~~bool (flag)~~ |
| **PRINTS** | Diff between the two config files. |
### debug profile {#debug-profile tag="command"}
### debug profile {id="debug-profile",tag="command"}
Profile which functions take the most time in a spaCy pipeline. Input should be
formatted as one JSON object per line with a key `"text"`. It can either be
@ -882,7 +882,7 @@ The `profile` command is now available as a subcommand of `spacy debug`.
</Infobox>
```cli
```bash
$ python -m spacy debug profile [model] [inputs] [--n-texts]
```
@ -894,12 +894,12 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Profiling information for the pipeline. |
### debug model {#debug-model new="3" tag="command"}
### debug model {id="debug-model",version="3",tag="command"}
Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a
sample text and checking how it updates its internal weights and parameters.
```cli
```bash
$ python -m spacy debug model [config_path] [component] [--layers] [--dimensions] [--parameters] [--gradients] [--attributes] [--print-step0] [--print-step1] [--print-step2] [--print-step3] [--gpu-id]
```
@ -910,7 +910,7 @@ model ("Step 0"), which helps us to understand the internal structure of the
Neural Network, and to focus on specific layers that we want to inspect further
(see next example).
```cli
```bash
$ python -m spacy debug model ./config.cfg tagger -P0
```
@ -956,7 +956,7 @@ an all-zero matrix determined by the `nO` and `nI` dimensions. After a first
training step (Step 2), this matrix has clearly updated its values through the
training feedback loop.
```cli
```bash
$ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P2
```
@ -1017,7 +1017,7 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Debugging information. |
## train {#train tag="command"}
## train {id="train",tag="command"}
Train a pipeline. Expects data in spaCy's
[binary format](/api/data-formats#training) and a
@ -1043,11 +1043,11 @@ in the section `[paths]`.
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy train config.cfg --output ./output --paths.train ./train --paths.dev ./dev
> ```
```cli
```bash
$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
```
@ -1062,7 +1062,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The final trained pipeline and the best trained pipeline. |
### Calling the training function from Python {#train-function new="3.2"}
### Calling the training function from Python {id="train-function",version="3.2"}
The training CLI exposes a `train` helper function that lets you run the
training just like `spacy train`. Usually it's easier to use the command line
@ -1085,7 +1085,7 @@ directly, but if you need to kick off training from code this is how to do it.
| `use_gpu` | Which GPU to use. Defaults to -1 for no GPU. ~~int~~ |
| `overrides` | Values to override config settings. ~~Dict[str, Any]~~ |
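If you need a quick reference, the following is a minimal sketch of calling the helper from Python; the config path, output directory and data paths are placeholders for your own files.

```python
# A minimal sketch of kicking off training from Python; all paths and
# override values below are placeholders.
from spacy.cli.train import train

train(
    "./config.cfg",
    output_path="./output",
    overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
)
```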
## pretrain {#pretrain new="2.1" tag="command,experimental"}
## pretrain {id="pretrain",version="2.1",tag="command,experimental"}
Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
components on raw text, using an approximate language-modeling objective.
@ -1113,11 +1113,11 @@ auto-generated by setting `--pretraining` on
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy pretrain config.cfg ./output_pretrain --paths.raw_text ./data.jsonl
> ```
```cli
```bash
$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
```
@ -1133,7 +1133,7 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
## evaluate {#evaluate new="2" tag="command"}
## evaluate {id="evaluate",version="2",tag="command"}
Evaluate a trained pipeline. Expects a loadable spaCy pipeline (package name or
path) and evaluation data in the
@ -1146,7 +1146,7 @@ skew. To render a sample of dependency parses in a HTML file using the
[displaCy visualizations](/usage/visualizers), set as output directory as the
`--displacy-path` argument.
```cli
```bash
$ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
```
@ -1163,7 +1163,7 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Training results and optional metrics and visualizations. |
## apply {#apply new="3.5" tag="command"}
## apply {id="apply", version="3.5", tag="command"}
Applies a trained pipeline to data and stores the resulting annotated documents
in a `DocBin`. The input can be a single file or a directory. The recognized
@ -1194,7 +1194,8 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
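The resulting `DocBin` can be loaded back in Python to inspect the stored annotations. A minimal sketch, where the output path is a placeholder for the file created by the command:

```python
# A minimal sketch of reading the annotations produced by `spacy apply`;
# the output path is a placeholder. DocBin stores the strings it needs,
# so a blank vocab is enough for deserialization.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin().from_disk("./predictions.spacy")
for doc in doc_bin.get_docs(nlp.vocab):
    print(doc.text, [(ent.text, ent.label_) for ent in doc.ents])
```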
## find-threshold {#find-threshold new="3.5" tag="command"}
## find-threshold {id="find-threshold",version="3.5",tag="command"}
Runs prediction trials for a trained model with varying thresholds to maximize
the specified metric. The search space for the threshold is traversed linearly
@ -1209,12 +1210,12 @@ be provided.
> #### Examples
>
> ```cli
> ```bash
> # For textcat_multilabel:
> $ python -m spacy find-threshold my_nlp data.spacy textcat_multilabel threshold cats_macro_f
> ```
>
> ```cli
> ```bash
> # For spancat:
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
> ```
@ -1233,7 +1234,7 @@ be provided.
| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
## assemble {#assemble tag="command"}
## assemble {id="assemble",tag="command"}
Assemble a pipeline from a config file without additional training. Expects a
[config file](/api/data-formats#config) with all settings and hyperparameters.
@ -1243,11 +1244,11 @@ config.
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy assemble config.cfg ./output
> ```
```cli
```bash
$ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [overrides]
```
@ -1261,7 +1262,7 @@ $ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [over
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.data ./data`. ~~Any (option/flag)~~ |
| **CREATES** | The final assembled pipeline. |
## package {#package tag="command"}
## package {id="package",tag="command"}
Generate an installable [Python package](/usage/training#models-generating) from
an existing pipeline data directory. All data files are copied over. If
@ -1287,13 +1288,13 @@ the sdist and wheel by setting `--build sdist,wheel`.
</Infobox>
```cli
```bash
$ python -m spacy package [input_dir] [output_dir] [--code] [--meta-path] [--create-meta] [--build] [--name] [--version] [--force]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy package /input /output
> $ cd /output/en_pipeline-0.0.0
> $ pip install dist/en_pipeline-0.0.0.tar.gz
@ -1313,13 +1314,13 @@ $ python -m spacy package [input_dir] [output_dir] [--code] [--meta-path] [--cre
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A Python package containing the spaCy pipeline. |
## project {#project new="3"}
## project {id="project",version="3"}
The `spacy project` CLI includes subcommands for working with
[spaCy projects](/usage/projects), end-to-end workflows for building and
deploying custom spaCy pipelines.
### project clone {#project-clone tag="command"}
### project clone {id="project-clone",tag="command"}
Clone a project template from a Git repository. Calls into `git` under the hood
and can use the sparse checkout feature if available, so you're only downloading
@ -1328,19 +1329,19 @@ what you need. By default, spaCy's
can provide any other repo (public or private) that you have access to using the
`--repo` option.
```cli
```bash
$ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy project clone pipelines/ner_wikiner
> ```
>
> Clone from custom repo:
>
> ```cli
> ```bash
> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo
> ```
@ -1354,7 +1355,7 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | The cloned [project directory](/usage/projects#project-files). |
### project assets {#project-assets tag="command"}
### project assets {id="project-assets",tag="command"}
Fetch project assets like datasets and pretrained weights. Assets are defined in
the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a
@ -1365,13 +1366,13 @@ considered "private" and you have to take care of putting them into the
destination directory yourself. If a local path is provided, the asset is copied
into the current project.
```cli
```bash
$ python -m spacy project assets [project_dir]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy project assets [--sparse]
> ```
@ -1382,7 +1383,7 @@ $ python -m spacy project assets [project_dir]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
### project run {#project-run tag="command"}
### project run {id="project-run",tag="command"}
Run a named command or workflow defined in the
[`project.yml`](/usage/projects#project-yml). If a workflow name is specified,
@ -1391,13 +1392,13 @@ all commands in the workflow are run, in order. If commands define
re-run if state has changed. For example, if the input dataset changes, a
preprocessing command that depends on those files will be re-run.
```cli
```bash
$ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy project run train
> ```
@ -1410,7 +1411,7 @@ $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **EXECUTES** | The command defined in the `project.yml`. |
### project push {#project-push tag="command"}
### project push {id="project-push",tag="command"}
Upload all available files or directories listed in the `outputs` section of
commands to a remote storage. Outputs are archived and compressed prior to
@ -1430,13 +1431,13 @@ remote storages, so you can use any protocol that `Pathy` supports, including
filesystem, although you may need to install extra dependencies to use certain
protocols.
```cli
```bash
$ python -m spacy project push [remote] [project_dir]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy project push my_bucket
> ```
>
@ -1453,7 +1454,7 @@ $ python -m spacy project push [remote] [project_dir]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **UPLOADS** | All project outputs that exist and are not already stored in the remote. |
### project pull {#project-pull tag="command"}
### project pull {id="project-pull",tag="command"}
Download all files or directories listed as `outputs` for commands, unless they
are already present locally. When searching for files in the remote, `pull`
@ -1475,13 +1476,13 @@ remote storages, so you can use any protocol that `Pathy` supports, including
filesystem, although you may need to install extra dependencies to use certain
protocols.
```cli
```bash
$ python -m spacy project pull [remote] [project_dir]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy project pull my_bucket
> ```
>
@ -1498,7 +1499,7 @@ $ python -m spacy project pull [remote] [project_dir]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. |
### project document {#project-document tag="command"}
### project document {id="project-document",tag="command"}
Auto-generate a pretty Markdown-formatted `README` for your project, based on
its [`project.yml`](/usage/projects#project-yml). Will create sections that
@ -1507,13 +1508,13 @@ content will be placed between two hidden markers, so you can add your own
custom content before or after the auto-generated documentation. When you re-run
the `project document` command, only the auto-generated part is replaced.
```cli
```bash
$ python -m spacy project document [project_dir] [--output] [--no-emoji]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy project document --output README.md
> ```
@ -1522,7 +1523,7 @@ $ python -m spacy project document [project_dir] [--output] [--no-emoji]
For more examples, see the templates in our
[`projects`](https://github.com/explosion/projects) repo.
![Screenshot of auto-generated Markdown Readme](../images/project_document.jpg)
![Screenshot of auto-generated Markdown Readme](/images/project_document.jpg)
</Accordion>
@ -1533,7 +1534,7 @@ For more examples, see the templates in our
| `--no-emoji`, `-NE` | Don't use emoji in the titles. ~~bool (flag)~~ |
| **CREATES** | The Markdown-formatted project documentation. |
### project dvc {#project-dvc tag="command"}
### project dvc {id="project-dvc",tag="command"}
Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls
[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
@ -1553,13 +1554,13 @@ You'll also need to add the assets you want to track with
</Infobox>
```cli
```bash
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
```
> #### Example
>
> ```cli
> ```bash
> $ git init
> $ dvc init
> $ python -m spacy project dvc all
@ -1575,14 +1576,14 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
## huggingface-hub {#huggingface-hub new="3.1"}
## huggingface-hub {id="huggingface-hub",version="3.1"}
The `spacy huggingface-cli` CLI includes commands for uploading your trained
spaCy pipelines to the [Hugging Face Hub](https://huggingface.co/).
> #### Installation
>
> ```cli
> ```bash
> $ pip install spacy-huggingface-hub
> $ huggingface-cli login
> ```
@ -1596,19 +1597,19 @@ package installed. Installing the package will automatically add the
</Infobox>
### huggingface-hub push {#huggingface-hub-push tag="command"}
### huggingface-hub push {id="huggingface-hub-push",tag="command"}
Push a spaCy pipeline to the Hugging Face Hub. Expects a `.whl` file packaged
with [`spacy package`](/api/cli#package) and `--build wheel`. For more details,
see the spaCy project [integration](/usage/projects#huggingface_hub).
```cli
```bash
$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose]
```
> #### Example
>
> ```cli
> ```bash
> $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl
> ```

View File

@ -34,7 +34,7 @@ same thing. Clusters are represented as SpanGroups that start with a prefix
A `CoreferenceResolver` component can be paired with a
[`SpanResolver`](/api/span-resolver) to expand single tokens to spans.
## Assigned Attributes {#assigned-attributes}
## Assigned Attributes {id="assigned-attributes"}
Predictions will be saved to `Doc.spans` as a [`SpanGroup`](/api/spangroup). The
span key will be a prefix plus a serial number referring to the coreference
@ -47,7 +47,7 @@ parameter.
| ------------------------------------------ | ------------------------------------------------------------------------------------------------------- |
| `Doc.spans[prefix + "_" + cluster_number]` | One coreference cluster, represented as single-token spans. Cluster numbers start from 1. ~~SpanGroup~~ |
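As an illustration, a minimal sketch of reading the predicted clusters, assuming `nlp` is a pipeline with a trained coreference component and the default `coref_clusters` prefix:

```python
# A minimal sketch, assuming `nlp` contains a trained coreference component
# and uses the default "coref_clusters" span key prefix.
doc = nlp("John called his sister. She was not at home.")
for key, cluster in doc.spans.items():
    if key.startswith("coref_clusters"):
        print(key, [span.text for span in cluster])
```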
## Config and implementation {#config}
## Config and implementation {id="config"}
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
@ -73,7 +73,7 @@ details on the architectures and their arguments and hyperparameters.
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Coref](/api/architectures#Coref). ~~Model~~ |
| `span_cluster_prefix` | The prefix for the keys for clusters saved to `doc.spans`. Defaults to `coref_clusters`. ~~str~~ |
## CoreferenceResolver.\_\_init\_\_ {#init tag="method"}
## CoreferenceResolver.\_\_init\_\_ {id="init",tag="method"}
> #### Example
>
@ -102,7 +102,7 @@ shortcut for this and instantiate the component using its string name and
| _keyword-only_ | |
| `span_cluster_prefix` | The prefix for the key for saving clusters of spans. ~~bool~~ |
## CoreferenceResolver.\_\_call\_\_ {#call tag="method"}
## CoreferenceResolver.\_\_call\_\_ {id="call",tag="method"}
Apply the pipe to one document. The document is modified in place and returned.
This usually happens under the hood when the `nlp` object is called on a text
@ -125,7 +125,7 @@ and all pipeline components are applied to the `Doc` in order. Both
| `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ |
## CoreferenceResolver.pipe {#pipe tag="method"}
## CoreferenceResolver.pipe {id="pipe",tag="method"}
Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
@ -148,7 +148,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/coref#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## CoreferenceResolver.initialize {#initialize tag="method"}
## CoreferenceResolver.initialize {id="initialize",tag="method"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. **At least one example
@ -172,7 +172,7 @@ by [`Language.initialize`](/api/language#initialize).
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
## CoreferenceResolver.predict {#predict tag="method"}
## CoreferenceResolver.predict {id="predict",tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them. Clusters are returned as a list of `MentionClusters`, one for
@ -192,7 +192,7 @@ to token indices.
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | The predicted coreference clusters for the `docs`. ~~List[MentionClusters]~~ |
## CoreferenceResolver.set_annotations {#set_annotations tag="method"}
## CoreferenceResolver.set_annotations {id="set_annotations",tag="method"}
Modify a batch of documents, saving coreference clusters in `Doc.spans`.
@ -209,7 +209,7 @@ Modify a batch of documents, saving coreference clusters in `Doc.spans`.
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `clusters` | The predicted coreference clusters for the `docs`. ~~List[MentionClusters]~~ |
## CoreferenceResolver.update {#update tag="method"}
## CoreferenceResolver.update {id="update",tag="method"}
Learn from a batch of [`Example`](/api/example) objects. Delegates to
[`predict`](/api/coref#predict).
@ -231,7 +231,7 @@ Learn from a batch of [`Example`](/api/example) objects. Delegates to
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## CoreferenceResolver.create_optimizer {#create_optimizer tag="method"}
## CoreferenceResolver.create_optimizer {id="create_optimizer",tag="method"}
Create an optimizer for the pipeline component.
@ -246,7 +246,7 @@ Create an optimizer for the pipeline component.
| ----------- | ---------------------------- |
| **RETURNS** | The optimizer. ~~Optimizer~~ |
## CoreferenceResolver.use_params {#use_params tag="method, contextmanager"}
## CoreferenceResolver.use_params {id="use_params",tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values. At the end of the
context, the original parameters are restored.
@ -263,7 +263,7 @@ context, the original parameters are restored.
| -------- | -------------------------------------------------- |
| `params` | The parameter values to use in the model. ~~dict~~ |
## CoreferenceResolver.to_disk {#to_disk tag="method"}
## CoreferenceResolver.to_disk {id="to_disk",tag="method"}
Serialize the pipe to disk.
@ -280,7 +280,7 @@ Serialize the pipe to disk.
| _keyword-only_ | |
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## CoreferenceResolver.from_disk {#from_disk tag="method"}
## CoreferenceResolver.from_disk {id="from_disk",tag="method"}
Load the pipe from disk. Modifies the object in place and returns it.
@ -298,7 +298,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `CoreferenceResolver` object. ~~CoreferenceResolver~~ |
## CoreferenceResolver.to_bytes {#to_bytes tag="method"}
## CoreferenceResolver.to_bytes {id="to_bytes",tag="method"}
> #### Example
>
@ -315,7 +315,7 @@ Serialize the pipe to a bytestring, including the `KnowledgeBase`.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The serialized form of the `CoreferenceResolver` object. ~~bytes~~ |
## CoreferenceResolver.from_bytes {#from_bytes tag="method"}
## CoreferenceResolver.from_bytes {id="from_bytes",tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
@ -334,7 +334,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The `CoreferenceResolver` object. ~~CoreferenceResolver~~ |
## Serialization fields {#serialization-fields}
## Serialization fields {id="serialization-fields"}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from

View File

@ -3,7 +3,7 @@ title: Corpus
teaser: An annotated corpus
tag: class
source: spacy/training/corpus.py
new: 3
version: 3
---
This class manages annotated corpora and can be used for training and
@ -13,7 +13,7 @@ customize the data loading during training, you can register your own
see the usage guide on [data utilities](/usage/training#data) for more details
and examples.
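As a quick illustration, a minimal sketch of iterating a corpus directly in Python; the data path is a placeholder for your own `.spacy` file:

```python
# A minimal sketch of iterating training examples from a .spacy file;
# the path is a placeholder.
import spacy
from spacy.training import Corpus

nlp = spacy.blank("en")
corpus = Corpus("./train.spacy")
for example in corpus(nlp):
    print(example.reference)
```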
## Config and implementation {#config}
## Config and implementation {id="config"}
`spacy.Corpus.v1` is a registered function that creates a `Corpus` of training
or evaluation data. It takes the same arguments as the `Corpus` class and
@ -49,7 +49,7 @@ streaming.
%%GITHUB_SPACY/spacy/training/corpus.py
```
## Corpus.\_\_init\_\_ {#init tag="method"}
## Corpus.\_\_init\_\_ {id="init",tag="method"}
Create a `Corpus` for iterating [Example](/api/example) objects from a file or
directory of [`.spacy` data files](/api/data-formats#binary-training). The
@ -81,7 +81,7 @@ train/test skew.
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ |
## Corpus.\_\_call\_\_ {#call tag="method"}
## Corpus.\_\_call\_\_ {id="call",tag="method"}
Yield examples from the data.
@ -101,7 +101,7 @@ Yield examples from the data.
| `nlp` | The current `nlp` object. ~~Language~~ |
| **YIELDS** | The examples. ~~Example~~ |
## JsonlCorpus {#jsonlcorpus tag="class"}
## JsonlCorpus {id="jsonlcorpus",tag="class"}
Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
formatted raw text files. Can be used to read the raw text corpus for language
@ -120,14 +120,13 @@ file.
> srsly.write_jsonl("/path/to/text.jsonl", data)
> ```
```json
### Example
```json {title="Example"}
{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
```
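A minimal sketch of reading such a file back with `JsonlCorpus`; the path is a placeholder for your own JSONL file:

```python
# A minimal sketch; the path is a placeholder for your own JSONL file.
import spacy
from spacy.training import JsonlCorpus

nlp = spacy.blank("en")
corpus = JsonlCorpus("./text.jsonl")
texts = [example.reference.text for example in corpus(nlp)]
```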
### JsonlCorpus.\_\init\_\_ {#jsonlcorpus tag="method"}
### JsonlCorpus.\_\_init\_\_ {id="jsonlcorpus",tag="method"}
Initialize the reader.
@ -157,7 +156,7 @@ Initialize the reader.
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
### JsonlCorpus.\_\_call\_\_ {#jsonlcorpus-call tag="method"}
### JsonlCorpus.\_\_call\_\_ {id="jsonlcorpus-call",tag="method"}
Yield examples from the data.

View File

@ -9,7 +9,7 @@ menu:
- ['StringStore', 'stringstore']
---
## Doc {#doc tag="cdef class" source="spacy/tokens/doc.pxd"}
## Doc {id="doc",tag="cdef class",source="spacy/tokens/doc.pxd"}
The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc)
structs.
@ -21,7 +21,7 @@ accessed from Python. For the Python documentation, see [`Doc`](/api/doc).
</Infobox>
### Attributes {#doc_attributes}
### Attributes {id="doc_attributes"}
| Name | Description |
| ------------ | -------------------------------------------------------------------------------------------------------- |
@ -31,7 +31,7 @@ accessed from Python. For the Python documentation, see [`Doc`](/api/doc).
| `length` | The number of tokens in the document. ~~int~~ |
| `max_length` | The underlying size of the `Doc.c` array. ~~int~~ |
### Doc.push_back {#doc_push_back tag="method"}
### Doc.push_back {id="doc_push_back",tag="method"}
Append a token to the `Doc`. The token can be provided as a
[`LexemeC`](/api/cython-structs#lexemec) or
@ -55,7 +55,7 @@ Append a token to the `Doc`. The token can be provided as a
| `lex_or_tok` | The word to append to the `Doc`. ~~LexemeOrToken~~ |
| `has_space` | Whether the word has trailing whitespace. ~~bint~~ |
## Token {#token tag="cdef class" source="spacy/tokens/token.pxd"}
## Token {id="token",tag="cdef class",source="spacy/tokens/token.pxd"}
A Cython class providing access and methods for a
[`TokenC`](/api/cython-structs#tokenc) struct. Note that the `Token` object does
@ -68,7 +68,7 @@ accessed from Python. For the Python documentation, see [`Token`](/api/token).
</Infobox>
### Attributes {#token_attributes}
### Attributes {id="token_attributes"}
| Name | Description |
| ------- | -------------------------------------------------------------------------- |
@ -77,7 +77,7 @@ accessed from Python. For the Python documentation, see [`Token`](/api/token).
| `i` | The offset of the token within the document. ~~int~~ |
| `doc` | The parent document. ~~Doc~~ |
### Token.cinit {#token_cinit tag="method"}
### Token.cinit {id="token_cinit",tag="method"}
Create a `Token` object from a `TokenC*` pointer.
@ -94,7 +94,7 @@ Create a `Token` object from a `TokenC*` pointer.
| `offset` | The offset of the token within the document. ~~int~~ |
| `doc` | The parent document. ~~int~~ |
## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"}
## Span {id="span",tag="cdef class",source="spacy/tokens/span.pxd"}
A Cython class providing access and methods for a slice of a `Doc` object.
@ -105,7 +105,7 @@ accessed from Python. For the Python documentation, see [`Span`](/api/span).
</Infobox>
### Attributes {#span_attributes}
### Attributes {id="span_attributes"}
| Name | Description |
| ------------ | ----------------------------------------------------------------------------- |
@ -116,7 +116,7 @@ accessed from Python. For the Python documentation, see [`Span`](/api/span).
| `end_char` | The index of the last character of the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~attr_t (uint64_t)~~ |
## Lexeme {#lexeme tag="cdef class" source="spacy/lexeme.pxd"}
## Lexeme {id="lexeme",tag="cdef class",source="spacy/lexeme.pxd"}
A Cython class providing access and methods for an entry in the vocabulary.
@ -127,7 +127,7 @@ accessed from Python. For the Python documentation, see [`Lexeme`](/api/lexeme).
</Infobox>
### Attributes {#lexeme_attributes}
### Attributes {id="lexeme_attributes"}
| Name | Description |
| ------- | ----------------------------------------------------------------------------- |
@ -135,7 +135,7 @@ accessed from Python. For the Python documentation, see [`Lexeme`](/api/lexeme).
| `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ |
| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
## Vocab {#vocab tag="cdef class" source="spacy/vocab.pxd"}
## Vocab {id="vocab",tag="cdef class",source="spacy/vocab.pxd"}
A Cython class providing access and methods for a vocabulary and other data
shared across a language.
@ -147,7 +147,7 @@ accessed from Python. For the Python documentation, see [`Vocab`](/api/vocab).
</Infobox>
### Attributes {#vocab_attributes}
### Attributes {id="vocab_attributes"}
| Name | Description |
| --------- | ---------------------------------------------------------------------------------------------------------- |
@ -155,7 +155,7 @@ accessed from Python. For the Python documentation, see [`Vocab`](/api/vocab).
| `strings` | A `StringStore` that maps string to hash values and vice versa. ~~StringStore~~ |
| `length` | The number of entries in the vocabulary. ~~int~~ |
### Vocab.get {#vocab_get tag="method"}
### Vocab.get {id="vocab_get",tag="method"}
Retrieve a [`LexemeC*`](/api/cython-structs#lexemec) pointer from the
vocabulary.
@ -172,7 +172,7 @@ vocabulary.
| `string` | The string of the word to look up. ~~str~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
### Vocab.get_by_orth {#vocab_get_by_orth tag="method"}
### Vocab.get_by_orth {id="vocab_get_by_orth",tag="method"}
Retrieve a [`LexemeC*`](/api/cython-structs#lexemec) pointer from the
vocabulary.
@ -189,7 +189,7 @@ vocabulary.
| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
## StringStore {#stringstore tag="cdef class" source="spacy/strings.pxd"}
## StringStore {id="stringstore",tag="cdef class",source="spacy/strings.pxd"}
A lookup table to retrieve strings by 64-bit hashes.
@ -201,7 +201,7 @@ accessed from Python. For the Python documentation, see
</Infobox>
### Attributes {#stringstore_attributes}
### Attributes {id="stringstore_attributes"}
| Name | Description |
| ------ | ---------------------------------------------------------------------------------------------------------------- |

View File

@ -7,7 +7,7 @@ menu:
- ['LexemeC', 'lexemec']
---
## TokenC {#tokenc tag="C struct" source="spacy/structs.pxd"}
## TokenC {id="tokenc",tag="C struct",source="spacy/structs.pxd"}
Cython data container for the `Token` object.
@ -39,7 +39,7 @@ Cython data container for the `Token` object.
| `ent_type` | Named entity type. ~~attr_t (uint64_t)~~ |
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~attr_t (uint64_t)~~ |
### Token.get_struct_attr {#token_get_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"}
### Token.get_struct_attr {id="token_get_struct_attr",tag="staticmethod, nogil",source="spacy/tokens/token.pxd"}
Get the value of an attribute from the `TokenC` struct by attribute ID.
@ -58,7 +58,7 @@ Get the value of an attribute from the `TokenC` struct by attribute ID.
| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| **RETURNS** | The value of the attribute. ~~attr_t (uint64_t)~~ |
### Token.set_struct_attr {#token_set_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"}
### Token.set_struct_attr {id="token_set_struct_attr",tag="staticmethod, nogil",source="spacy/tokens/token.pxd"}
Set the value of an attribute of the `TokenC` struct by attribute ID.
@ -78,7 +78,7 @@ Set the value of an attribute of the `TokenC` struct by attribute ID.
| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| `value` | The value to set. ~~attr_t (uint64_t)~~ |
### token_by_start {#token_by_start tag="function" source="spacy/tokens/doc.pxd"}
### token_by_start {id="token_by_start",tag="function",source="spacy/tokens/doc.pxd"}
Find a token in a `TokenC*` array by the offset of its first character.
@ -100,7 +100,7 @@ Find a token in a `TokenC*` array by the offset of its first character.
| `start_char` | The start index to search for. ~~int~~ |
| **RETURNS** | The index of the token in the array or `-1` if not found. ~~int~~ |
### token_by_end {#token_by_end tag="function" source="spacy/tokens/doc.pxd"}
### token_by_end {id="token_by_end",tag="function",source="spacy/tokens/doc.pxd"}
Find a token in a `TokenC*` array by the offset of its final character.
@ -122,7 +122,7 @@ Find a token in a `TokenC*` array by the offset of its final character.
| `end_char` | The end index to search for. ~~int~~ |
| **RETURNS** | The index of the token in the array or `-1` if not found. ~~int~~ |
### set_children_from_heads {#set_children_from_heads tag="function" source="spacy/tokens/doc.pxd"}
### set_children_from_heads {id="set_children_from_heads",tag="function",source="spacy/tokens/doc.pxd"}
Set attributes that allow lookup of syntactic children on a `TokenC*` array.
This function must be called after making changes to the `TokenC.head`
@ -148,7 +148,7 @@ attribute, in order to make the parse tree navigation consistent.
| `tokens` | A `TokenC*` array. ~~const TokenC\*~~ |
| `length` | The number of tokens in the array. ~~int~~ |
## LexemeC {#lexemec tag="C struct" source="spacy/structs.pxd"}
## LexemeC {id="lexemec",tag="C struct",source="spacy/structs.pxd"}
Struct holding information about a lexical type. `LexemeC` structs are usually
owned by the `Vocab`, and accessed through a read-only pointer on the `TokenC`
@ -172,7 +172,7 @@ struct.
| `prefix` | Length-N substring from the start of the lexeme. Defaults to `N=1`. ~~attr_t (uint64_t)~~ |
| `suffix` | Length-N substring from the end of the lexeme. Defaults to `N=3`. ~~attr_t (uint64_t)~~ |
### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
### Lexeme.get_struct_attr {id="lexeme_get_struct_attr",tag="staticmethod, nogil",source="spacy/lexeme.pxd"}
Get the value of an attribute from the `LexemeC` struct by attribute ID.
@ -192,7 +192,7 @@ Get the value of an attribute from the `LexemeC` struct by attribute ID.
| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| **RETURNS** | The value of the attribute. ~~attr_t (uint64_t)~~ |
### Lexeme.set_struct_attr {#lexeme_set_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
### Lexeme.set_struct_attr {id="lexeme_set_struct_attr",tag="staticmethod, nogil",source="spacy/lexeme.pxd"}
Set the value of an attribute of the `LexemeC` struct by attribute ID.
@ -212,7 +212,7 @@ Set the value of an attribute of the `LexemeC` struct by attribute ID.
| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| `value` | The value to set. ~~attr_t (uint64_t)~~ |
### Lexeme.c_check_flag {#lexeme_c_check_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
### Lexeme.c_check_flag {id="lexeme_c_check_flag",tag="staticmethod, nogil",source="spacy/lexeme.pxd"}
Check the value of a binary flag attribute.
@ -232,7 +232,7 @@ Check the value of a binary flag attribute.
| `flag_id` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| **RETURNS** | The boolean value of the flag. ~~bint~~ |
### Lexeme.c_set_flag {#lexeme_c_set_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
### Lexeme.c_set_flag {id="lexeme_c_set_flag",tag="staticmethod, nogil",source="spacy/lexeme.pxd"}
Set the value of a binary flag attribute.

View File

@ -6,7 +6,7 @@ menu:
- ['Conventions', 'conventions']
---
## Overview {#overview hidden="true"}
## Overview {id="overview",hidden="true"}
> #### What's Cython?
>
@ -37,7 +37,7 @@ class holds a [`LexemeC`](/api/cython-structs#lexemec) struct, at `Lexeme.c`.
This lets you shed the Python container, and pass a pointer to the underlying
data into C-level functions.
## Conventions {#conventions}
## Conventions {id="conventions"}
spaCy's core data structures are implemented as [Cython](http://cython.org/)
`cdef` classes. Memory is managed through the

View File

@ -14,7 +14,7 @@ vocabulary data. For an overview of label schemes used by the models, see the
[models directory](/models). Each trained pipeline documents the label schemes
used in its components, depending on the data it was trained on.
## Training config {#config new="3"}
## Training config {id="config",version="3"}
Config files define the training process and pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
@ -52,7 +52,7 @@ your config and check that it's valid, you can run the
</Infobox>
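Configs can also be inspected programmatically. A minimal sketch of loading one in Python; the path is a placeholder:

```python
# A minimal sketch of loading and inspecting a training config;
# the config path is a placeholder.
from spacy import util

config = util.load_config("./config.cfg", interpolate=True)
print(config["nlp"]["lang"], config["nlp"]["pipeline"])
```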
### nlp {#config-nlp tag="section"}
### nlp {id="config-nlp",tag="section"}
> #### Example
>
@ -83,7 +83,7 @@ Defines the `nlp` object, its tokenizer and
| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ |
| `batch_size` | Default batch size for [`Language.pipe`](/api/language#pipe) and [`Language.evaluate`](/api/language#evaluate). ~~int~~ |
### components {#config-components tag="section"}
### components {id="config-components",tag="section"}
> #### Example
>
@ -106,7 +106,7 @@ function to use to create component) or a `source` (name of path of trained
pipeline to copy components from). See the docs on
[defining pipeline components](/usage/training#config-components) for details.
### paths, system {#config-variables tag="variables"}
### paths, system {id="config-variables",tag="variables"}
These sections define variables that can be referenced across the other sections
as variables. For example `${paths.train}` uses the value of `train` defined in
@ -116,11 +116,11 @@ need paths, you can define them here. All config values can also be
[`spacy train`](/api/cli#train), which is especially relevant for data paths
that you don't want to hard-code in your config file.
```cli
```bash
$ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
```
### corpora {#config-corpora tag="section"}
### corpora {id="config-corpora",tag="section"}
> #### Example
>
@ -176,7 +176,7 @@ single corpus once and then divide it up into `train` and `dev` partitions.
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `corpora` | A dictionary keyed by string names, mapped to corpus functions that receive the current `nlp` object and return an iterator of [`Example`](/api/example) objects. ~~Dict[str, Callable[[Language], Iterator[Example]]]~~ |
### training {#config-training tag="section"}
### training {id="config-training",tag="section"}
This section defines settings and controls for the training and evaluation
process that are used when you run [`spacy train`](/api/cli#train).
@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
| `before_update` <Tag variant="new">3.5</Tag> | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
@ -202,7 +202,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
### pretraining {#config-pretraining tag="section,optional"}
### pretraining {id="config-pretraining",tag="section,optional"}
This section is optional and defines settings and controls for
[language model pretraining](/usage/embeddings-transformers#pretraining). It's
@ -220,7 +220,7 @@ used when you run [`spacy pretrain`](/api/cli#pretrain).
| `component` | Component name to identify the layer with the model to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
| `layer` | The specific layer of the model to pretrain. If empty, the whole model will be used. ~~str~~ |
### initialize {#config-initialize tag="section"}
### initialize {id="config-initialize",tag="section"}
This config block lets you define resources for **initializing the pipeline**.
It's used by [`Language.initialize`](/api/language#initialize) and typically
@ -255,9 +255,9 @@ Also see the usage guides on the
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vectors`](/api/cli#init-vectors). Defaults to `null`. ~~Optional[str]~~ |
| `vocab_data` | Path to JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) to initialize vocabulary. ~~Optional[str]~~ |
## Training data {#training}
## Training data {id="training"}
### Binary training format {#binary-training new="3"}
### Binary training format {id="binary-training",version="3"}
> #### Example
>
@ -288,7 +288,7 @@ Note that while this is the format used to save training data, you do not have
to understand the internal details to use it or create training data. See the
section on [preparing training data](/usage/training#training-data).
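For reference, a minimal sketch of producing a `.spacy` file with `DocBin`; the example text, entity offsets and output path are placeholders for your own data:

```python
# A minimal sketch of writing binary training data; the annotations and
# output path are placeholders.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp.make_doc("Apple is opening a store in San Francisco.")
doc.ents = [doc.char_span(0, 5, label="ORG"), doc.char_span(28, 41, label="GPE")]
db = DocBin(docs=[doc])
db.to_disk("./train.spacy")
```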
### JSON training format {#json-input tag="deprecated"}
### JSON training format {id="json-input",tag="deprecated"}
<Infobox variant="warning" title="Changed in v3.0">
@ -300,7 +300,7 @@ objects to JSON, you can now serialize them directly using the
[`spacy convert`](/api/cli) lets you convert your JSON data to the new `.spacy`
format:
```cli
```bash
$ python -m spacy convert ./data.json .
```
@ -317,8 +317,7 @@ $ python -m spacy convert ./data.json .
> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can
> help you convert entity offsets to the right format.
```python
### Example structure
```python {title="Example structure"}
[{
"id": int, # ID of the document within the corpus
"paragraphs": [{ # list of paragraphs in the corpus
@ -357,7 +356,7 @@ https://github.com/explosion/spaCy/blob/v2.3.x/examples/training/training-data.j
</Accordion>
### Annotation format for creating training examples {#dict-input}
### Annotation format for creating training examples {id="dict-input"}
An [`Example`](/api/example) object holds the information for one training
instance. It stores two [`Doc`](/api/doc) objects: one for holding the
@ -436,8 +435,7 @@ file to keep track of your settings and hyperparameters and your own
</Infobox>
```python
### Examples
```python {title="Examples"}
# Training data for a part-of-speech tagger
doc = Doc(vocab, words=["I", "like", "stuff"])
gold_dict = {"tags": ["NOUN", "VERB", "NOUN"]}
@ -466,7 +464,7 @@ gold_dict = {"entities": [(0, 12, "PERSON")],
example = Example.from_dict(doc, gold_dict)
```
## Lexical data for vocabulary {#vocab-jsonl new="2"}
## Lexical data for vocabulary {id="vocab-jsonl",version="2"}
This data file can be provided via the `vocab_data` setting in the
`[initialize]` block of the training config to pre-define the lexical data to
@ -483,13 +481,11 @@ spaCy's [`Lexeme`](/api/lexeme#attributes) object.
> vocab_data = "/path/to/vocab-data.jsonl"
> ```
```python
### First line
```python {title="First line"}
{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
```
```python
### Entry structure
```python {title="Entry structure"}
{
"orth": string, # the word text
"id": int, # can correspond to row in vectors table
@ -526,7 +522,7 @@ Here's an example of the 20 most frequent lexemes in the English training data:
%%GITHUB_SPACY/extra/example_data/vocab-data.jsonl
```
## Pipeline meta {#meta}
## Pipeline meta {id="meta"}
The pipeline meta is available as the file `meta.json` and exported
automatically when you save an `nlp` object to disk. Its contents are available

View File

@ -2,7 +2,7 @@
title: DependencyMatcher
teaser: Match subtrees within a dependency parse
tag: class
new: 3
version: 3
source: spacy/matcher/dependencymatcher.pyx
---
@ -14,7 +14,7 @@ It requires a pretrained [`DependencyParser`](/api/parser) or other component
that sets the `Token.dep` and `Token.head` attributes. See the
[usage guide](/usage/rule-based-matching#dependencymatcher) for examples.
## Pattern format {#patterns}
## Pattern format {id="patterns"}
> ```python
> ### Example
@ -62,7 +62,7 @@ of relations, see the usage guide on
</Infobox>
### Operators {#operators}
### Operators {id="operators"}
The following operators are supported by the `DependencyMatcher`, most of which
come directly from
@ -87,8 +87,7 @@ come directly from
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
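To make the pattern format and operators concrete, here is a minimal sketch; the pipeline name and the pattern itself are illustrative assumptions rather than a fixed recipe:

```python
# A minimal sketch of a DependencyMatcher pattern; the model name and the
# pattern are illustrative assumptions.
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "founded", "RIGHT_ATTRS": {"ORTH": "founded"}},
    {"LEFT_ID": "founded", "REL_OP": ">", "RIGHT_ID": "subject",
     "RIGHT_ATTRS": {"DEP": "nsubj"}},
]
matcher.add("FOUNDED", [pattern])
doc = nlp("Smith founded a healthcare company in 2005.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])
```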
## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
Create a `DependencyMatcher`.
@ -105,7 +104,7 @@ Create a `DependencyMatcher`.
| _keyword-only_ | |
| `validate` | Validate all patterns added to this matcher. ~~bool~~ |
## DependencyMatcher.\_\call\_\_ {#call tag="method"}
## DependencyMatcher.\_\_call\_\_ {id="call",tag="method"}
Find all tokens matching the supplied patterns on the `Doc` or `Span`.
@ -127,7 +126,7 @@ Find all tokens matching the supplied patterns on the `Doc` or `Span`.
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| **RETURNS** | A list of `(match_id, token_ids)` tuples, describing the matches. The `match_id` is the ID of the match pattern and `token_ids` is a list of token indices matched by the pattern, where the position of each token in the list corresponds to the position of the node specification in the pattern. ~~List[Tuple[int, List[int]]]~~ |
## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
## DependencyMatcher.\_\_len\_\_ {id="len",tag="method"}
Get the number of rules added to the dependency matcher. Note that this only
returns the number of rules (identical with the number of IDs), not the number
@ -148,7 +147,7 @@ of individual patterns.
| ----------- | ---------------------------- |
| **RETURNS** | The number of rules. ~~int~~ |
## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
## DependencyMatcher.\_\_contains\_\_ {id="contains",tag="method"}
Check whether the matcher contains rules for a match ID.
@ -166,7 +165,7 @@ Check whether the matcher contains rules for a match ID.
| `key` | The match ID. ~~str~~ |
| **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ |
## DependencyMatcher.add {#add tag="method"}
## DependencyMatcher.add {id="add",tag="method"}
Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
optional callback function to act on the matches. The callback function will
@ -191,7 +190,7 @@ will be overwritten.
| _keyword-only_ | |
| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[DependencyMatcher, Doc, int, List[Tuple], Any]]~~ |
## DependencyMatcher.get {#get tag="method"}
## DependencyMatcher.get {id="get",tag="method"}
Retrieve the pattern stored for a key. Returns the rule as an
`(on_match, patterns)` tuple containing the callback and available patterns.
@ -208,7 +207,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
| `key` | The ID of the match rule. ~~str~~ |
| **RETURNS** | The rule, as an `(on_match, patterns)` tuple. ~~Tuple[Optional[Callable], List[List[Union[Dict, Tuple]]]]~~ |
## DependencyMatcher.remove {#remove tag="method"}
## DependencyMatcher.remove {id="remove",tag="method"}
Remove a rule from the dependency matcher. A `KeyError` is raised if the match
ID does not exist.

View File

@ -25,7 +25,7 @@ current state. The weights are updated such that the scores assigned to the set
of optimal actions is increased, while scores assigned to other actions are
decreased. Note that more than one action may be optimal for a given state.
## Assigned Attributes {#assigned-attributes}
## Assigned Attributes {id="assigned-attributes"}
Dependency predictions are assigned to the `Token.dep` and `Token.head` fields.
Beside the dependencies themselves, the parser decides sentence boundaries,
@ -39,7 +39,7 @@ which are saved in `Token.is_sent_start` and accessible via `Doc.sents`.
| `Token.is_sent_start` | A boolean value indicating whether the token starts a sentence. After the parser runs this will be `True` or `False` for all tokens. ~~bool~~ |
| `Doc.sents` | An iterator over sentences in the `Doc`, determined by `Token.is_sent_start` values. ~~Iterator[Span]~~ |
## Config and implementation {#config}
## Config and implementation {id="config"}
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
@ -74,7 +74,7 @@ architectures and their arguments and hyperparameters.
%%GITHUB_SPACY/spacy/pipeline/dep_parser.pyx
```
## DependencyParser.\_\_init\_\_ {#init tag="method"}
## DependencyParser.\_\_init\_\_ {id="init",tag="method"}
> #### Example
>
@ -107,7 +107,7 @@ shortcut for this and instantiate the component using its string name and
| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_deps`](/api/scorer#score_deps) for the attribute `"dep"` ignoring the labels `p` and `punct` and [`Scorer.score_spans`](/api/scorer/#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
## DependencyParser.\_\_call\_\_ {#call tag="method"}
## DependencyParser.\_\_call\_\_ {id="call",tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
@ -131,7 +131,40 @@ and all pipeline components are applied to the `Doc` in order. Both
| `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ |
## DependencyParser.pipe {#pipe tag="method"}
## DependencyParser.distill {id="distill", tag="method,experimental", version="4"}
Train a pipe (the student) on the predictions of another pipe (the teacher). The
student is typically trained on the probability distribution of the teacher, but
details may differ per pipe. The goal of distillation is to transfer knowledge
from the teacher to the student.
The distillation is performed on ~~Example~~ objects. The `Example.reference`
and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
same orthography. Even though the reference does not need to have gold
annotations, the teacher can add its own annotations when necessary.
This feature is experimental.
> #### Example
>
> ```python
> teacher_pipe = teacher.get_pipe("parser")
> student_pipe = student.add_pipe("parser")
> optimizer = student.resume_training()
> losses = student_pipe.distill(teacher_pipe, examples, sgd=optimizer)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
| `examples` | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | Dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## DependencyParser.pipe {id="pipe",tag="method"}
Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
@ -155,7 +188,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## DependencyParser.initialize {#initialize tag="method" new="3"}
## DependencyParser.initialize {id="initialize",tag="method",version="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. **At least one example
@ -192,7 +225,7 @@ config.
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |
## DependencyParser.predict {#predict tag="method"}
## DependencyParser.predict {id="predict",tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
@ -209,7 +242,7 @@ modifying them.
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | A helper class for the parse state (internal). ~~StateClass~~ |
## DependencyParser.set_annotations {#set_annotations tag="method"}
## DependencyParser.set_annotations {id="set_annotations",tag="method"}
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
@ -226,7 +259,7 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | The scores to set, produced by `DependencyParser.predict`. Returns an internal helper class for the parse state. ~~List[StateClass]~~ |
## DependencyParser.update {#update tag="method"}
## DependencyParser.update {id="update",tag="method"}
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/dependencyparser#predict) and
@ -249,7 +282,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
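
A sketch of a single update step, assuming `examples` is a list of `Example` objects prepared elsewhere:

```python
parser = nlp.add_pipe("parser")
# nlp.initialize() returns an optimizer for training
optimizer = nlp.initialize()
losses = parser.update(examples, sgd=optimizer)
```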
## DependencyParser.get_loss {#get_loss tag="method"}
## DependencyParser.get_loss {id="get_loss",tag="method"}
Find the loss and gradient of loss for the batch of documents and their
predicted scores.
@ -268,7 +301,28 @@ predicted scores.
| `scores` | Scores representing the model's predictions. ~~StateClass~~ |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## DependencyParser.create_optimizer {#create_optimizer tag="method"}
## DependencyParser.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
Calculate the loss and its gradient for the batch of student scores relative to
the teacher scores.
> #### Example
>
> ```python
> teacher_parser = teacher.get_pipe("parser")
> student_parser = student.add_pipe("parser")
> student_scores = student_parser.predict([eg.predicted for eg in examples])
> teacher_scores = teacher_parser.predict([eg.predicted for eg in examples])
> loss, d_loss = student_parser.get_teacher_student_loss(teacher_scores, student_scores)
> ```
| Name | Description |
| ---------------- | --------------------------------------------------------------------------- |
| `teacher_scores` | Scores representing the teacher model's predictions. |
| `student_scores` | Scores representing the student model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## DependencyParser.create_optimizer {id="create_optimizer",tag="method"}
Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
component.
@ -284,7 +338,7 @@ component.
| ----------- | ---------------------------- |
| **RETURNS** | The optimizer. ~~Optimizer~~ |
## DependencyParser.use_params {#use_params tag="method, contextmanager"}
## DependencyParser.use_params {id="use_params",tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values. At the end of the
context, the original parameters are restored.
@ -301,7 +355,7 @@ context, the original parameters are restored.
| -------- | -------------------------------------------------- |
| `params` | The parameter values to use in the model. ~~dict~~ |
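
One possible use, assuming the default optimizer whose `averages` attribute holds averaged parameter values:

```python
parser = nlp.get_pipe("parser")
with parser.use_params(optimizer.averages):
    # Temporarily use the averaged weights, e.g. while saving to disk
    parser.to_disk("/path/to/parser")
```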
## DependencyParser.add_label {#add_label tag="method"}
## DependencyParser.add_label {id="add_label",tag="method"}
Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the [`initialize`](#initialize)
@ -321,7 +375,7 @@ to the model, and the output dimension will be
| `label` | The label to add. ~~str~~ |
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
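
A minimal sketch; the label name is purely illustrative:

```python
parser = nlp.add_pipe("parser")
# Returns 1 because the label was not present yet
parser.add_label("MY_LABEL")
```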
## DependencyParser.set_output {#set_output tag="method"}
## DependencyParser.set_output {id="set_output",tag="method"}
Change the output dimension of the component's model by calling the model's
attribute `resize_output`. This is a function that takes the original model and
@ -340,7 +394,7 @@ forgetting" problem.
| ---- | --------------------------------- |
| `nO` | The new output dimension. ~~int~~ |
## DependencyParser.to_disk {#to_disk tag="method"}
## DependencyParser.to_disk {id="to_disk",tag="method"}
Serialize the pipe to disk.
@ -357,7 +411,7 @@ Serialize the pipe to disk.
| _keyword-only_ | |
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## DependencyParser.from_disk {#from_disk tag="method"}
## DependencyParser.from_disk {id="from_disk",tag="method"}
Load the pipe from disk. Modifies the object in place and returns it.
@ -375,7 +429,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `DependencyParser` object. ~~DependencyParser~~ |
## DependencyParser.to_bytes {#to_bytes tag="method"}
## DependencyParser.to_bytes {id="to_bytes",tag="method"}
> #### Example
>
@ -392,7 +446,7 @@ Serialize the pipe to a bytestring.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The serialized form of the `DependencyParser` object. ~~bytes~~ |
## DependencyParser.from_bytes {#from_bytes tag="method"}
## DependencyParser.from_bytes {id="from_bytes",tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
@ -411,7 +465,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The `DependencyParser` object. ~~DependencyParser~~ |
## DependencyParser.labels {#labels tag="property"}
## DependencyParser.labels {id="labels",tag="property"}
The labels currently added to the component.
@ -426,7 +480,7 @@ The labels currently added to the component.
| ----------- | ------------------------------------------------------ |
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
## DependencyParser.label_data {#label_data tag="property" new="3"}
## DependencyParser.label_data {id="label_data",tag="property",version="3"}
The labels currently added to the component and their internal meta information.
This is the data generated by [`init labels`](/api/cli#init-labels) and used by
@ -444,7 +498,7 @@ the model with a pre-defined label set.
| ----------- | ------------------------------------------------------------------------------- |
| **RETURNS** | The label data added to the component. ~~Dict[str, Dict[str, Dict[str, int]]]~~ |
## Serialization fields {#serialization-fields}
## Serialization fields {id="serialization-fields"}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from

View File

@ -12,7 +12,7 @@ compressed binary strings. The `Doc` object holds an array of
[`Span`](/api/span) objects are views of this array, i.e. they don't own the
data themselves.
## Doc.\_\_init\_\_ {#init tag="method"}
## Doc.\_\_init\_\_ {id="init",tag="method"}
Construct a `Doc` object. The most common way to get a `Doc` object is via the
`nlp` object.
@ -47,7 +47,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, int, None]]]~~ |
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
## Doc.\_\_getitem\_\_ {id="getitem",tag="method"}
Get a [`Token`](/api/token) object at position `i`, where `i` is an integer.
Negative indexing is supported, and follows the usual Python semantics, i.e.
@ -80,7 +80,7 @@ semantics.
| `start_end` | The slice of the document to get. ~~Tuple[int, int]~~ |
| **RETURNS** | The span at `doc[start:end]`. ~~Span~~ |
## Doc.\_\_iter\_\_ {#iter tag="method"}
## Doc.\_\_iter\_\_ {id="iter",tag="method"}
Iterate over `Token` objects, from which the annotations can be easily accessed.
@ -100,7 +100,7 @@ underlying C data directly from Cython.
| ---------- | --------------------------- |
| **YIELDS** | A `Token` object. ~~Token~~ |
## Doc.\_\_len\_\_ {#len tag="method"}
## Doc.\_\_len\_\_ {id="len",tag="method"}
Get the number of tokens in the document.
@ -115,7 +115,7 @@ Get the number of tokens in the document.
| ----------- | --------------------------------------------- |
| **RETURNS** | The number of tokens in the document. ~~int~~ |
## Doc.set_extension {#set_extension tag="classmethod" new="2"}
## Doc.set_extension {id="set_extension",tag="classmethod",version="2"}
Define a custom attribute on the `Doc` which becomes available via `Doc._`. For
details, see the documentation on
@ -140,7 +140,7 @@ details, see the documentation on
| `setter` | Setter function that takes the `Doc` and a value, and modifies the object. Is called when the user writes to the `Doc._` attribute. ~~Optional[Callable[[Doc, Any], None]]~~ |
| `force` | Force overwriting existing attribute. ~~bool~~ |
## Doc.get_extension {#get_extension tag="classmethod" new="2"}
## Doc.get_extension {id="get_extension",tag="classmethod",version="2"}
Look up a previously registered extension by name. Returns a 4-tuple
`(default, method, getter, setter)` if the extension is registered. Raises a
@ -160,7 +160,7 @@ Look up a previously registered extension by name. Returns a 4-tuple
| `name` | Name of the extension. ~~str~~ |
| **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
## Doc.has_extension {#has_extension tag="classmethod" new="2"}
## Doc.has_extension {id="has_extension",tag="classmethod",version="2"}
Check whether an extension has been registered on the `Doc` class.
@ -177,7 +177,7 @@ Check whether an extension has been registered on the `Doc` class.
| `name` | Name of the extension to check. ~~str~~ |
| **RETURNS** | Whether the extension has been registered. ~~bool~~ |
## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
## Doc.remove_extension {id="remove_extension",tag="classmethod",version="2.0.12"}
Remove a previously registered extension.
@ -195,7 +195,7 @@ Remove a previously registered extension.
| `name` | Name of the extension. ~~str~~ |
| **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
## Doc.char_span {#char_span tag="method" new="2"}
## Doc.char_span {id="char_span",tag="method",version="2"}
Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
`None` if the character indices don't map to a valid span using the default
@ -219,7 +219,7 @@ alignment mode `"strict".
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Doc.set_ents {#set_ents tag="method" new="3"}
## Doc.set_ents {id="set_ents",tag="method",version="3"}
Set the named entities in the document.
@ -243,7 +243,7 @@ Set the named entities in the document.
| `outside` | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~ |
| `default` | How to set entity annotation for tokens outside of any provided spans. Options: `"blocked"`, `"missing"`, `"outside"` and `"unmodified"` (preserve current state). Defaults to `"outside"`. ~~str~~ |
## Doc.similarity {#similarity tag="method" model="vectors"}
## Doc.similarity {id="similarity",tag="method",model="vectors"}
Make a semantic similarity estimate. The default estimate is cosine similarity
using an average of word vectors.
@ -263,7 +263,7 @@ using an average of word vectors.
| `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ |
## Doc.count_by {#count_by tag="method"}
## Doc.count_by {id="count_by",tag="method"}
Count the frequencies of a given attribute. Produces a dict of
`{attr (int): count (ints)}` frequencies, keyed by the values of the given
@ -284,7 +284,7 @@ attribute ID.
| `attr_id` | The attribute ID. ~~int~~ |
| **RETURNS** | A dictionary mapping attributes to integer counts. ~~Dict[int, int]~~ |
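
A small sketch of counting token orthographies, assuming a loaded `nlp` pipeline:

```python
from spacy.attrs import ORTH

doc = nlp("apple apple orange")
# Maps the hash of each orth string to its frequency
counts = doc.count_by(ORTH)
assert counts[nlp.vocab.strings["apple"]] == 2
```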
## Doc.get_lca_matrix {#get_lca_matrix tag="method"}
## Doc.get_lca_matrix {id="get_lca_matrix",tag="method"}
Calculates the lowest common ancestor matrix for a given `Doc`. Returns LCA
matrix containing the integer index of the ancestor, or `-1` if no common
@ -302,7 +302,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
| ----------- | -------------------------------------------------------------------------------------- |
| **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
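
For example, assuming a parsed document:

```python
doc = nlp("This is a test")
# One row and column per token; each cell holds the index of the lowest
# common ancestor token, or -1 if no common ancestor is found
lca = doc.get_lca_matrix()
assert lca.shape == (len(doc), len(doc))
```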
## Doc.has_annotation {#has_annotation tag="method"}
## Doc.has_annotation {id="has_annotation",tag="method"}
Check whether the doc contains annotation on a
[`Token` attribute](/api/token#attributes).
@ -327,7 +327,7 @@ doc = nlp("This is a text")
| `require_complete` | Whether to check that the attribute is set on every token in the doc. Defaults to `False`. ~~bool~~ |
| **RETURNS** | Whether specified annotation is present in the doc. ~~bool~~ |
## Doc.to_array {#to_array tag="method"}
## Doc.to_array {id="to_array",tag="method"}
Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence
of `M` attributes, the output array will be of shape `(N, M)`, where `N` is the
@ -355,7 +355,7 @@ Returns a 2D array with one row per token and one column per attribute (when
| `attr_ids` | A list of attributes (int IDs or string names) or a single attribute (int ID or string name). ~~Union[int, str, List[Union[int, str]]]~~ |
| **RETURNS** | The exported attributes as a numpy array. ~~Union[numpy.ndarray[ndim=2, dtype=uint64], numpy.ndarray[ndim=1, dtype=uint64]]~~ |
## Doc.from_array {#from_array tag="method"}
## Doc.from_array {id="from_array",tag="method"}
Load attributes from a numpy array. Write to a `Doc` object, from an `(M, N)`
array of attributes.
@ -379,7 +379,7 @@ array of attributes.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The `Doc` itself. ~~Doc~~ |
## Doc.from_docs {#from_docs tag="staticmethod" new="3"}
## Doc.from_docs {id="from_docs",tag="staticmethod",version="3"}
Concatenate multiple `Doc` objects to form a new one. Raises an error if the
`Doc` objects do not all share the same `Vocab`.
@ -408,7 +408,7 @@ Concatenate multiple `Doc` objects to form a new one. Raises an error if the
| `exclude` <Tag variant="new">3.3</Tag> | String names of Doc attributes to exclude. Supported: `spans`, `tensor`, `user_data`. ~~Iterable[str]~~ |
| **RETURNS** | The new `Doc` object that is containing the other docs or `None`, if `docs` is empty or `None`. ~~Optional[Doc]~~ |
## Doc.to_disk {#to_disk tag="method" new="2"}
## Doc.to_disk {id="to_disk",tag="method",version="2"}
Save the current state to a directory.
@ -424,7 +424,7 @@ Save the current state to a directory.
| _keyword-only_ | |
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Doc.from_disk {#from_disk tag="method" new="2"}
## Doc.from_disk {id="from_disk",tag="method",version="2"}
Loads state from a directory. Modifies the object in place and returns it.
@ -443,7 +443,7 @@ Loads state from a directory. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `Doc` object. ~~Doc~~ |
## Doc.to_bytes {#to_bytes tag="method"}
## Doc.to_bytes {id="to_bytes",tag="method"}
Serialize, i.e. export the document contents to a binary string.
@ -460,7 +460,7 @@ Serialize, i.e. export the document contents to a binary string.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | A losslessly serialized copy of the `Doc`, including all annotations. ~~bytes~~ |
## Doc.from_bytes {#from_bytes tag="method"}
## Doc.from_bytes {id="from_bytes",tag="method"}
Deserialize, i.e. import the document contents from a binary string.
@ -481,7 +481,7 @@ Deserialize, i.e. import the document contents from a binary string.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The `Doc` object. ~~Doc~~ |
## Doc.to_json {#to_json tag="method"}
## Doc.to_json {id="to_json",tag="method"}
Serializes a document to JSON. Note that this format differs from the
deprecated [`JSON training format`](/api/data-formats#json-input).
@ -498,7 +498,7 @@ deprecated [`JSON training format`](/api/data-formats#json-input).
| `underscore` | Optional list of string names of custom `Doc` attributes. Attribute values need to be JSON-serializable. Values will be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`. ~~Optional[List[str]]~~ |
| **RETURNS** | The data in JSON format. ~~Dict[str, Any]~~ |
## Doc.from_json {#from_json tag="method" new="3.3.1"}
## Doc.from_json {id="from_json",tag="method",version="3.3.1"}
Deserializes a document from JSON, i.e. generates a document from the provided
JSON data as generated by [`Doc.to_json()`](/api/doc#to_json).
@ -520,7 +520,7 @@ JSON data as generated by [`Doc.to_json()`](/api/doc#to_json).
| `validate` | Whether to validate the JSON input against the expected schema for detailed debugging. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A `Doc` corresponding to the provided JSON. ~~Doc~~ |
## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"}
## Doc.retokenize {id="retokenize",tag="contextmanager",version="2.1"}
Context manager to handle retokenization of the `Doc`. Modifications to the
`Doc`'s tokenization are stored, and then made all at once when the context
@ -540,7 +540,7 @@ invalidated, although they may accidentally continue to work.
| ----------- | -------------------------------- |
| **RETURNS** | The retokenizer. ~~Retokenizer~~ |
### Retokenizer.merge {#retokenizer.merge tag="method"}
### Retokenizer.merge {id="retokenizer.merge",tag="method"}
Mark a span for merging. The `attrs` will be applied to the resulting token (if
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
@ -563,7 +563,7 @@ values.
| `span` | The span to merge. ~~Span~~ |
| `attrs` | Attributes to set on the merged token. ~~Dict[Union[str, int], Any]~~ |
### Retokenizer.split {#retokenizer.split tag="method"}
### Retokenizer.split {id="retokenizer.split",tag="method"}
Mark a token for splitting, into the specified `orths`. The `heads` are required
to specify how the new subtokens should be integrated into the dependency tree.
@ -599,7 +599,7 @@ underlying lexeme (if they're context-independent lexical attributes like
| `heads` | List of `token` or `(token, subtoken)` tuples specifying the tokens to attach the newly split subtokens to. ~~List[Union[Token, Tuple[Token, int]]]~~ |
| `attrs` | Attributes to set on all split tokens. Attribute names mapped to list of per-token attribute values. ~~Dict[Union[str, int], List[Any]]~~ |
## Doc.ents {#ents tag="property" model="NER"}
## Doc.ents {id="ents",tag="property",model="NER"}
The named entities in the document. Returns a tuple of named entity `Span`
objects, if the entity recognizer has been applied.
@ -617,7 +617,7 @@ objects, if the entity recognizer has been applied.
| ----------- | ---------------------------------------------------------------- |
| **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span]~~ |
## Doc.spans {#spans tag="property"}
## Doc.spans {id="spans",tag="property"}
A dictionary of named span groups, to store and access additional span
annotations. You can write to it by assigning a list of [`Span`](/api/span)
@ -634,7 +634,7 @@ objects or a [`SpanGroup`](/api/spangroup) to a given key.
| ----------- | ------------------------------------------------------------------ |
| **RETURNS** | The span groups assigned to the document. ~~Dict[str, SpanGroup]~~ |
## Doc.cats {#cats tag="property" model="text classifier"}
## Doc.cats {id="cats",tag="property",model="text classifier"}
Maps a label to a score for categories applied to the document. Typically set by
the [`TextCategorizer`](/api/textcategorizer).
@ -650,7 +650,7 @@ the [`TextCategorizer`](/api/textcategorizer).
| ----------- | ---------------------------------------------------------- |
| **RETURNS** | The text categories mapped to scores. ~~Dict[str, float]~~ |
## Doc.noun_chunks {#noun_chunks tag="property" model="parser"}
## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
Iterate over the base noun phrases in the document. Yields base noun-phrase
`Span` objects, if the document has been syntactically parsed. A base noun
@ -677,7 +677,7 @@ implemented for the given language, a `NotImplementedError` is raised.
| ---------- | ------------------------------------- |
| **YIELDS** | Noun chunks in the document. ~~Span~~ |
## Doc.sents {#sents tag="property" model="sentences"}
## Doc.sents {id="sents",tag="property",model="sentences"}
Iterate over the sentences in the document. Sentence spans have no label.
@ -699,7 +699,7 @@ will raise an error otherwise.
| ---------- | ----------------------------------- |
| **YIELDS** | Sentences in the document. ~~Span~~ |
## Doc.has_vector {#has_vector tag="property" model="vectors"}
## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
A boolean value indicating whether a word vector is associated with the object.
@ -714,7 +714,7 @@ A boolean value indicating whether a word vector is associated with the object.
| ----------- | --------------------------------------------------------- |
| **RETURNS** | Whether the document has a vector data attached. ~~bool~~ |
## Doc.vector {#vector tag="property" model="vectors"}
## Doc.vector {id="vector",tag="property",model="vectors"}
A real-valued meaning representation. Defaults to an average of the token
vectors.
@ -731,7 +731,7 @@ vectors.
| ----------- | -------------------------------------------------------------------------------------------------- |
| **RETURNS** | A 1-dimensional array representing the document's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Doc.vector_norm {#vector_norm tag="property" model="vectors"}
## Doc.vector_norm {id="vector_norm",tag="property",model="vectors"}
The L2 norm of the document's vector representation.
@ -749,7 +749,7 @@ The L2 norm of the document's vector representation.
| ----------- | --------------------------------------------------- |
| **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
## Attributes {#attributes}
## Attributes {id="attributes"}
| Name | Description |
| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
@ -768,7 +768,7 @@ The L2 norm of the document's vector representation.
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
| `activations` <Tag variant="new">4.0</Tag> | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ |
## Serialization fields {#serialization-fields}
## Serialization fields {id="serialization-fields"}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from

View File

@ -1,7 +1,7 @@
---
title: DocBin
tag: class
new: 2.2
version: 2.2
teaser: Pack Doc objects for binary serialization
source: spacy/tokens/_serialize.py
---
@ -15,8 +15,7 @@ notable downside to this format is that you can't easily extract just one
document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure:
```python
### msgpack object structure
```python {title="msgpack object structure"}
{
"version": str, # DocBin version number
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
@ -33,7 +32,7 @@ object. This means the storage is more efficient if you pack more documents
together, because you have less duplication in the strings. For usage examples,
see the docs on [serializing `Doc` objects](/usage/saving-loading#docs).
## DocBin.\_\_init\_\_ {#init tag="method"}
## DocBin.\_\_init\_\_ {id="init",tag="method"}
Create a `DocBin` object to hold serialized annotations.
@ -50,7 +49,7 @@ Create a `DocBin` object to hold serialized annotations.
| `store_user_data` | Whether to write the `Doc.user_data` and the values of custom extension attributes to file/bytes. Defaults to `False`. ~~bool~~ |
| `docs` | `Doc` objects to add on initialization. ~~Iterable[Doc]~~ |
## DocBin.\_\len\_\_ {#len tag="method"}
## DocBin.\_\_len\_\_ {id="len",tag="method"}
Get the number of `Doc` objects that were added to the `DocBin`.
@ -67,7 +66,7 @@ Get the number of `Doc` objects that were added to the `DocBin`.
| ----------- | --------------------------------------------------- |
| **RETURNS** | The number of `Doc`s added to the `DocBin`. ~~int~~ |
## DocBin.add {#add tag="method"}
## DocBin.add {id="add",tag="method"}
Add a `Doc`'s annotations to the `DocBin` for serialization.
@ -83,7 +82,7 @@ Add a `Doc`'s annotations to the `DocBin` for serialization.
| -------- | -------------------------------- |
| `doc` | The `Doc` object to add. ~~Doc~~ |
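
A minimal sketch, assuming a loaded `nlp` pipeline; the attribute list is just an example:

```python
from spacy.tokens import DocBin

doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
doc = nlp("This is a sentence about Berlin.")
doc_bin.add(doc)
```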
## DocBin.get_docs {#get_docs tag="method"}
## DocBin.get_docs {id="get_docs",tag="method"}
Recover `Doc` objects from the annotations, using the given vocab.
@ -98,7 +97,7 @@ Recover `Doc` objects from the annotations, using the given vocab.
| `vocab` | The shared vocab. ~~Vocab~~ |
| **YIELDS** | The `Doc` objects. ~~Doc~~ |
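
A sketch of recovering the stored documents, assuming `doc_bin` was created as above and `nlp` provides the shared vocab:

```python
# Yields Doc objects reconstructed with the given vocab
docs = list(doc_bin.get_docs(nlp.vocab))
```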
## DocBin.merge {#merge tag="method"}
## DocBin.merge {id="merge",tag="method"}
Extend the annotations of this `DocBin` with the annotations from another. Will
raise an error if the pre-defined `attrs` of the two `DocBin`s don't match.
@ -118,7 +117,7 @@ raise an error if the pre-defined `attrs` of the two `DocBin`s don't match.
| -------- | ------------------------------------------------------ |
| `other` | The `DocBin` to merge into the current bin. ~~DocBin~~ |
## DocBin.to_bytes {#to_bytes tag="method"}
## DocBin.to_bytes {id="to_bytes",tag="method"}
Serialize the `DocBin`'s annotations to a bytestring.
@ -134,7 +133,7 @@ Serialize the `DocBin`'s annotations to a bytestring.
| ----------- | ---------------------------------- |
| **RETURNS** | The serialized `DocBin`. ~~bytes~~ |
## DocBin.from_bytes {#from_bytes tag="method"}
## DocBin.from_bytes {id="from_bytes",tag="method"}
Deserialize the `DocBin`'s annotations from a bytestring.
@ -150,7 +149,7 @@ Deserialize the `DocBin`'s annotations from a bytestring.
| `bytes_data` | The data to load from. ~~bytes~~ |
| **RETURNS** | The loaded `DocBin`. ~~DocBin~~ |
## DocBin.to_disk {#to_disk tag="method" new="3"}
## DocBin.to_disk {id="to_disk",tag="method",version="3"}
Save the serialized `DocBin` to a file. Typically uses the `.spacy` extension
and the result can be used as the input data for
@ -168,7 +167,7 @@ and the result can be used as the input data for
| -------- | -------------------------------------------------------------------------- |
| `path` | The file path, typically with the `.spacy` extension. ~~Union[str, Path]~~ |
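
A typical round trip, assuming `docs` is a list of `Doc` objects and with the file path chosen only for illustration:

```python
from spacy.tokens import DocBin

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")
# Later, load it back
doc_bin = DocBin().from_disk("./train.spacy")
```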
## DocBin.from_disk {#from_disk tag="method" new="3"}
## DocBin.from_disk {id="from_disk",tag="method",version="3"}
Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.

View File

@ -2,7 +2,7 @@
title: EditTreeLemmatizer
tag: class
source: spacy/pipeline/edit_tree_lemmatizer.py
new: 3.3
version: 3.3
teaser: 'Pipeline component for lemmatization'
api_base_class: /api/pipe
api_string_name: trainable_lemmatizer
@ -18,7 +18,7 @@ and construction method used by this lemmatizer were proposed in
For a lookup and rule-based lemmatizer, see [`Lemmatizer`](/api/lemmatizer).
## Assigned Attributes {#assigned-attributes}
## Assigned Attributes {id="assigned-attributes"}
Predictions are assigned to `Token.lemma`.
@ -27,7 +27,7 @@ Predictions are assigned to `Token.lemma`.
| `Token.lemma` | The lemma (hash). ~~int~~ |
| `Token.lemma_` | The lemma. ~~str~~ |
## Config and implementation {#config}
## Config and implementation {id="config"}
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
@ -58,7 +58,7 @@ architectures and their arguments and hyperparameters.
%%GITHUB_SPACY/spacy/pipeline/edit_tree_lemmatizer.py
```
## EditTreeLemmatizer.\_\_init\_\_ {#init tag="method"}
## EditTreeLemmatizer.\_\_init\_\_ {id="init",tag="method"}
> #### Example
>
@ -91,7 +91,7 @@ shortcut for this and instantiate the component using its string name and
| `top_k` | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ |
## EditTreeLemmatizer.\_\_call\_\_ {#call tag="method"}
## EditTreeLemmatizer.\_\_call\_\_ {id="call",tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
@ -115,7 +115,40 @@ and all pipeline components are applied to the `Doc` in order. Both
| `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ |
## EditTreeLemmatizer.pipe {#pipe tag="method"}
## EditTreeLemmatizer.distill {id="distill", tag="method,experimental", version="4"}
Train a pipe (the student) on the predictions of another pipe (the teacher). The
student is typically trained on the probability distribution of the teacher, but
details may differ per pipe. The goal of distillation is to transfer knowledge
from the teacher to the student.
The distillation is performed on ~~Example~~ objects. The `Example.reference`
and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
same orthography. Even though the reference does not need to have gold
annotations, the teacher could add its own annotations when necessary.
This feature is experimental.
> #### Example
>
> ```python
> teacher_pipe = teacher.add_pipe("trainable_lemmatizer")
> student_pipe = student.add_pipe("trainable_lemmatizer")
> optimizer = nlp.resume_training()
> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
| `examples` | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | Dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EditTreeLemmatizer.pipe {id="pipe",tag="method"}
Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
@ -139,7 +172,7 @@ and [`pipe`](/api/edittreelemmatizer#pipe) delegate to the
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EditTreeLemmatizer.initialize {#initialize tag="method" new="3"}
## EditTreeLemmatizer.initialize {id="initialize",tag="method",version="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. **At least one example
@ -176,7 +209,7 @@ config.
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
## EditTreeLemmatizer.predict {#predict tag="method"}
## EditTreeLemmatizer.predict {id="predict",tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
@ -193,7 +226,7 @@ modifying them.
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | The model's prediction for each document. |
## EditTreeLemmatizer.set_annotations {#set_annotations tag="method"}
## EditTreeLemmatizer.set_annotations {id="set_annotations",tag="method"}
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed tree
identifiers.
@ -211,7 +244,7 @@ identifiers.
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `tree_ids` | The identifiers of the edit trees to apply, produced by `EditTreeLemmatizer.predict`. |
## EditTreeLemmatizer.update {#update tag="method"}
## EditTreeLemmatizer.update {id="update",tag="method"}
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
@ -235,7 +268,7 @@ Delegates to [`predict`](/api/edittreelemmatizer#predict) and
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EditTreeLemmatizer.get_loss {#get_loss tag="method"}
## EditTreeLemmatizer.get_loss {id="get_loss",tag="method"}
Find the loss and gradient of loss for the batch of documents and their
predicted scores.
@ -254,7 +287,7 @@ predicted scores.
| `scores` | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## EditTreeLemmatizer.create_optimizer {#create_optimizer tag="method"}
## EditTreeLemmatizer.create_optimizer {id="create_optimizer",tag="method"}
Create an optimizer for the pipeline component.
@ -269,7 +302,28 @@ Create an optimizer for the pipeline component.
| ----------- | ---------------------------- |
| **RETURNS** | The optimizer. ~~Optimizer~~ |
## EditTreeLemmatizer.use_params {#use_params tag="method, contextmanager"}
## EditTreeLemmatizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
Calculate the loss and its gradient for the batch of student scores relative to
the teacher scores.
> #### Example
>
> ```python
> teacher_lemmatizer = teacher.get_pipe("trainable_lemmatizer")
> student_lemmatizer = student.add_pipe("trainable_lemmatizer")
> student_scores = student_lemmatizer.predict([eg.predicted for eg in examples])
> teacher_scores = teacher_lemmatizer.predict([eg.predicted for eg in examples])
> loss, d_loss = student_lemmatizer.get_teacher_student_loss(teacher_scores, student_scores)
> ```
| Name | Description |
| ---------------- | --------------------------------------------------------------------------- |
| `teacher_scores` | Scores representing the teacher model's predictions. |
| `student_scores` | Scores representing the student model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## EditTreeLemmatizer.use_params {id="use_params",tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values. At the end of the
context, the original parameters are restored.
@ -286,7 +340,7 @@ context, the original parameters are restored.
| -------- | -------------------------------------------------- |
| `params` | The parameter values to use in the model. ~~dict~~ |
## EditTreeLemmatizer.to_disk {#to_disk tag="method"}
## EditTreeLemmatizer.to_disk {id="to_disk",tag="method"}
Serialize the pipe to disk.
@ -303,7 +357,7 @@ Serialize the pipe to disk.
| _keyword-only_ | |
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## EditTreeLemmatizer.from_disk {#from_disk tag="method"}
## EditTreeLemmatizer.from_disk {id="from_disk",tag="method"}
Load the pipe from disk. Modifies the object in place and returns it.
@ -321,7 +375,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `EditTreeLemmatizer` object. ~~EditTreeLemmatizer~~ |
## EditTreeLemmatizer.to_bytes {#to_bytes tag="method"}
## EditTreeLemmatizer.to_bytes {id="to_bytes",tag="method"}
> #### Example
>
@ -338,7 +392,7 @@ Serialize the pipe to a bytestring.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The serialized form of the `EditTreeLemmatizer` object. ~~bytes~~ |
## EditTreeLemmatizer.from_bytes {#from_bytes tag="method"}
## EditTreeLemmatizer.from_bytes {id="from_bytes",tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
@ -357,7 +411,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The `EditTreeLemmatizer` object. ~~EditTreeLemmatizer~~ |
## EditTreeLemmatizer.labels {#labels tag="property"}
## EditTreeLemmatizer.labels {id="labels",tag="property"}
The labels currently added to the component.
@ -372,7 +426,7 @@ identifiers of edit trees.
| ----------- | ------------------------------------------------------ |
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
## EditTreeLemmatizer.label_data {#label_data tag="property" new="3"}
## EditTreeLemmatizer.label_data {id="label_data",tag="property",version="3"}
The labels currently added to the component and their internal meta information.
This is the data generated by [`init labels`](/api/cli#init-labels) and used by
@ -390,7 +444,7 @@ initialize the model with a pre-defined label set.
| ----------- | ---------------------------------------------------------- |
| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ |
## Serialization fields {#serialization-fields}
## Serialization fields {id="serialization-fields"}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from

View File

@ -2,7 +2,7 @@
title: EntityLinker
tag: class
source: spacy/pipeline/entity_linker.py
new: 2.2
version: 2.2
teaser: 'Pipeline component for named entity linking and disambiguation'
api_base_class: /api/pipe
api_string_name: entity_linker
@ -17,7 +17,7 @@ and a machine learning model to pick the right candidate, given the local
context of the mention. `EntityLinker` defaults to using the
[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
## Assigned Attributes {#assigned-attributes}
## Assigned Attributes {id="assigned-attributes"}
Predictions, in the form of knowledge base IDs, will be assigned to
`Token.ent_kb_id_`.
@ -27,7 +27,7 @@ Predictions, in the form of knowledge base IDs, will be assigned to
| `Token.ent_kb_id` | Knowledge base ID (hash). ~~int~~ |
| `Token.ent_kb_id_` | Knowledge base ID. ~~str~~ |
## Config and implementation {#config}
## Config and implementation {id="config"}
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
@ -72,7 +72,7 @@ architectures and their arguments and hyperparameters.
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
```
## EntityLinker.\_\_init\_\_ {#init tag="method"}
## EntityLinker.\_\_init\_\_ {id="init",tag="method"}
> #### Example
>
@ -115,7 +115,7 @@ custom knowledge base, you should either call
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
## EntityLinker.\_\_call\_\_ {#call tag="method"}
## EntityLinker.\_\_call\_\_ {id="call",tag="method"}
Apply the pipe to one document. The document is modified in place and returned.
This usually happens under the hood when the `nlp` object is called on a text
@ -138,7 +138,7 @@ delegate to the [`predict`](/api/entitylinker#predict) and
| `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ |
## EntityLinker.pipe {#pipe tag="method"}
## EntityLinker.pipe {id="pipe",tag="method"}
Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
@ -162,7 +162,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityLinker.set_kb {#set_kb tag="method" new="3"}
## EntityLinker.set_kb {id="set_kb",tag="method",version="3"}
The `kb_loader` should be a function that takes a `Vocab` instance and creates
the `KnowledgeBase`, ensuring that the strings of the knowledge base are synced
@ -184,7 +184,7 @@ with the current vocab.
| ----------- | ---------------------------------------------------------------------------------------------------------------- |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
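
A sketch of wiring in a knowledge base; `InMemoryLookupKB` is the default implementation, and the vector length is an arbitrary illustrative value:

```python
from spacy.kb import InMemoryLookupKB

def create_kb(vocab):
    kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
    # Entities and aliases would be added to the KB here
    return kb

entity_linker = nlp.get_pipe("entity_linker")
entity_linker.set_kb(create_kb)
```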
## EntityLinker.initialize {#initialize tag="method" new="3"}
## EntityLinker.initialize {id="initialize",tag="method",version="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. **At least one example
@ -214,7 +214,7 @@ are synced with the current vocab.
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
## EntityLinker.predict {#predict tag="method"}
## EntityLinker.predict {id="predict",tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them. Returns the KB IDs for each entity in each doc, including `NIL`
@ -232,7 +232,7 @@ if there is no prediction.
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ |
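
A sketch of calling `predict` directly; `doc1` and `doc2` are assumed to already have named entities set, e.g. by an earlier `ner` component:

```python
entity_linker = nlp.get_pipe("entity_linker")
kb_ids = entity_linker.predict([doc1, doc2])
```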
## EntityLinker.set_annotations {#set_annotations tag="method"}
## EntityLinker.set_annotations {id="set_annotations",tag="method"}
Modify a batch of documents, using pre-computed entity IDs for a list of named
entities.
@ -250,7 +250,7 @@ entities.
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `kb_ids` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. ~~List[str]~~ |
## EntityLinker.update {#update tag="method"}
## EntityLinker.update {id="update",tag="method"}
Learn from a batch of [`Example`](/api/example) objects, updating both the
pipe's entity linking model and context encoder. Delegates to
@ -273,7 +273,7 @@ pipe's entity linking model and context encoder. Delegates to
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
## EntityLinker.create_optimizer {id="create_optimizer",tag="method"}
Create an optimizer for the pipeline component.
@ -288,7 +288,7 @@ Create an optimizer for the pipeline component.
| ----------- | ---------------------------- |
| **RETURNS** | The optimizer. ~~Optimizer~~ |
## EntityLinker.use_params {#use_params tag="method, contextmanager"}
## EntityLinker.use_params {id="use_params",tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values. At the end of the
context, the original parameters are restored.
@ -305,7 +305,7 @@ context, the original parameters are restored.
| -------- | -------------------------------------------------- |
| `params` | The parameter values to use in the model. ~~dict~~ |
## EntityLinker.to_disk {#to_disk tag="method"}
## EntityLinker.to_disk {id="to_disk",tag="method"}
Serialize the pipe to disk.
@ -322,7 +322,7 @@ Serialize the pipe to disk.
| _keyword-only_ | |
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## EntityLinker.from_disk {#from_disk tag="method"}
## EntityLinker.from_disk {id="from_disk",tag="method"}
Load the pipe from disk. Modifies the object in place and returns it.
@ -340,7 +340,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `EntityLinker` object. ~~EntityLinker~~ |
## EntityLinker.to_bytes {#to_bytes tag="method"}
## EntityLinker.to_bytes {id="to_bytes",tag="method"}
> #### Example
>
@ -357,7 +357,7 @@ Serialize the pipe to a bytestring, including the `KnowledgeBase`.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The serialized form of the `EntityLinker` object. ~~bytes~~ |
## EntityLinker.from_bytes {#from_bytes tag="method"}
## EntityLinker.from_bytes {id="from_bytes",tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
@ -376,7 +376,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The `EntityLinker` object. ~~EntityLinker~~ |
## Serialization fields {#serialization-fields}
## Serialization fields {id="serialization-fields"}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from

View File

@ -20,7 +20,7 @@ your entities will be close to their initial tokens. If your entities are long
and characterized by tokens in their middle, the component will likely not be a
good fit for your task.
## Assigned Attributes {#assigned-attributes}
## Assigned Attributes {id="assigned-attributes"}
Predictions will be saved to `Doc.ents` as a tuple. Each label will also be
reflected to each underlying token, where it is saved in the `Token.ent_type`
@ -38,7 +38,7 @@ non-overlapping, or an error will be thrown.
| `Token.ent_type` | The label part of the named entity tag (hash). ~~int~~ |
| `Token.ent_type_` | The label part of the named entity tag. ~~str~~ |
## Config and implementation {#config}
## Config and implementation {id="config"}
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
@ -72,7 +72,7 @@ architectures and their arguments and hyperparameters.
%%GITHUB_SPACY/spacy/pipeline/ner.pyx
```
## EntityRecognizer.\_\_init\_\_ {#init tag="method"}
## EntityRecognizer.\_\_init\_\_ {id="init",tag="method"}
> #### Example
>
@ -103,7 +103,7 @@ shortcut for this and instantiate the component using its string name and
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `incorrect_spans_key` | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group in [`Doc.spans`](/api/doc#spans), under this key. Defaults to `None`. ~~Optional[str]~~ |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
## EntityRecognizer.\_\_call\_\_ {id="call",tag="method"}
Apply the pipe to one document. The document is modified in place and returned.
This usually happens under the hood when the `nlp` object is called on a text
@ -127,7 +127,40 @@ and all pipeline components are applied to the `Doc` in order. Both
| `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ |
## EntityRecognizer.pipe {#pipe tag="method"}
## EntityRecognizer.distill {id="distill", tag="method,experimental", version="4"}
Train a pipe (the student) on the predictions of another pipe (the teacher). The
student is typically trained on the probability distribution of the teacher, but
details may differ per pipe. The goal of distillation is to transfer knowledge
from the teacher to the student.
The distillation is performed on ~~Example~~ objects. The `Example.reference`
and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
same orthography. Even though the reference does not need to have gold
annotations, the teacher could add its own annotations when necessary.
This feature is experimental.
> #### Example
>
> ```python
> teacher_pipe = teacher.add_pipe("ner")
> student_pipe = student.add_pipe("ner")
> optimizer = nlp.resume_training()
> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
| `examples` | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | Dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EntityRecognizer.pipe {id="pipe",tag="method"}
Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
@ -151,7 +184,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityRecognizer.initialize {#initialize tag="method" new="3"}
## EntityRecognizer.initialize {id="initialize",tag="method",version="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. **At least one example
@ -188,7 +221,7 @@ config.
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |
## EntityRecognizer.predict {#predict tag="method"}
## EntityRecognizer.predict {id="predict",tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
@ -205,7 +238,7 @@ modifying them.
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | A helper class for the parse state (internal). ~~StateClass~~ |
## EntityRecognizer.set_annotations {#set_annotations tag="method"}
## EntityRecognizer.set_annotations {id="set_annotations",tag="method"}
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
@ -222,7 +255,7 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | The scores to set, produced by `EntityRecognizer.predict`. Returns an internal helper class for the parse state. ~~List[StateClass]~~ |
## EntityRecognizer.update {#update tag="method"}
## EntityRecognizer.update {id="update",tag="method"}
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/entityrecognizer#predict) and
@ -245,7 +278,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EntityRecognizer.get_loss {#get_loss tag="method"}
## EntityRecognizer.get_loss {id="get_loss",tag="method"}
Find the loss and gradient of loss for the batch of documents and their
predicted scores.
@ -264,7 +297,28 @@ predicted scores.
| `scores` | Scores representing the model's predictions. ~~StateClass~~ |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
## EntityRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
Calculate the loss and its gradient for the batch of student scores relative to
the teacher scores.
> #### Example
>
> ```python
> teacher_ner = teacher.get_pipe("ner")
> student_ner = student.add_pipe("ner")
> student_scores = student_ner.predict([eg.predicted for eg in examples])
> teacher_scores = teacher_ner.predict([eg.predicted for eg in examples])
> loss, d_loss = student_ner.get_teacher_student_loss(teacher_scores, student_scores)
> ```
| Name | Description |
| ---------------- | --------------------------------------------------------------------------- |
| `teacher_scores` | Scores representing the teacher model's predictions. |
| `student_scores` | Scores representing the student model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## EntityRecognizer.create_optimizer {id="create_optimizer",tag="method"}
Create an optimizer for the pipeline component.
@ -279,7 +333,7 @@ Create an optimizer for the pipeline component.
| ----------- | ---------------------------- |
| **RETURNS** | The optimizer. ~~Optimizer~~ |
## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}
## EntityRecognizer.use_params {id="use_params",tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values. At the end of the
context, the original parameters are restored.
@ -296,7 +350,7 @@ context, the original parameters are restored.
| -------- | -------------------------------------------------- |
| `params` | The parameter values to use in the model. ~~dict~~ |
## EntityRecognizer.add_label {#add_label tag="method"}
## EntityRecognizer.add_label {id="add_label",tag="method"}
Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the [`initialize`](#initialize)
@ -316,7 +370,7 @@ to the model, and the output dimension will be
| `label` | The label to add. ~~str~~ |
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
## EntityRecognizer.set_output {#set_output tag="method"}
## EntityRecognizer.set_output {id="set_output",tag="method"}
Change the output dimension of the component's model by calling the model's
attribute `resize_output`. This is a function that takes the original model and
@ -335,7 +389,7 @@ forgetting" problem.
| ---- | --------------------------------- |
| `nO` | The new output dimension. ~~int~~ |
## EntityRecognizer.to_disk {#to_disk tag="method"}
## EntityRecognizer.to_disk {id="to_disk",tag="method"}
Serialize the pipe to disk.
@ -352,7 +406,7 @@ Serialize the pipe to disk.
| _keyword-only_ | |
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## EntityRecognizer.from_disk {#from_disk tag="method"}
## EntityRecognizer.from_disk {id="from_disk",tag="method"}
Load the pipe from disk. Modifies the object in place and returns it.
@ -370,7 +424,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `EntityRecognizer` object. ~~EntityRecognizer~~ |
## EntityRecognizer.to_bytes {#to_bytes tag="method"}
## EntityRecognizer.to_bytes {id="to_bytes",tag="method"}
> #### Example
>
@ -387,7 +441,7 @@ Serialize the pipe to a bytestring.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The serialized form of the `EntityRecognizer` object. ~~bytes~~ |
## EntityRecognizer.from_bytes {#from_bytes tag="method"}
## EntityRecognizer.from_bytes {id="from_bytes",tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
@ -406,7 +460,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The `EntityRecognizer` object. ~~EntityRecognizer~~ |
## EntityRecognizer.labels {#labels tag="property"}
## EntityRecognizer.labels {id="labels",tag="property"}
The labels currently added to the component.
@ -421,7 +475,7 @@ The labels currently added to the component.
| ----------- | ------------------------------------------------------ |
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
## EntityRecognizer.label_data {#label_data tag="property" new="3"}
## EntityRecognizer.label_data {id="label_data",tag="property",version="3"}
The labels currently added to the component and their internal meta information.
This is the data generated by [`init labels`](/api/cli#init-labels) and used by
@ -439,7 +493,7 @@ the model with a pre-defined label set.
| ----------- | ------------------------------------------------------------------------------- |
| **RETURNS** | The label data added to the component. ~~Dict[str, Dict[str, Dict[str, int]]]~~ |
## Serialization fields {#serialization-fields}
## Serialization fields {id="serialization-fields"}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from

Some files were not shown because too many files have changed in this diff.