Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-02 18:06:46 +03:00)

Compare commits: master...v4.0.0.dev (165 commits)
Commits in this comparison (SHA1):

f4c8fdfaad, 818fdb537e, e32a394ff0, 5992e927b9, c27679f210, 287deee02c, b2ca7253d2, f5918d4353,
5bd141013b, 8696861c8c, fbc14aea45, 304b9331e6, afb22ad491, 1052cba9f3, 2d4067d021, 70e2f2a14a,
ce9ea9629f, bbf38d4d0f, 9e97c730be, 36ee709390, e722284ff4, ce4ea5ffa7, c621e251b8, 82ef6783a8,
81beaea70e, 2891e27421, 9972333ef9, 7351f6bbeb, 7718886fa3, 532225b955, 7b689bde44, 57203fa0fc,
5e8bafa5bb, 9b36729cbd, 42fe4edfd7, e2591cda36, e5ec45cb7e, 05803cfe76, 1b2d66f98e, b4e457d9fe,
2702db9fef, eaaac5a08c, f293386d3e, 4f37e4031c, 96f2e30c4b, 846472129c, 47a82c6164, 0e3b6a87d6,
536798f9e3, b615964be7, 8b2732e276, 122f3b32ad, bf92ca4f10, 2468742cb8, 68089f65cd, 17c4a3d646,
95619b6736, 096794dd74, 4990cfefb4, d82e167aea, 50c5e9a2dd, 8a5814bf2c, 5d0f48fe69, b734e5314d,
a653dec654, 3102e2e27a, 9340eb8ad2, 6ae7618418, 520279ff7c, 41b3a0d932, 8ca71f9591, 749e446ee3,
04f41854c1, 1ea31552be, da75896ef5, df4c069a13, e27c60a702, dd3f138830, b95123060a, cbc2ae933e,
cf85b81f34, 5089efa2d0, eec5ccd72f, c47ec5b5c6, 89f974d4f5, cd95b29053, 6920fb7baf, 360ccf628a,
c6cca4c00a, fb7f018ded, 1b5aba9e22, 6b07be2110, ec45f704b1, 1678a98449, 16609517f1, fd911fe2af,
8548d4d16e, de360bc981, 6348a7a4b4, b052b1b47f, a183db3cef, 5e297aa20e, c2f3e699ca, 2c2e66e145,
fc2723925b, 6ff5eb256c, b2fd9490e3, a231bf65af, b510fbd0aa, 326b541312, 6852adc8b7, 20b63943f5,
d30ba9b7b8, 2f08deea2a, 207565a788, f9308aae13, ca75190a3d, f5aabaf7d6, d60997febb, 6b9af38eeb,
60379cec65, 8267aa1b65, 799d226676, 04fea09ffd, e79910d57e, d0fc871a1c, 68b8fa2df2, cae4589f5a,
a4bd890f32, 0e2b7fb28b, 103b24fb25, 446a3ecf34, c6704f368c, d4922f25fc, e3027c65b8, 5157e4e823,
efdbb722c5, 60c050e82b, 977b847cce, 4a615cacd2, 698b8b495f, 98a916e01a, 4bce8fa755, 2a558a7cdc,
1eb7ce5ef7, 740c33fe58, 8dd1fa9896, c44d243f25, bb0e178878, 1a5be63715, d757dec5c4, 551e73ccfc,
5d54c0e32a, e581eeac34, b2d05f9f66, 1ff683a50b, ba18d2913d, 851a7ca4fa, 1605ef7319, 7f3842f54d,
2f05c6824c, 10b7223021, 5586fd9311, 0e71bd973f, 75f7c15187
.github/workflows/tests.yml (vendored, 71 lines changed)

@@ -30,7 +30,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.9"

      - name: black
        run: |

@@ -59,11 +59,7 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python_version: ["3.12"]
         include:
-          - os: windows-latest
-            python_version: "3.7"
-          - os: macos-latest
-            python_version: "3.8"
-          - os: ubuntu-latest
+          - os: ubuntu-20.04
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"

@@ -93,7 +89,6 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
-        if: matrix.python_version != '3.7'

      - name: Delete source directory and .egg-info
        run: |

@@ -115,22 +110,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"

-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+      # - name: "Test download CLI"
+      #   run: |
+      #     python -m spacy download ca_core_news_sm
+      #     python -m spacy download ca_core_news_md
+      #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test download_url in info CLI"
+      #   run: |
+      #     python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test no warnings on load (#11713)"
+      #   run: |
+      #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #   if: matrix.python_version == '3.9'

       - name: "Test convert CLI"
         run: |

@@ -154,17 +149,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'

-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+      # - name: "Test assemble CLI"
+      #   run: |
+      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test assemble CLI vectors warning"
+      #   run: |
+      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #   if: matrix.python_version == '3.9'

       - name: "Install test requirements"
         run: |

@@ -173,10 +168,4 @@ jobs:
       - name: "Run CPU tests"
         run: |
           python -m pytest --pyargs spacy -W error
-        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
-
-      - name: "Run CPU tests with thinc-apple-ops"
-        run: |
-          python -m pip install 'spacy[apple]'
-          python -m pytest --pyargs spacy
-        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
+        if: matrix.python_version == '3.11'
.github/workflows/universe_validation.yml (vendored, 2 lines changed)

@@ -25,7 +25,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.9"

      - name: Validate website/meta/universe.json
        run: |
@@ -1,11 +1,11 @@
 repos:
   - repo: https://github.com/ambv/black
     rev: 22.3.0
     hooks:
       - id: black
-        language_version: python3.7
-        additional_dependencies: ['click==8.0.4']
+        language_version: python3.9
+        additional_dependencies: ["click==8.0.4"]
   - repo: https://github.com/pycqa/flake8
     rev: 5.0.4
     hooks:
       - id: flake8
@@ -276,7 +276,7 @@ except:  # noqa: E722

 ### Python conventions

-All Python code must be written **compatible with Python 3.6+**. More detailed
+All Python code must be written **compatible with Python 3.9+**. More detailed
 code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).

 #### I/O and handling paths
Makefile (2 lines changed)

@@ -5,7 +5,7 @@ override SPACY_EXTRAS = spacy-lookups-data==1.0.3
 endif

 ifndef PYVER
-override PYVER = 3.8
+override PYVER = 3.9
 endif

 VENV := ./env$(PYVER)
@@ -33,7 +33,7 @@ open-source software, released under the
 ## 📖 Documentation

 | Documentation |  |
-| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
 | 📚 **[Usage Guides]** | How to use spaCy and its features. |
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |

@@ -115,7 +115,7 @@ For detailed installation instructions, see the

 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 3.7+ (only 64 bit)
+- **Python version**: Python 3.9+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)

 [pip]: https://pypi.org/project/spacy/
@@ -1,6 +1,2 @@
 # build version constraints for use with wheelwright
-numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'
-numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
-numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy>=1.25.0; python_version>='3.9'
@@ -31,7 +31,6 @@ These are repos that can be used by spaCy but aren't part of a default installation
 - [spacy-stanza](https://github.com/explosion/spacy-stanza): This is a wrapper that allows the use of Stanford's Stanza library in spaCy.
 - [spacy-streamlit](https://github.com/explosion/spacy-streamlit): A wrapper for the Streamlit dashboard building library to help with integrating [displaCy](https://spacy.io/api/top-level/#displacy).
 - [spacymoji](https://github.com/explosion/spacymoji): A library to add extra support for emoji to spaCy, such as including character names.
 - [thinc-apple-ops](https://github.com/explosion/thinc-apple-ops): A special backend for OSX that uses Apple's native libraries for improved performance.
 - [os-signpost](https://github.com/explosion/os-signpost): A Python package that allows you to use the `OSSignposter` API in OSX for performance analysis.
 - [spacy-ray](https://github.com/explosion/spacy-ray): A wrapper to integrate spaCy with Ray, a distributed training framework. Currently a work in progress.

@@ -79,4 +78,3 @@ Repos that don't fit in any of the above categories.
 - [tokenizations](https://github.com/explosion/tokenizations): A library originally by Yohei Tamura to align strings with tolerance to some variations in features like case and diacritics, used for aligning tokens and wordpieces. Adopted and maintained by Explosion, but usually spacy-alignments is used instead.
 - [conll-2012](https://github.com/explosion/conll-2012): A repo to hold some slightly cleaned up versions of the official scripts for the CoNLL 2012 shared task involving coreference resolution. Used in the coref project.
 - [fastapi-explosion-extras](https://github.com/explosion/fastapi-explosion-extras): Some small tweaks to FastAPI used at Explosion.
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.2.2,<8.3.0",
+    "thinc>=9.0.0,<9.1.0",
     "numpy>=1.15.0; python_version < '3.9'",
     "numpy>=1.25.0; python_version >= '3.9'",
 ]
@@ -1,9 +1,9 @@
 # Our libraries
-spacy-legacy>=3.0.11,<3.1.0
+spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.2.2,<8.3.0
+thinc>=9.0.0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0

@@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0

@@ -31,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
setup.cfg (24 lines changed)

@@ -17,8 +17,6 @@ classifiers =
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.7
-    Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11

@@ -31,26 +29,15 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.7
-# NOTE: This section is superseded by pyproject.toml and will be removed in
-# spaCy v4
-setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0; python_version < "3.9"
-    numpy>=1.19.0; python_version >= "3.9"
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.2.2,<8.3.0
+python_requires = >=3.9
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.11,<3.1.0
+    spacy-legacy>=4.0.0.dev1,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.2.2,<8.3.0
+    thinc>=9.0.0,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0

@@ -66,7 +53,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0

 [options.entry_points]

@@ -116,14 +102,12 @@ cuda12x =
     cupy-cuda12x>=11.5.0,<13.0.0
 cuda-autodetect =
     cupy-wheel>=11.0.0,<13.0.0
-apple =
-    thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    mecab-ko>=1.0.0
 th =
     pythainlp>=2.0
setup.py (5 lines changed)

@@ -37,7 +37,6 @@ MOD_NAMES = [
     "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.multitask",
     "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",

@@ -48,6 +47,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.arc_eager",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
+    "spacy.pipeline._parser_internals.search",
     "spacy.pipeline._parser_internals._state",
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",

@@ -61,12 +61,13 @@ MOD_NAMES = [
     "spacy.tokens.span_group",
     "spacy.tokens.graph",
     "spacy.tokens.morphanalysis",
-    "spacy.tokens._retokenize",
+    "spacy.tokens.retokenizer",
     "spacy.matcher.matcher",
     "spacy.matcher.phrasematcher",
     "spacy.matcher.dependencymatcher",
     "spacy.symbols",
     "spacy.vectors",
+    "spacy.tests.parser._search",
 ]
 COMPILE_OPTIONS = {
     "msvc": ["/Ox", "/EHsc"],
@@ -1,5 +1,9 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.4"
+__version__ = "4.0.0.dev3"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
 __projects_branch__ = "v3"
+__lookups_tag__ = "v1.0.3"
+__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"
spacy/attrs.pxd (129 lines changed)

@@ -1,99 +1,50 @@
-# Reserve 64 values for flag features
 from . cimport symbols


 cdef enum attr_id_t:
-    NULL_ATTR
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
-    LIKE_URL
-    LIKE_NUM
-    LIKE_EMAIL
-    IS_STOP
-    IS_OOV_DEPRECATED
-    IS_BRACKET
-    IS_QUOTE
-    IS_LEFT_PUNCT
-    IS_RIGHT_PUNCT
-    IS_CURRENCY
+    NULL_ATTR = 0
+    IS_ALPHA = symbols.IS_ALPHA
+    IS_ASCII = symbols.IS_ASCII
+    IS_DIGIT = symbols.IS_DIGIT
+    IS_LOWER = symbols.IS_LOWER
+    IS_PUNCT = symbols.IS_PUNCT
+    IS_SPACE = symbols.IS_SPACE
+    IS_TITLE = symbols.IS_TITLE
+    IS_UPPER = symbols.IS_UPPER
+    LIKE_URL = symbols.LIKE_URL
+    LIKE_NUM = symbols.LIKE_NUM
+    LIKE_EMAIL = symbols.LIKE_EMAIL
+    IS_STOP = symbols.IS_STOP
+    IS_BRACKET = symbols.IS_BRACKET
+    IS_QUOTE = symbols.IS_QUOTE
+    IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT
+    IS_CURRENCY = symbols.IS_CURRENCY

-    FLAG19 = 19
-    FLAG20
-    FLAG21
-    FLAG22
-    FLAG23
-    FLAG24
-    FLAG25
-    FLAG26
-    FLAG27
-    FLAG28
-    FLAG29
-    FLAG30
-    FLAG31
-    FLAG32
-    FLAG33
-    FLAG34
-    FLAG35
-    FLAG36
-    FLAG37
-    FLAG38
-    FLAG39
-    FLAG40
-    FLAG41
-    FLAG42
-    FLAG43
-    FLAG44
-    FLAG45
-    FLAG46
-    FLAG47
-    FLAG48
-    FLAG49
-    FLAG50
-    FLAG51
-    FLAG52
-    FLAG53
-    FLAG54
-    FLAG55
-    FLAG56
-    FLAG57
-    FLAG58
-    FLAG59
-    FLAG60
-    FLAG61
-    FLAG62
-    FLAG63
-
-    ID
-    ORTH
-    LOWER
-    NORM
-    SHAPE
-    PREFIX
-    SUFFIX
+    ID = symbols.ID
+    ORTH = symbols.ORTH
+    LOWER = symbols.LOWER
+    NORM = symbols.NORM
+    SHAPE = symbols.SHAPE
+    PREFIX = symbols.PREFIX
+    SUFFIX = symbols.SUFFIX

-    LENGTH
-    CLUSTER
-    LEMMA
-    POS
-    TAG
-    DEP
-    ENT_IOB
-    ENT_TYPE
-    HEAD
-    SENT_START
-    SPACY
-    PROB
+    LENGTH = symbols.LENGTH
+    CLUSTER = symbols.CLUSTER
+    LEMMA = symbols.LEMMA
+    POS = symbols.POS
+    TAG = symbols.TAG
+    DEP = symbols.DEP
+    ENT_IOB = symbols.ENT_IOB
+    ENT_TYPE = symbols.ENT_TYPE
+    HEAD = symbols.HEAD
+    SENT_START = symbols.SENT_START
+    SPACY = symbols.SPACY
+    PROB = symbols.PROB

-    LANG
+    LANG = symbols.LANG
     ENT_KB_ID = symbols.ENT_KB_ID
-    MORPH
+    MORPH = symbols.MORPH
     ENT_ID = symbols.ENT_ID

-    IDX
-    SENT_END
+    IDX = symbols.IDX
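The practical effect of tying the enum to the symbol table can be checked from Python. A minimal sketch, not part of the diff, assuming an installed build of the v4.0.0.dev branch:

# Sketch: after this change, attribute IDs in spacy.attrs are defined in terms
# of spacy.symbols, so both modules agree on the integer value of each attribute.
from spacy import attrs, symbols

for name in ("ORTH", "NORM", "LEMMA", "POS"):
    # Each attribute ID in attrs now mirrors the corresponding symbol value.
    assert getattr(attrs, name) == getattr(symbols, name), name
print("attrs and symbols use the same IDs")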
spacy/attrs.pyx (120 lines changed)

@@ -17,57 +17,11 @@ IDS = {
     "LIKE_NUM": LIKE_NUM,
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
-    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
     "IS_BRACKET": IS_BRACKET,
     "IS_QUOTE": IS_QUOTE,
     "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
     "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "IS_CURRENCY": IS_CURRENCY,
-    "FLAG19": FLAG19,
-    "FLAG20": FLAG20,
-    "FLAG21": FLAG21,
-    "FLAG22": FLAG22,
-    "FLAG23": FLAG23,
-    "FLAG24": FLAG24,
-    "FLAG25": FLAG25,
-    "FLAG26": FLAG26,
-    "FLAG27": FLAG27,
-    "FLAG28": FLAG28,
-    "FLAG29": FLAG29,
-    "FLAG30": FLAG30,
-    "FLAG31": FLAG31,
-    "FLAG32": FLAG32,
-    "FLAG33": FLAG33,
-    "FLAG34": FLAG34,
-    "FLAG35": FLAG35,
-    "FLAG36": FLAG36,
-    "FLAG37": FLAG37,
-    "FLAG38": FLAG38,
-    "FLAG39": FLAG39,
-    "FLAG40": FLAG40,
-    "FLAG41": FLAG41,
-    "FLAG42": FLAG42,
-    "FLAG43": FLAG43,
-    "FLAG44": FLAG44,
-    "FLAG45": FLAG45,
-    "FLAG46": FLAG46,
-    "FLAG47": FLAG47,
-    "FLAG48": FLAG48,
-    "FLAG49": FLAG49,
-    "FLAG50": FLAG50,
-    "FLAG51": FLAG51,
-    "FLAG52": FLAG52,
-    "FLAG53": FLAG53,
-    "FLAG54": FLAG54,
-    "FLAG55": FLAG55,
-    "FLAG56": FLAG56,
-    "FLAG57": FLAG57,
-    "FLAG58": FLAG58,
-    "FLAG59": FLAG59,
-    "FLAG60": FLAG60,
-    "FLAG61": FLAG61,
-    "FLAG62": FLAG62,
-    "FLAG63": FLAG63,
     "ID": ID,
     "ORTH": ORTH,
     "LOWER": LOWER,

@@ -93,12 +47,11 @@ IDS = {
 }


-# ATTR IDs, in order of the symbol
-NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+NAMES = {v: k for k, v in IDS.items()}
 locals().update(IDS)


-def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
+def intify_attrs(stringy_attrs, strings_map=None):
     """
     Normalize a dictionary of attributes, converting them to ints.

@@ -110,75 +63,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     converted to ints.
     """
     inty_attrs = {}
-    if _do_deprecated:
-        if "F" in stringy_attrs:
-            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-        if "L" in stringy_attrs:
-            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-        if "pos" in stringy_attrs:
-            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-        if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")  # no-cython-lint
-        if "number" in stringy_attrs:
-            stringy_attrs.pop("number")
-        if "tenspect" in stringy_attrs:
-            stringy_attrs.pop("tenspect")
-        morph_keys = [
-            "PunctType",
-            "PunctSide",
-            "Other",
-            "Degree",
-            "AdvType",
-            "Number",
-            "VerbForm",
-            "PronType",
-            "Aspect",
-            "Tense",
-            "PartType",
-            "Poss",
-            "Hyph",
-            "ConjType",
-            "NumType",
-            "Foreign",
-            "VerbType",
-            "NounType",
-            "Gender",
-            "Mood",
-            "Negative",
-            "Tense",
-            "Voice",
-            "Abbr",
-            "Derivation",
-            "Echo",
-            "Foreign",
-            "NameType",
-            "NounType",
-            "NumForm",
-            "NumValue",
-            "PartType",
-            "Polite",
-            "StyleVariant",
-            "PronType",
-            "AdjType",
-            "Person",
-            "Variant",
-            "AdpType",
-            "Reflex",
-            "Negative",
-            "Mood",
-            "Aspect",
-            "Case",
-            "Polarity",
-            "PrepCase",
-            "Animacy",  # U20
-        ]
-        for key in morph_keys:
-            if key in stringy_attrs:
-                stringy_attrs.pop(key)
-            elif key.lower() in stringy_attrs:
-                stringy_attrs.pop(key.lower())
-            elif key.upper() in stringy_attrs:
-                stringy_attrs.pop(key.upper())
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
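The switch of NAMES from a sorted list to a dict follows from the attrs.pxd change above: once attribute IDs come from spacy.symbols they are no longer a dense 0..N range, so positional indexing into a sorted list breaks, while a dict keyed by the ID still supports NAMES[attr_id]. A small illustrative sketch with made-up ID values, not the real ones:

# Sketch only: IDS values here are invented to show the indexing difference.
IDS = {"ORTH": 65, "LEMMA": 73, "POS": 74}

# Old approach: list ordered by ID; only valid while IDs are consecutive list positions.
names_list = [k for k, v in sorted(IDS.items(), key=lambda kv: kv[1])]

# New approach: reverse mapping that works for arbitrary integer IDs.
names_dict = {v: k for k, v in IDS.items()}
assert names_dict[73] == "LEMMA"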
@@ -14,6 +14,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
+from .distill import distill  # noqa: F401
 from .download import download  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .find_function import find_function  # noqa: F401
@@ -11,6 +11,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,

@@ -28,7 +29,7 @@ from wasabi import Printer, msg
 from weasel import app as project_cli

 from .. import about
-from ..compat import Literal
+from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import validate
 from ..util import (
     ENV_VARS,

@@ -148,6 +149,16 @@ def _parse_override(value: Any) -> Any:
         return str(value)


+def _handle_renamed_language_codes(lang: Optional[str]) -> None:
+    # Throw error for renamed language codes in v4
+    if lang in RENAMED_LANGUAGE_CODES:
+        msg.fail(
+            title="Renamed language code",
+            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
+            exits=1,
+        )
+
+
 @contextmanager
 def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,

@@ -192,6 +203,13 @@ def show_validation_error(
     msg.fail("Config validation error", e, exits=1)


+def import_code_paths(code_paths: str) -> None:
+    """Helper to import comma-separated list of code paths."""
+    code_paths = [Path(p.strip()) for p in string_to_list(code_paths)]
+    for code_path in code_paths:
+        import_code(code_path)
+
+
 def import_code(code_path: Optional[Union[Path, str]]) -> None:
     """Helper to import Python file provided in training commands / commands
     using the config. This makes custom registered functions available.
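For reference, a minimal usage sketch of the new helper. The file names are hypothetical; the point is that --code now takes a comma-separated list, and each path is imported in turn so that registered functions are available before the config is resolved:

# Sketch: programmatic equivalent of passing --code on the CLI.
from spacy.cli._util import import_code_paths

# Hypothetical files containing @spacy.registry-registered functions/components.
import_code_paths("custom_functions.py,custom_components.py")

# Roughly equivalent CLI usage (assumed):
#   python -m spacy train config.cfg --code custom_functions.py,custom_components.py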
@@ -11,7 +11,7 @@ from ._util import (
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )

@@ -26,7 +26,7 @@ def assemble_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):

@@ -46,7 +46,7 @@ def assemble_cli(
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
@@ -16,7 +16,7 @@ from ..training.converters import (
     iob_to_docs,
     json_to_docs,
 )
-from ._util import Arg, Opt, app, walk_directory
+from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory

 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new

@@ -116,6 +116,10 @@ def convert(
     input_path = Path(input_path)
     if not msg:
         msg = Printer(no_print=silent)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(input_path, converter):
@@ -13,7 +13,7 @@ from ._util import (
     Arg,
     Opt,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )

@@ -27,7 +27,7 @@ def debug_config_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
     show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
     # fmt: on

@@ -44,7 +44,7 @@ def debug_config_cli(
     DOCS: https://spacy.io/api/cli#debug-config
     """
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_config(
         config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
     )
@@ -7,6 +7,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Sequence,
     Set,

@@ -22,7 +23,6 @@ import typer
 from wasabi import MESSAGES, Printer, msg

 from .. import util
-from ..compat import Literal
 from ..language import Language
 from ..morphology import Morphology
 from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe

@@ -40,7 +40,7 @@ from ._util import (
     _format_number,
     app,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )

@@ -72,7 +72,7 @@ def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
     no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),

@@ -92,7 +92,7 @@ def debug_data_cli(
         "--help for an overview of the other available debugging commands."
     )
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_data(
         config_path,
         config_overrides=overrides,

@@ -1073,8 +1073,7 @@ def _get_distribution(docs, normalize: bool = True) -> Counter:
     word_counts: Counter = Counter()
     for doc in docs:
         for token in doc:
-            # Normalize the text
-            t = token.text.lower().replace("``", '"').replace("''", '"')
+            t = token.text.lower()
             word_counts[t] += 1
     if normalize:
         total = sum(word_counts.values(), 0.0)
spacy/cli/distill.py (new file, 98 lines)

@@ -0,0 +1,98 @@
import logging
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Union

import typer
from wasabi import msg

from .. import util
from ..pipeline.trainable_pipe import TrainablePipe
from ..schemas import ConfigSchemaDistill
from ..training.initialize import init_nlp_student
from ..training.loop import distill as distill_nlp
from ._util import (
    Arg,
    Opt,
    app,
    import_code_paths,
    parse_config_overrides,
    setup_gpu,
    show_validation_error,
)


@app.command(
    "distill",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def distill_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    teacher_model: str = Arg(..., help="Teacher model name or path"),
    student_config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Distill a spaCy pipeline from a teacher model.

    DOCS: https://spacy.io/api/cli#distill
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    overrides = parse_config_overrides(ctx.args)
    import_code_paths(code_path)
    distill(
        teacher_model,
        student_config_path,
        output_path,
        use_gpu=use_gpu,
        overrides=overrides,
    )


def distill(
    teacher_model: Union[str, Path],
    student_config_path: Union[str, Path],
    output_path: Optional[Union[str, Path]] = None,
    *,
    use_gpu: int = -1,
    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
    student_config_path = util.ensure_path(student_config_path)
    output_path = util.ensure_path(output_path)
    # Make sure all files and paths exist if they are needed
    if not student_config_path or (
        str(student_config_path) != "-" and not student_config_path.exists()
    ):
        msg.fail("Student config file not found", student_config_path, exits=1)
    if not output_path:
        msg.info("No output directory provided")
    else:
        if not output_path.exists():
            output_path.mkdir(parents=True)
            msg.good(f"Created output directory: {output_path}")
        msg.info(f"Saving to output directory: {output_path}")
    setup_gpu(use_gpu)
    teacher = util.load_model(teacher_model)
    with show_validation_error(student_config_path):
        config = util.load_config(
            student_config_path, overrides=overrides, interpolate=False
        )
    msg.divider("Initializing student pipeline")
    with show_validation_error(student_config_path, hint_fill=False):
        student = init_nlp_student(config, teacher, use_gpu=use_gpu)

    msg.good("Initialized student pipeline")
    msg.divider("Distilling student pipeline from teacher")
    distill_nlp(
        teacher,
        student,
        output_path,
        use_gpu=use_gpu,
        stdout=sys.stdout,
        stderr=sys.stderr,
    )
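A minimal usage sketch of the new entry point, mirroring the CLI arguments above. The teacher package, config path and output directory are hypothetical examples; the student config is expected to carry a [distillation] block (see spacy/default_config_distillation.cfg later in this diff):

# Sketch: distill a student pipeline from a trained teacher.
from spacy.cli.distill import distill

distill(
    "en_core_web_lg",      # teacher: installed package name or path (example only)
    "student_config.cfg",  # hypothetical student config with a [distillation] block
    "output_dir",          # where the distilled pipeline is saved
    use_gpu=-1,            # run on CPU
)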
@@ -7,9 +7,10 @@ import typer
 from wasabi import msg

 from .. import about
-from ..errors import OLD_MODEL_SHORTCUTS
 from ..util import (
+    get_installed_models,
     get_minor_version,
+    get_package_version,
     is_in_interactive,
     is_in_jupyter,
     is_package,

@@ -76,15 +77,17 @@ def download(
         version = components[-1]
     else:
         model_name = model
-        if model in OLD_MODEL_SHORTCUTS:
-            msg.warn(
-                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
-                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
-            )
-            model_name = OLD_MODEL_SHORTCUTS[model]
     compatibility = get_compatibility()
     version = get_version(model_name, compatibility)
+
+    # If we already have this version installed, skip downloading
+    installed = get_installed_models()
+    if model_name in installed:
+        installed_version = get_package_version(model_name)
+        if installed_version == version:
+            msg.warn(f"{model_name} v{version} already installed, skipping")
+            return
+
     filename = get_model_filename(model_name, version, sdist)

     download_model(filename, pip_args)
@@ -10,7 +10,7 @@ from .. import displacy, util
 from ..scorer import Scorer
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
+from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu


 @benchmark_cli.command(

@@ -22,7 +22,7 @@ def evaluate_cli(
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),

@@ -43,7 +43,7 @@ def evaluate_cli(

     DOCS: https://spacy.io/api/cli#benchmark-accuracy
     """
-    import_code(code_path)
+    import_code_paths(code_path)
     evaluate(
         model,
         data_path,
@@ -1,3 +1,4 @@
+import importlib.metadata
 import json
 import platform
 from pathlib import Path

@@ -7,7 +8,6 @@ import srsly
 from wasabi import MarkdownRenderer, Printer

 from .. import about, util
-from ..compat import importlib_metadata
 from ._util import Arg, Opt, app, string_to_list
 from .download import get_latest_version, get_model_filename

@@ -137,7 +137,7 @@ def info_installed_model_url(model: str) -> Optional[str]:
     dist-info available.
     """
     try:
-        dist = importlib_metadata.distribution(model)
+        dist = importlib.metadata.distribution(model)
         text = dist.read_text("direct_url.json")
         if isinstance(text, str):
             data = json.loads(text)
@@ -9,13 +9,14 @@ from thinc.api import Config
 from wasabi import Printer, diff_strings

 from .. import util
-from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
+from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
 from ._util import (
     COMMAND,
     Arg,
     Opt,
+    _handle_renamed_language_codes,
     import_code,
     init_cli,
     show_validation_error,

@@ -50,7 +51,7 @@ class InitValues:
 def init_config_cli(
     # fmt: off
     output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
-    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
+    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
     pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),

@@ -90,6 +91,7 @@ def init_fill_config_cli(
     # fmt: off
     base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
     output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
+    distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
     pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
     code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),

@@ -105,13 +107,20 @@ def init_fill_config_cli(
     DOCS: https://spacy.io/api/cli#init-fill-config
     """
     import_code(code_path)
-    fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
+    fill_config(
+        output_file,
+        base_path,
+        distillation=distillation,
+        pretraining=pretraining,
+        diff=diff,
+    )


 def fill_config(
     output_file: Path,
     base_path: Path,
     *,
+    distillation: bool = False,
     pretraining: bool = False,
     diff: bool = False,
     silent: bool = False,

@@ -130,6 +139,9 @@ def fill_config(
     # replaced with their actual config after loading, so we have to re-add them
     sourced = util.get_sourced_components(config)
     filled["components"].update(sourced)
+    if distillation:
+        distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
+        filled = distillation_config.merge(filled)
     if pretraining:
         validate_config_for_pretrain(filled, msg)
         pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)

@@ -165,6 +177,10 @@ def init_config(
     msg = Printer(no_print=silent)
     with TEMPLATE_PATH.open("r") as f:
         template = Template(f.read())
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     # Filter out duplicates since tok2vec and transformer are added by template
     pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
     defaults = RECOMMENDATIONS["__default__"]
@@ -12,6 +12,7 @@ from ..training.initialize import convert_vectors, init_nlp
 from ._util import (
     Arg,
     Opt,
+    _handle_renamed_language_codes,
     import_code,
     init_cli,
     parse_config_overrides,

@@ -29,7 +30,6 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
-    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
     attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),

@@ -39,8 +39,11 @@ def init_vectors_cli(
     you can use in the [initialize] block of your config to initialize
     a model with vectors.
     """
-    if verbose:
-        util.logger.setLevel(logging.DEBUG)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:

@@ -50,7 +53,6 @@ def init_vectors_cli(
         vectors_loc,
         truncate=truncate,
         prune=prune,
-        name=name,
         mode=mode,
         attr=attr,
     )
@@ -1,3 +1,4 @@
+import importlib.metadata
 import os
 import re
 import shutil

@@ -13,7 +14,6 @@ from thinc.api import Config
 from wasabi import MarkdownRenderer, Printer, get_raw_input

 from .. import about, util
-from ..compat import importlib_metadata
 from ..schemas import ModelMetaSchema, validate
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list

@@ -23,7 +23,7 @@ def package_cli(
     # fmt: off
     input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
-    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
+    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),

@@ -250,9 +250,9 @@ def has_build() -> bool:
     # in an editable install), so an import check is not sufficient; instead
     # check that there is a package version
     try:
-        importlib_metadata.version("build")
+        importlib.metadata.version("build")
         return True
-    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
+    except importlib.metadata.PackageNotFoundError:  # type: ignore[attr-defined]
         return False

@@ -352,7 +352,6 @@ def get_meta(
         "width": nlp.vocab.vectors_length,
         "vectors": len(nlp.vocab.vectors),
         "keys": nlp.vocab.vectors.n_keys,
-        "name": nlp.vocab.vectors.name,
     }
     if about.__title__ != "spacy":
         meta["parent_package"] = about.__title__
@@ -11,7 +11,7 @@ from ._util import (
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,

@@ -27,7 +27,7 @@ def pretrain_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),

@@ -56,7 +56,7 @@ def pretrain_cli(
     DOCS: https://spacy.io/api/cli#pretrain
     """
     config_overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
     setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")
@@ -238,7 +238,7 @@ grad_factor = 1.0
 {% if "entity_linker" in components -%}
 [components.entity_linker]
 factory = "entity_linker"
-get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+get_candidates = {"@misc":"spacy.CandidateGenerator.v2"}
 incl_context = true
 incl_prior = true

@@ -517,7 +517,7 @@ width = ${components.tok2vec.model.encode.width}
 {% if "entity_linker" in components -%}
 [components.entity_linker]
 factory = "entity_linker"
-get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+get_candidates = {"@misc":"spacy.CandidateGenerator.v2"}
 incl_context = true
 incl_prior = true
@@ -13,7 +13,7 @@ from ._util import (
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,

@@ -28,7 +28,7 @@ def train_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on

@@ -50,7 +50,7 @@ def train_cli(
     if verbose:
         util.logger.setLevel(logging.DEBUG)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
@@ -23,19 +23,6 @@ try:
 except ImportError:
     cupy = None

-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
-# Important note: The importlib_metadata "backport" includes functionality
-# that's not part of the built-in importlib.metadata. We should treat this
-# import like the built-in and only use what's available there.
-try:  # Python 3.8+
-    import importlib.metadata as importlib_metadata
-except ImportError:
-    from catalogue import _importlib_metadata as importlib_metadata  # type: ignore[no-redef]  # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401

 pickle = pickle
spacy/default_config_distillation.cfg (new file, 34 lines)

@@ -0,0 +1,34 @@
[paths]
raw_text = null

[distillation]
corpus = "corpora.distillation"
dropout = 0.1
max_epochs = 1
max_steps = 0
student_to_teacher = {}

[distillation.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 3000
discard_oversize = false
tolerance = 0.2

[distillation.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 1e-4

[corpora]

[corpora.distillation]
@readers = "spacy.PlainTextCorpus.v1"
path = ${paths.raw_text}
min_length = 0
max_length = 0
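These defaults are pulled in by the new --distillation flag on init fill-config (see the fill_config change earlier in this diff). A minimal sketch of that merge, with a hypothetical user config path; the distillation defaults are loaded first and the user's values win:

# Sketch: mimic what fill_config(distillation=True) does with this file.
from spacy import util
from spacy.language import DEFAULT_CONFIG_DISTILL_PATH  # added on the v4 branch

filled = util.load_config("student_config.cfg")               # hypothetical user config
distill_defaults = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
filled = distill_defaults.merge(filled)                        # user settings override defaults
filled.to_disk("student_config_with_distill.cfg")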
@ -1,6 +1,7 @@
|
|||
import warnings
|
||||
from typing import Literal
|
||||
|
||||
from .compat import Literal
|
||||
from . import about
|
||||
|
||||
|
||||
class ErrorsWithCodes(type):
|
||||
|
@ -83,7 +84,7 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"ignoring the duplicate entry.")
|
||||
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
||||
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
|
||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
||||
W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
|
||||
"the Knowledge Base.")
|
||||
W026 = ("Unable to set all sentence boundaries from dependency parses. If "
|
||||
"you are constructing a parse tree incrementally by setting "
|
||||
|
@ -104,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"table. This may degrade the performance of the model to some "
|
||||
"degree. If this is intentional or the language you're using "
|
||||
"doesn't have a normalization table, please ignore this warning. "
|
||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||
"package installed and load the table in your config. The "
|
||||
"languages with lexeme normalization tables are currently: "
|
||||
"{langs}\n\nLoad the table in your config with:\n\n"
|
||||
"If this is surprising, make sure you are loading the table in "
|
||||
"your config. The languages with lexeme normalization tables are "
|
||||
"currently: {langs}\n\nAn example of how to load a table in "
|
||||
"your config :\n\n"
|
||||
"[initialize.lookups]\n"
|
||||
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
|
||||
"@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
|
||||
"lang = ${{nlp.lang}}\n"
|
||||
f'url = "{about.__lookups_url__}"\n'
|
||||
"tables = [\"lexeme_norm\"]\n")
|
||||
W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
|
||||
"attribute or operator.")
|
||||
|
@ -132,13 +134,6 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"and make it independent. For example, `replace_listeners = "
|
||||
"[\"model.tok2vec\"]` See the documentation for details: "
|
||||
"https://spacy.io/usage/training#config-components-listeners")
|
||||
W088 = ("The pipeline component {name} implements a `begin_training` "
|
||||
"method, which won't be called by spaCy. As of v3.0, `begin_training` "
|
||||
"has been renamed to `initialize`, so you likely want to rename the "
|
||||
"component method. See the documentation for details: "
|
||||
"https://spacy.io/api/language#initialize")
|
||||
W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
|
||||
"to `nlp.initialize`.")
|
||||
W090 = ("Could not locate any {format} files in path '{path}'.")
|
||||
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
|
||||
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
|
||||
|
@@ -222,6 +217,11 @@ class Warnings(metaclass=ErrorsWithCodes):
    W126 = ("These keys are unsupported: {unsupported}")
    W127 = ("Not all `Language.pipe` worker processes completed successfully")

    # v4 warning strings
    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
            "to return `True` in `.supports_prior_probs`.")


class Errors(metaclass=ErrorsWithCodes):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
@ -256,9 +256,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"https://spacy.io/usage/models")
|
||||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
||||
"tag, ent, dep_tag_offset, ent_tag.")
|
||||
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
|
||||
E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
|
||||
E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
|
||||
"refers to an issue with the `Vocab` or `StringStore`.")
|
||||
E019 = ("Can't create transition with unknown action ID: {action}. Action "
|
||||
|
@ -470,13 +468,13 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"same, but found '{nlp}' and '{vocab}' respectively.")
|
||||
E152 = ("The attribute {attr} is not supported for token patterns. "
|
||||
"Please use the option `validate=True` with the Matcher, PhraseMatcher, "
|
||||
"EntityRuler or AttributeRuler for more details.")
|
||||
"SpanRuler or AttributeRuler for more details.")
|
||||
E153 = ("The value type {vtype} is not supported for token patterns. "
|
||||
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
||||
"EntityRuler or AttributeRuler for more details.")
|
||||
"SpanRuler or AttributeRuler for more details.")
|
||||
E154 = ("One of the attributes or values is not supported for token "
|
||||
"patterns. Please use the option `validate=True` with the Matcher, "
|
||||
"PhraseMatcher, or EntityRuler for more details.")
|
||||
"PhraseMatcher, or SpanRuler for more details.")
|
||||
E155 = ("The pipeline needs to include a {pipe} in order to use "
|
||||
"Matcher or PhraseMatcher with the attribute {attr}. "
|
||||
"Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
|
||||
|
@ -500,7 +498,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||
E169 = ("Can't find module: {module}")
|
||||
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
||||
E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
|
||||
E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
|
||||
"callable or None, but got: {arg_type}")
|
||||
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
||||
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
||||
|
@ -739,13 +737,6 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"method in component '{name}'. If you want to use this "
|
||||
"method, make sure it's overwritten on the subclass.")
|
||||
E940 = ("Found NaN values in scores.")
|
||||
E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
|
||||
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
|
||||
"load the model, use its full name instead:\n\n"
|
||||
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
|
||||
"models, see the models directory: https://spacy.io/models and if "
|
||||
"you want to create a blank model, use spacy.blank: "
|
||||
"nlp = spacy.blank(\"{name}\")")
|
||||
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
|
||||
"return an initialized nlp object but got: {value}. Maybe "
|
||||
"you forgot to return the modified object in your function?")
|
||||
|
@ -759,7 +750,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"loaded nlp object, but got: {source}")
|
||||
E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
|
||||
"a string value from {expected} but got: '{arg}'")
|
||||
E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
|
||||
E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
|
||||
"a list, but got: {arg_type}")
|
||||
E949 = ("Unable to align tokens for the predicted and reference docs. It "
|
||||
"is only possible to align the docs when both texts are the same "
|
||||
|
@ -933,8 +924,6 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
|
||||
"Non-UD tags should use the `tag` property.")
|
||||
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
|
||||
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
|
||||
"exist.")
|
||||
E1024 = ("A pattern with {attr_type} '{label}' is not present in "
|
||||
"'{component}' patterns.")
|
||||
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
|
||||
|
@ -945,7 +934,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1029 = ("Edit tree cannot be applied to form.")
|
||||
E1030 = ("Edit tree identifier out of range.")
|
||||
E1031 = ("Could not find gold transition - see logs above.")
|
||||
E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
|
||||
E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
|
||||
E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
|
||||
E1034 = ("Node index {i} out of bounds ({length})")
|
||||
E1035 = ("Token index {i} out of bounds ({length})")
|
||||
|
@ -962,7 +951,6 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"case pass an empty list for the previously not specified argument to avoid this error.")
|
||||
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
|
||||
"{value}.")
|
||||
E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
|
||||
E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
|
||||
"method in '{name}'. If you want to use this method, make "
|
||||
"sure it's overwritten on the subclass.")
|
||||
|
@@ -989,15 +977,35 @@ class Errors(metaclass=ErrorsWithCodes):
             "reduction. Please enable one of `use_reduce_first`, "
             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")

    # v4 error strings
    E4000 = ("Expected a Doc as input, but got: '{type}'")
    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
             "but got '{received_type}'")
    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
    E4003 = ("Training examples for distillation must have the exact same tokens in the "
             "reference and predicted docs.")
    E4004 = ("Backprop is not supported when is_train is not set.")
    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
             "{existing_value}.")
    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
    E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
             "[initialize] or in registered lookups (spacy-lookups-data). An "
             "example for how to load lemmatizer tables in [initialize]:\n\n"
             "[initialize.components]\n\n"
             "[initialize.components.{pipe_name}]\n\n"
             "[initialize.components.{pipe_name}.lookups]\n"
             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
             "lang = ${{nlp.lang}}\n"
             f'url = "{about.__lookups_url__}"\n'
             "tables = {tables}\n"
             "# or required tables only: tables = {required_tables}\n")
    E4011 = ("Server error ({status_code}), couldn't fetch {url}")


# Deprecated model shortcuts, only used in errors and warnings
OLD_MODEL_SHORTCUTS = {
    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
}

RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

# fmt: on
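The E4010/E4011 messages above reference the new `spacy.LookupsDataLoaderFromURL.v1` loader that this branch adds to `spacy/lookups.py` (see that file's hunk later in the diff). A hedged sketch of calling the loader directly; the URL below is a placeholder for the published lookups location (`about.__lookups_url__`), and the loader fetches `<url><lang>_<table>.json` for each requested table.

```python
# Sketch only: the URL is a placeholder, not a real endpoint.
from spacy.lookups import load_lookups_data_from_url

lookups = load_lookups_data_from_url(
    lang="en",
    tables=["lexeme_norm"],
    url="https://example.com/lookups/",  # placeholder for about.__lookups_url__
)
print(list(lookups.tables))
```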
|
@@ -1,11 +1,10 @@
from .candidate import Candidate, get_candidates, get_candidates_batch
from .candidate import Candidate, InMemoryCandidate
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB

__all__ = [
    "Candidate",
    "KnowledgeBase",
    "InMemoryCandidate",
    "InMemoryLookupKB",
    "get_candidates",
    "get_candidates_batch",
]
|
@@ -1,15 +1,17 @@
from libcpp.vector cimport vector

from ..typedefs cimport hash_t
from .kb cimport KnowledgeBase
from .kb_in_memory cimport InMemoryLookupKB


# Object used by the Entity Linker that summarizes one entity-alias candidate
# combination.
cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
    cdef float entity_freq
    cdef vector[float] entity_vector
    cdef hash_t alias_hash
    cdef float prior_prob
    pass


cdef class InMemoryCandidate(Candidate):
    cdef readonly hash_t _entity_hash
    cdef readonly hash_t _alias_hash
    cdef vector[float] _entity_vector
    cdef float _prior_prob
    cdef readonly InMemoryLookupKB _kb
    cdef float _entity_freq
|
@ -1,90 +1,98 @@
|
|||
# cython: infer_types=True
|
||||
|
||||
from typing import Iterable
|
||||
from .kb_in_memory cimport InMemoryLookupKB
|
||||
|
||||
from .kb cimport KnowledgeBase
|
||||
|
||||
from ..tokens import Span
|
||||
from ..errors import Errors
|
||||
|
||||
|
||||
cdef class Candidate:
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or
|
||||
may not be resolved to a specific `entity` from a Knowledge Base. This
|
||||
will be used as input for the entity linking algorithm which will
|
||||
disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
||||
"""A `Candidate` object refers to a textual mention that may or may not be resolved
|
||||
to a specific entity from a Knowledge Base. This will be used as input for the entity linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
|
||||
is assigned a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
# Make sure abstract Candidate is not instantiated.
|
||||
if self.__class__ == Candidate:
|
||||
raise TypeError(
|
||||
Errors.E1046.format(cls_name=self.__class__.__name__)
|
||||
)
|
||||
|
||||
@property
|
||||
def entity_id(self) -> int:
|
||||
"""RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
|
||||
otherwise the hash of the entity ID string)."""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def entity_id_(self) -> str:
|
||||
"""RETURNS (str): String representation of entity ID."""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def entity_vector(self) -> vector[float]:
|
||||
"""RETURNS (vector[float]): Entity vector."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
cdef class InMemoryCandidate(Candidate):
|
||||
"""Candidate for InMemoryLookupKB."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
KnowledgeBase kb,
|
||||
entity_hash,
|
||||
entity_freq,
|
||||
entity_vector,
|
||||
alias_hash,
|
||||
prior_prob
|
||||
kb: InMemoryLookupKB,
|
||||
entity_hash: int,
|
||||
alias_hash: int,
|
||||
entity_vector: vector[float],
|
||||
prior_prob: float,
|
||||
entity_freq: float
|
||||
):
|
||||
self.kb = kb
|
||||
self.entity_hash = entity_hash
|
||||
self.entity_freq = entity_freq
|
||||
self.entity_vector = entity_vector
|
||||
self.alias_hash = alias_hash
|
||||
self.prior_prob = prior_prob
|
||||
"""
|
||||
kb (InMemoryLookupKB]): InMemoryLookupKB instance.
|
||||
entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
|
||||
entity_freq (int): Entity frequency in KB corpus.
|
||||
entity_vector (List[float]): Entity embedding.
|
||||
alias_hash (int): Alias hash.
|
||||
prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
|
||||
the context, this alias - which matches one of this entity's aliases - resolves to one this entity.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self._entity_hash = entity_hash
|
||||
self._entity_vector = entity_vector
|
||||
self._prior_prob = prior_prob
|
||||
self._kb = kb
|
||||
self._alias_hash = alias_hash
|
||||
self._entity_freq = entity_freq
|
||||
|
||||
@property
|
||||
def entity(self) -> int:
|
||||
"""RETURNS (uint64): hash of the entity's KB ID/name"""
|
||||
return self.entity_hash
|
||||
def entity_id(self) -> int:
|
||||
return self._entity_hash
|
||||
|
||||
@property
|
||||
def entity_(self) -> str:
|
||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||
return self.kb.vocab.strings[self.entity_hash]
|
||||
|
||||
@property
|
||||
def alias(self) -> int:
|
||||
"""RETURNS (uint64): hash of the alias"""
|
||||
return self.alias_hash
|
||||
|
||||
@property
|
||||
def alias_(self) -> str:
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self.kb.vocab.strings[self.alias_hash]
|
||||
|
||||
@property
|
||||
def entity_freq(self) -> float:
|
||||
return self.entity_freq
|
||||
|
||||
@property
|
||||
def entity_vector(self) -> Iterable[float]:
|
||||
return self.entity_vector
|
||||
def entity_vector(self) -> vector[float]:
|
||||
return self._entity_vector
|
||||
|
||||
@property
|
||||
def prior_prob(self) -> float:
|
||||
return self.prior_prob
|
||||
"""RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
|
||||
this entity."""
|
||||
return self._prior_prob
|
||||
|
||||
@property
|
||||
def alias(self) -> str:
|
||||
"""RETURNS (str): Alias."""
|
||||
return self._kb.vocab.strings[self._alias_hash]
|
||||
|
||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for a given mention and fetching appropriate
|
||||
entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Span): Entity mention for which to identify candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates(mention)
|
||||
@property
|
||||
def entity_id_(self) -> str:
|
||||
return self._kb.vocab.strings[self._entity_hash]
|
||||
|
||||
|
||||
def get_candidates_batch(
|
||||
kb: KnowledgeBase, mentions: Iterable[Span]
|
||||
) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for the given mentions and fetching appropriate entries
|
||||
from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates_batch(mentions)
|
||||
@property
|
||||
def entity_freq(self) -> float:
|
||||
"""RETURNS (float): Entity frequency in KB corpus."""
|
||||
return self._entity_freq
|
||||
|
|
|
@@ -1,14 +1,14 @@
# cython: infer_types=True

from pathlib import Path
from typing import Iterable, Tuple, Union
from typing import Iterable, Iterator, Tuple, Union

from cymem.cymem cimport Pool

from ..errors import Errors
from ..tokens import Span
from ..tokens import SpanGroup
from ..util import SimpleFrozenList
from .candidate import Candidate
from .candidate cimport Candidate


cdef class KnowledgeBase:

@@ -19,6 +19,8 @@ cdef class KnowledgeBase:

    DOCS: https://spacy.io/api/kb
    """
    CandidatesForMentionT = Iterable[Candidate]
    CandidatesForDocT = Iterable[CandidatesForMentionT]

    def __init__(self, vocab: Vocab, entity_vector_length: int):
        """Create a KnowledgeBase."""

@@ -32,27 +34,15 @@ cdef class KnowledgeBase:
        self.entity_vector_length = entity_vector_length
        self.mem = Pool()

    def get_candidates_batch(
        self, mentions: Iterable[Span]
    ) -> Iterable[Iterable[Candidate]]:
    def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[CandidatesForDocT]:
        """
        Return candidate entities for specified texts. Each candidate defines
        the entity, the original alias, and the prior probability of that
        alias resolving to that entity.
        If no candidate is found for a given text, an empty list is returned.
        mentions (Iterable[Span]): Mentions for which to get candidates.
        RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
        """
        return [self.get_candidates(span) for span in mentions]

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        """
        Return candidate entities for specified text. Each candidate defines
        the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If the no candidate is found for a given text, an empty list is returned.
        mention (Span): Mention for which to get candidates.
        RETURNS (Iterable[Candidate]): Identified candidates.
        Return candidate entities for the specified groups of mentions (as SpanGroup) per Doc.
        Each candidate for a mention defines at least the entity and the entity's embedding vector. Depending on the KB
        implementation, further properties - such as the prior probability of the specified mention text resolving to
        that entity - might be included.
        If no candidates are found for a given mention, an empty list is returned.
        mentions (Iterator[SpanGroup]): Mentions for which to get candidates.
        RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per mention/doc/doc batch.
        """
        raise NotImplementedError(
            Errors.E1045.format(

@@ -128,3 +118,10 @@ cdef class KnowledgeBase:
                parent="KnowledgeBase", method="from_disk", name=self.__name__
            )
        )

    @property
    def supports_prior_probs(self) -> bool:
        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
        )
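A minimal sketch of a custom KB written against the new abstract interface, where `get_candidates` consumes an iterator of per-doc `SpanGroup`s and yields one candidate list per mention. The lookup logic is a placeholder, not part of this diff.

```python
# Sketch under the new interface; the actual candidate lookup is left empty.
from typing import Iterable, Iterator

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import SpanGroup


class DummyKB(KnowledgeBase):
    def get_candidates(
        self, mentions: Iterator[SpanGroup]
    ) -> Iterator[Iterable[Iterable[Candidate]]]:
        for mentions_for_doc in mentions:
            # One candidate list per mention; empty lists as a placeholder.
            yield [[] for _ in mentions_for_doc]

    @property
    def supports_prior_probs(self) -> bool:
        return False
```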
|
@@ -1,5 +1,5 @@
# cython: infer_types=True
from typing import Any, Callable, Dict, Iterable
from typing import Any, Callable, Dict, Iterable, Iterator

import srsly


@@ -12,7 +12,7 @@ from preshed.maps cimport PreshMap
import warnings
from pathlib import Path

from ..tokens import Span
from ..tokens import SpanGroup

from ..typedefs cimport hash_t


@@ -23,7 +23,7 @@ from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase

from .candidate import Candidate as Candidate
from .candidate import InMemoryCandidate


cdef class InMemoryLookupKB(KnowledgeBase):

@@ -255,10 +255,11 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        alias_entry.probs = probs
        self._aliases_table[alias_index] = alias_entry

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        return self.get_alias_candidates(mention.text)  # type: ignore
    def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[Iterable[Iterable[InMemoryCandidate]]]:
        for mentions_for_doc in mentions:
            yield [self._get_alias_candidates(span.text) for span in mentions_for_doc]

    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
    def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
        """
        Return candidate entities for an alias. Each candidate defines the
        entity, the original alias, and the prior probability of that alias

@@ -271,18 +272,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        alias_index = <int64_t>self._alias_index.get(alias_hash)
        alias_entry = self._aliases_table[alias_index]

        return [Candidate(kb=self,
        return [
            InMemoryCandidate(
                kb=self,
                entity_hash=self._entries[entry_index].entity_hash,
                entity_freq=self._entries[entry_index].freq,
                entity_vector=self._vectors_table[
                    self._entries[entry_index].vector_index
                ],
                alias_hash=alias_hash,
                prior_prob=prior_prob)
                for (entry_index, prior_prob) in zip(
                    alias_entry.entry_indices, alias_entry.probs
                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
                prior_prob=prior_prob,
                entity_freq=self._entries[entry_index].freq
            )
            if entry_index != 0]
            for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
            if entry_index != 0
        ]

    def get_vector(self, str entity):
        cdef hash_t entity_hash = self.vocab.strings[entity]

@@ -316,6 +317,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):

        return 0.0

    def supports_prior_probs(self) -> bool:
        return True

    def to_bytes(self, **kwargs):
        """Serialize the current state to a binary string.
        """
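A usage sketch for `InMemoryLookupKB` under the new batched candidate API; the entity, alias and probability values below are made up for illustration.

```python
import spacy
from spacy.kb import InMemoryLookupKB
from spacy.tokens import SpanGroup

nlp = spacy.blank("en")
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb.add_entity("Q42", freq=12, entity_vector=[1.0, 0.0, 0.0])  # made-up entity
kb.add_alias("Douglas", ["Q42"], [0.8])

doc = nlp.make_doc("Douglas wrote a book.")
mentions = SpanGroup(doc, spans=[doc[0:1]])
# get_candidates() yields one item per doc; each item holds a candidate list per mention.
for candidates_per_mention in kb.get_candidates(iter([mentions])):
    for candidates in candidates_per_mention:
        for c in candidates:
            print(c.entity_id_, c.alias, c.prior_prob)
```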
|
@@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):


class Icelandic(Language):
    lang = "is"
    lang = "isl"
    Defaults = IcelandicDefaults
|
@@ -17,21 +17,100 @@ DEFAULT_CONFIG = """

[nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer"
mecab_args = ""
"""


@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer():
def create_tokenizer(mecab_args: str):
    def korean_tokenizer_factory(nlp):
        return KoreanTokenizer(nlp.vocab)
        return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)

    return korean_tokenizer_factory


class KoreanTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
        self.vocab = vocab
        mecab = try_mecab_import()
        self.mecab_tokenizer = mecab.Tagger(mecab_args)

    def __reduce__(self):
        return KoreanTokenizer, (self.vocab,)
|
||||
def __call__(self, text: str) -> Doc:
|
||||
dtokens = list(self.detailed_tokens(text))
|
||||
surfaces = [dt["surface"] for dt in dtokens]
|
||||
doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
|
||||
for token, dtoken in zip(doc, dtokens):
|
||||
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
|
||||
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
|
||||
if token.tag_ in TAG_MAP:
|
||||
token.pos = TAG_MAP[token.tag_][POS]
|
||||
else:
|
||||
token.pos = X
|
||||
token.lemma_ = dtoken["lemma"]
|
||||
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
|
||||
return doc
|
||||
|
||||
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
|
||||
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
|
||||
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
|
||||
for line in self.mecab_tokenizer.parse(text).split("\n"):
|
||||
if line == "EOS":
|
||||
break
|
||||
surface, _, expr = line.partition("\t")
|
||||
features = expr.split("/")[0].split(",")
|
||||
tag = features[0]
|
||||
lemma = "*"
|
||||
if len(features) >= 8:
|
||||
lemma = features[7]
|
||||
if lemma == "*":
|
||||
lemma = surface
|
||||
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
||||
|
||||
def score(self, examples):
|
||||
validate_examples(examples, "KoreanTokenizer.score")
|
||||
return Scorer.score_tokenization(examples)
|
||||
|
||||
|
||||
class KoreanDefaults(BaseDefaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Korean(Language):
|
||||
lang = "ko"
|
||||
Defaults = KoreanDefaults
|
||||
|
||||
|
||||
def try_mecab_import():
|
||||
try:
|
||||
import mecab_ko as MeCab
|
||||
|
||||
return MeCab
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
|
||||
"the python package `mecab-ko`: pip install mecab-ko"
|
||||
) from None
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
|
||||
def create_natto_tokenizer():
|
||||
def korean_natto_tokenizer_factory(nlp):
|
||||
return KoreanNattoTokenizer(nlp.vocab)
|
||||
|
||||
return korean_natto_tokenizer_factory
|
||||
|
||||
|
||||
class KoreanNattoTokenizer(DummyTokenizer):
|
||||
def __init__(self, vocab: Vocab):
|
||||
self.vocab = vocab
|
||||
self._mecab = try_mecab_import() # type: ignore[func-returns-value]
|
||||
self._mecab = self._try_mecab_import() # type: ignore[func-returns-value]
|
||||
self._mecab_tokenizer = None
|
||||
|
||||
@property
|
||||
|
@ -47,7 +126,7 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
return self._mecab_tokenizer
|
||||
|
||||
def __reduce__(self):
|
||||
return KoreanTokenizer, (self.vocab,)
|
||||
return KoreanNattoTokenizer, (self.vocab,)
|
||||
|
||||
def __call__(self, text: str) -> Doc:
|
||||
dtokens = list(self.detailed_tokens(text))
|
||||
|
@ -74,7 +153,7 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
feature = node.feature
|
||||
tag, _, expr = feature.partition(",")
|
||||
lemma, _, remainder = expr.partition("/")
|
||||
if lemma == "*":
|
||||
if lemma == "*" or lemma == "":
|
||||
lemma = surface
|
||||
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
||||
|
||||
|
@ -82,28 +161,14 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
validate_examples(examples, "KoreanTokenizer.score")
|
||||
return Scorer.score_tokenization(examples)
|
||||
|
||||
|
||||
class KoreanDefaults(BaseDefaults):
|
||||
config = load_config_from_str(DEFAULT_CONFIG)
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Korean(Language):
|
||||
lang = "ko"
|
||||
Defaults = KoreanDefaults
|
||||
|
||||
|
||||
def try_mecab_import() -> None:
|
||||
def _try_mecab_import(self):
|
||||
try:
|
||||
from natto import MeCab
|
||||
|
||||
return MeCab
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
|
||||
'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
|
||||
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
|
||||
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
|
||||
"and [natto-py](https://github.com/buruzaemon/natto-py)"
|
||||
|
|
|
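A hedged usage sketch for the new `mecab_args` setting on the Korean tokenizer. It requires the `mecab-ko` package, and the dictionary path below is a made-up example.

```python
import spacy

config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy.ko.KoreanTokenizer",
            "mecab_args": "-d /path/to/mecab-ko-dic",  # hypothetical dictionary path
        }
    }
}
nlp = spacy.blank("ko", config=config)
doc = nlp("영등포구에 있는 맛집 좀 알려주세요.")
print([t.text for t in doc])
```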
@@ -3,10 +3,10 @@ from ...language import Language

class MultiLanguage(Language):
    """Language class to be used for models that support multiple languages.
    This module allows models to specify their language ID as 'xx'.
    This module allows models to specify their language ID as 'mul'.
    """

    lang = "xx"
    lang = "mul"


__all__ = ["MultiLanguage"]
|
@@ -16,10 +16,6 @@ URL_PATTERN = (
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
|
@@ -31,7 +31,7 @@ segmenter = "char"
[initialize]

[initialize.tokenizer]
pkuseg_model = null
pkuseg_model = "spacy_ontonotes"
pkuseg_user_dict = "default"
"""
|
|
@ -18,6 +18,7 @@ from typing import (
|
|||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Literal,
|
||||
NoReturn,
|
||||
Optional,
|
||||
Pattern,
|
||||
|
@ -34,7 +35,6 @@ import srsly
|
|||
from thinc.api import Config, CupyOps, Optimizer, get_current_ops
|
||||
|
||||
from . import about, ty, util
|
||||
from .compat import Literal
|
||||
from .errors import Errors, Warnings
|
||||
from .git_info import GIT_VERSION
|
||||
from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
|
@ -52,7 +52,7 @@ from .scorer import Scorer
|
|||
from .tokenizer import Tokenizer
|
||||
from .tokens import Doc
|
||||
from .tokens.underscore import Underscore
|
||||
from .training import Example, validate_examples
|
||||
from .training import Example, validate_distillation_examples, validate_examples
|
||||
from .training.initialize import init_tok2vec, init_vocab
|
||||
from .util import (
|
||||
_DEFAULT_EMPTY_PIPES,
|
||||
|
@ -74,6 +74,9 @@ PipeCallable = Callable[[Doc], Doc]
|
|||
# This is the base config will all settings (training etc.)
|
||||
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
|
||||
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
|
||||
# This is the base config for the [distillation] block and currently not included
|
||||
# in the main config and only added via the 'init fill-config' command
|
||||
DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg"
|
||||
# This is the base config for the [pretraining] block and currently not included
|
||||
# in the main config and only added via the 'init fill-config' command
|
||||
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
|
||||
|
@ -127,13 +130,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
|||
return tokenizer_factory
|
||||
|
||||
|
||||
@registry.misc("spacy.LookupsDataLoader.v1")
|
||||
def load_lookups_data(lang, tables):
|
||||
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
|
||||
lookups = load_lookups(lang=lang, tables=tables)
|
||||
return lookups
|
||||
|
||||
|
||||
class Language:
|
||||
"""A text-processing pipeline. Usually you'll load this once per process,
|
||||
and pass the instance around your application.
|
||||
|
@ -198,8 +194,7 @@ class Language:
|
|||
if not isinstance(vocab, Vocab) and vocab is not True:
|
||||
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
|
||||
if vocab is True:
|
||||
vectors_name = meta.get("vectors", {}).get("name")
|
||||
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
|
||||
vocab = create_vocab(self.lang, self.Defaults)
|
||||
if not create_vectors:
|
||||
vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
|
||||
create_vectors = registry.resolve(vectors_cfg)["vectors"]
|
||||
|
@ -257,7 +252,6 @@ class Language:
|
|||
"width": self.vocab.vectors_length,
|
||||
"vectors": len(self.vocab.vectors),
|
||||
"keys": self.vocab.vectors.n_keys,
|
||||
"name": self.vocab.vectors.name,
|
||||
"mode": self.vocab.vectors.mode,
|
||||
}
|
||||
self._meta["labels"] = dict(self.pipe_labels)
|
||||
|
@ -768,8 +762,8 @@ class Language:
|
|||
*,
|
||||
before: Optional[Union[str, int]] = None,
|
||||
after: Optional[Union[str, int]] = None,
|
||||
first: Optional[bool] = None,
|
||||
last: Optional[bool] = None,
|
||||
first: Optional[Literal[True]] = None,
|
||||
last: Optional[Literal[True]] = None,
|
||||
source: Optional["Language"] = None,
|
||||
config: Dict[str, Any] = SimpleFrozenDict(),
|
||||
raw_config: Optional[Config] = None,
|
||||
|
@ -788,8 +782,8 @@ class Language:
|
|||
component directly before.
|
||||
after (Union[str, int]): Name or index of the component to insert new
|
||||
component directly after.
|
||||
first (bool): If True, insert component first in the pipeline.
|
||||
last (bool): If True, insert component last in the pipeline.
|
||||
first (Optional[Literal[True]]): If True, insert component first in the pipeline.
|
||||
last (Optional[Literal[True]]): If True, insert component last in the pipeline.
|
||||
source (Language): Optional loaded nlp object to copy the pipeline
|
||||
component from.
|
||||
config (Dict[str, Any]): Config parameters to use for this component.
|
||||
|
@ -835,18 +829,22 @@ class Language:
|
|||
self,
|
||||
before: Optional[Union[str, int]] = None,
|
||||
after: Optional[Union[str, int]] = None,
|
||||
first: Optional[bool] = None,
|
||||
last: Optional[bool] = None,
|
||||
first: Optional[Literal[True]] = None,
|
||||
last: Optional[Literal[True]] = None,
|
||||
) -> int:
|
||||
"""Determine where to insert a pipeline component based on the before/
|
||||
after/first/last values.
|
||||
|
||||
before (str): Name or index of the component to insert directly before.
|
||||
after (str): Name or index of component to insert directly after.
|
||||
first (bool): If True, insert component first in the pipeline.
|
||||
last (bool): If True, insert component last in the pipeline.
|
||||
first (Optional[Literal[True]]): If True, insert component first in the pipeline.
|
||||
last (Optional[Literal[True]]): If True, insert component last in the pipeline.
|
||||
RETURNS (int): The index of the new pipeline component.
|
||||
"""
|
||||
if first is not None and first is not True:
|
||||
raise ValueError(Errors.E4009.format(attr="first", value=first))
|
||||
if last is not None and last is not True:
|
||||
raise ValueError(Errors.E4009.format(attr="last", value=last))
|
||||
all_args = {"before": before, "after": after, "first": first, "last": last}
|
||||
if sum(arg is not None for arg in [before, after, first, last]) >= 2:
|
||||
raise ValueError(
|
||||
|
@ -1056,6 +1054,116 @@ class Language:
|
|||
raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
|
||||
return doc
|
||||
|
||||
def distill(
|
||||
self,
|
||||
teacher: "Language",
|
||||
examples: Iterable[Example],
|
||||
*,
|
||||
drop: float = 0.0,
|
||||
sgd: Union[Optimizer, None, Literal[False]] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
annotates: Iterable[str] = SimpleFrozenList(),
|
||||
student_to_teacher: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""Distill the models in a student pipeline from a teacher pipeline.
|
||||
teacher (Language): Teacher to distill from.
|
||||
examples (Iterable[Example]): Distillation examples. The reference
|
||||
(teacher) and predicted (student) docs must have the same number of
|
||||
tokens and the same orthography.
|
||||
drop (float): The dropout rate.
|
||||
sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
|
||||
be created via create_optimizer if 'None'. No optimizer will
|
||||
be used when set to 'False'.
|
||||
losses (Optional(Dict[str, float])): Dictionary to update with the loss,
|
||||
keyed by component.
|
||||
component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
|
||||
for specific pipeline components, keyed by component name.
|
||||
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
||||
annotates (Iterable[str]): Names of components that should set
|
||||
annotations on the predicted examples after updating.
|
||||
student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to
|
||||
teacher pipe name, only needed for pipes where the student pipe
|
||||
name does not match the teacher pipe name.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary
|
||||
|
||||
DOCS: https://spacy.io/api/language#distill
|
||||
"""
|
||||
if student_to_teacher is None:
|
||||
student_to_teacher = {}
|
||||
if losses is None:
|
||||
losses = {}
|
||||
if isinstance(examples, list) and len(examples) == 0:
|
||||
return losses
|
||||
|
||||
validate_distillation_examples(examples, "Language.distill")
|
||||
examples = _copy_examples(examples, copy_x=True, copy_y=True)
|
||||
|
||||
if sgd is None:
|
||||
if self._optimizer is None:
|
||||
self._optimizer = self.create_optimizer()
|
||||
sgd = self._optimizer
|
||||
|
||||
if component_cfg is None:
|
||||
component_cfg = {}
|
||||
pipe_kwargs = {}
|
||||
for student_name, student_proc in self.pipeline:
|
||||
component_cfg.setdefault(student_name, {})
|
||||
pipe_kwargs[student_name] = deepcopy(component_cfg[student_name])
|
||||
component_cfg[student_name].setdefault("drop", drop)
|
||||
pipe_kwargs[student_name].setdefault("batch_size", self.batch_size)
|
||||
|
||||
teacher_pipes = dict(teacher.pipeline)
|
||||
for student_name, student_proc in self.pipeline:
|
||||
if student_name in annotates:
|
||||
for doc, eg in zip(
|
||||
_pipe(
|
||||
(eg.predicted for eg in examples),
|
||||
proc=student_proc,
|
||||
name=student_name,
|
||||
default_error_handler=self.default_error_handler,
|
||||
kwargs=pipe_kwargs[student_name],
|
||||
),
|
||||
examples,
|
||||
):
|
||||
eg.predicted = doc
|
||||
|
||||
if (
|
||||
student_name not in exclude
|
||||
and isinstance(student_proc, ty.DistillableComponent)
|
||||
and student_proc.is_distillable
|
||||
):
|
||||
# A missing teacher pipe is not an error, some student pipes
|
||||
# do not need a teacher, such as tok2vec layer losses.
|
||||
teacher_name = (
|
||||
student_to_teacher[student_name]
|
||||
if student_name in student_to_teacher
|
||||
else student_name
|
||||
)
|
||||
teacher_pipe = teacher_pipes.get(teacher_name, None)
|
||||
student_proc.distill(
|
||||
teacher_pipe,
|
||||
examples,
|
||||
sgd=None,
|
||||
losses=losses,
|
||||
**component_cfg[student_name],
|
||||
)
|
||||
|
||||
# Only finish the update after all component updates are done. Some
|
||||
# components may share weights (such as tok2vec) and we only want
|
||||
# to apply weight updates after all gradients are accumulated.
|
||||
for student_name, student_proc in self.pipeline:
|
||||
if (
|
||||
student_name not in exclude
|
||||
and isinstance(student_proc, ty.DistillableComponent)
|
||||
and student_proc.is_distillable
|
||||
and sgd not in (None, False)
|
||||
):
|
||||
student_proc.finish_update(sgd)
|
||||
|
||||
return losses
|
||||
|
||||
def disable_pipes(self, *names) -> "DisabledPipes":
|
||||
"""Disable one or more pipeline components. If used as a context
|
||||
manager, the pipeline will be restored to the initial state at the end
|
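A minimal usage sketch for the new `Language.distill` API. The teacher and student package names are assumptions, and both pipelines must produce identical tokenization, as the docstring above requires.

```python
import spacy
from spacy.training import Example

teacher = spacy.load("en_core_web_lg")  # assumed teacher pipeline
student = spacy.load("en_core_web_sm")  # assumed student pipeline

texts = ["Distillation nudges the student towards the teacher's predictions."]
# Example(predicted, reference): predicted doc from the student, reference from the teacher.
examples = [Example(student.make_doc(t), teacher.make_doc(t)) for t in texts]

optimizer = student.create_optimizer()
losses = student.distill(teacher, examples, sgd=optimizer)
print(losses)
```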
||||
|
@ -1144,7 +1252,7 @@ class Language:
|
|||
_: Optional[Any] = None,
|
||||
*,
|
||||
drop: float = 0.0,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
sgd: Union[Optimizer, None, Literal[False]] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
|
@ -1155,7 +1263,9 @@ class Language:
|
|||
examples (Iterable[Example]): A batch of examples
|
||||
_: Should not be set - serves to catch backwards-incompatible scripts.
|
||||
drop (float): The dropout rate.
|
||||
sgd (Optimizer): An optimizer.
|
||||
sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
|
||||
be created via create_optimizer if 'None'. No optimizer will
|
||||
be used when set to 'False'.
|
||||
losses (Dict[str, float]): Dictionary to update with the loss, keyed by
|
||||
component.
|
||||
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
|
||||
|
@ -1188,17 +1298,12 @@ class Language:
|
|||
component_cfg[name].setdefault("drop", drop)
|
||||
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
|
||||
for name, proc in self.pipeline:
|
||||
# ignore statements are used here because mypy ignores hasattr
|
||||
if name not in exclude and hasattr(proc, "update"):
|
||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore
|
||||
if sgd not in (None, False):
|
||||
if (
|
||||
name not in exclude
|
||||
and isinstance(proc, ty.TrainableComponent)
|
||||
and proc.is_trainable
|
||||
and proc.model not in (True, False, None)
|
||||
):
|
||||
proc.finish_update(sgd)
|
||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
||||
if name in annotates:
|
||||
for doc, eg in zip(
|
||||
_pipe(
|
||||
|
@ -1211,6 +1316,18 @@ class Language:
|
|||
examples,
|
||||
):
|
||||
eg.predicted = doc
|
||||
# Only finish the update after all component updates are done. Some
|
||||
# components may share weights (such as tok2vec) and we only want
|
||||
# to apply weight updates after all gradients are accumulated.
|
||||
for name, proc in self.pipeline:
|
||||
if (
|
||||
name not in exclude
|
||||
and isinstance(proc, ty.TrainableComponent)
|
||||
and proc.is_trainable
|
||||
and sgd not in (None, False)
|
||||
):
|
||||
proc.finish_update(sgd)
|
||||
|
||||
return losses
|
||||
|
||||
def rehearse(
|
||||
|
@ -1277,25 +1394,20 @@ class Language:
|
|||
sgd(key, W, dW) # type: ignore[call-arg, misc]
|
||||
return losses
|
||||
|
||||
def begin_training(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
warnings.warn(Warnings.W089, DeprecationWarning)
|
||||
return self.initialize(get_examples, sgd=sgd)
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
labels: Optional[Dict[str, Any]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization,
|
||||
using the names of the pipes as keys. Overrides labels that are in
|
||||
the model configuration.
|
||||
sgd (Optional[Optimizer]): An optimizer to use for updates. If not
|
||||
provided, will be created using the .create_optimizer() method.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
@ -1343,6 +1455,8 @@ class Language:
|
|||
for name, proc in self.pipeline:
|
||||
if isinstance(proc, ty.InitializableComponent):
|
||||
p_settings = I["components"].get(name, {})
|
||||
if labels is not None and name in labels:
|
||||
p_settings["labels"] = labels[name]
|
||||
p_settings = validate_init_settings(
|
||||
proc.initialize, p_settings, section="components", name=name
|
||||
)
|
||||
|
@ -1816,6 +1930,7 @@ class Language:
|
|||
# using the nlp.config with all defaults.
|
||||
config = util.copy_config(config)
|
||||
orig_pipeline = config.pop("components", {})
|
||||
orig_distill = config.pop("distillation", None)
|
||||
orig_pretraining = config.pop("pretraining", None)
|
||||
config["components"] = {}
|
||||
if auto_fill:
|
||||
|
@ -1824,6 +1939,9 @@ class Language:
|
|||
filled = config
|
||||
filled["components"] = orig_pipeline
|
||||
config["components"] = orig_pipeline
|
||||
if orig_distill is not None:
|
||||
filled["distillation"] = orig_distill
|
||||
config["distillation"] = orig_distill
|
||||
if orig_pretraining is not None:
|
||||
filled["pretraining"] = orig_pretraining
|
||||
config["pretraining"] = orig_pretraining
|
||||
|
@ -2176,9 +2294,6 @@ class Language:
|
|||
if path.exists():
|
||||
data = srsly.read_json(path)
|
||||
self.meta.update(data)
|
||||
# self.meta always overrides meta["vectors"] with the metadata
|
||||
# from self.vocab.vectors, so set the name directly
|
||||
self.vocab.vectors.name = data.get("vectors", {}).get("name")
|
||||
|
||||
def deserialize_vocab(path: Path) -> None:
|
||||
if path.exists():
|
||||
|
@ -2247,9 +2362,6 @@ class Language:
|
|||
def deserialize_meta(b):
|
||||
data = srsly.json_loads(b)
|
||||
self.meta.update(data)
|
||||
# self.meta always overrides meta["vectors"] with the metadata
|
||||
# from self.vocab.vectors, so set the name directly
|
||||
self.vocab.vectors.name = data.get("vectors", {}).get("name")
|
||||
|
||||
deserializers: Dict[str, Callable[[bytes], Any]] = {}
|
||||
deserializers["config.cfg"] = lambda b: self.config.from_bytes(
|
||||
|
@ -2316,13 +2428,18 @@ class DisabledPipes(list):
|
|||
self[:] = []
|
||||
|
||||
|
||||
def _copy_examples(examples: Iterable[Example]) -> List[Example]:
|
||||
def _copy_examples(
|
||||
examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False
|
||||
) -> List[Example]:
|
||||
"""Make a copy of a batch of examples, copying the predicted Doc as well.
|
||||
This is used in contexts where we need to take ownership of the examples
|
||||
so that they can be mutated, for instance during Language.evaluate and
|
||||
Language.update.
|
||||
"""
|
||||
return [Example(eg.x.copy(), eg.y) for eg in examples]
|
||||
return [
|
||||
Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y)
|
||||
for eg in examples
|
||||
]
|
||||
|
||||
|
||||
def _apply_pipes(
|
||||
|
|
|
@ -12,7 +12,6 @@ from .attrs cimport (
|
|||
SUFFIX,
|
||||
attr_id_t,
|
||||
)
|
||||
from .strings cimport StringStore
|
||||
from .structs cimport LexemeC
|
||||
from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
|
||||
from .vocab cimport Vocab
|
||||
|
|
|
@ -19,7 +19,6 @@ class Lexeme:
|
|||
def vector_norm(self) -> float: ...
|
||||
vector: Floats1d
|
||||
rank: int
|
||||
sentiment: float
|
||||
@property
|
||||
def orth_(self) -> str: ...
|
||||
@property
|
||||
|
|
|
@ -57,7 +57,7 @@ cdef class Lexeme:
|
|||
"""
|
||||
self.vocab = vocab
|
||||
self.orth = orth
|
||||
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
|
||||
self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
|
||||
if self.c.orth != orth:
|
||||
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
|
||||
|
||||
|
@ -193,20 +193,6 @@ cdef class Lexeme:
|
|||
def rank(self, value):
|
||||
self.c.id = value
|
||||
|
||||
@property
|
||||
def sentiment(self):
|
||||
"""RETURNS (float): A scalar value indicating the positivity or
|
||||
negativity of the lexeme."""
|
||||
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
|
||||
return sentiment_table.get(self.c.orth, 0.0)
|
||||
|
||||
@sentiment.setter
|
||||
def sentiment(self, float x):
|
||||
if "lexeme_sentiment" not in self.vocab.lookups:
|
||||
self.vocab.lookups.add_table("lexeme_sentiment")
|
||||
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
|
||||
sentiment_table[self.c.orth] = x
|
||||
|
||||
@property
|
||||
def orth_(self):
|
||||
"""RETURNS (str): The original verbatim text of the lexeme
|
||||
|
|
|
@@ -2,16 +2,40 @@ from collections import OrderedDict
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import requests
import srsly
from preshed.bloom import BloomFilter

from .errors import Errors
from .strings import get_string_id
from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
from .util import SimpleFrozenDict, ensure_path, load_language_data, logger, registry

UNSET = object()


@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
    logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
    lookups = load_lookups(lang=lang, tables=tables)
    return lookups


@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
def load_lookups_data_from_url(lang, tables, url):
    logger.debug(f"Loading lookups from {url}: {tables}")
    lookups = Lookups()
    for table in tables:
        table_url = url + lang + "_" + table + ".json"
        r = requests.get(table_url)
        if r.status_code != 200:
            raise ValueError(
                Errors.E4011.format(status_code=r.status_code, url=table_url)
            )
        table_data = r.json()
        lookups.add_table(table, table_data)
    return lookups


def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
    """Load the data from the spacy-lookups-data package for a given language,
    if available. Returns an empty `Lookups` container if there's no data or if the package
||||
|
|
|
@ -175,9 +175,9 @@ cdef class DependencyMatcher:
|
|||
on_match (callable): Optional callback executed on match.
|
||||
"""
|
||||
if on_match is not None and not hasattr(on_match, "__call__"):
|
||||
raise ValueError(Errors.E171.format(arg_type=type(on_match)))
|
||||
if patterns is None or not isinstance(patterns, List): # old API
|
||||
raise ValueError(Errors.E948.format(arg_type=type(patterns)))
|
||||
raise ValueError(Errors.E171.format(name="DependencyMatcher", arg_type=type(on_match)))
|
||||
if patterns is None or not isinstance(patterns, List):
|
||||
raise ValueError(Errors.E948.format(name="DependencyMatcher", arg_type=type(patterns)))
|
||||
for pattern in patterns:
|
||||
if len(pattern) == 0:
|
||||
raise ValueError(Errors.E012.format(key=key))
|
||||
|
|
|
@ -5,13 +5,13 @@ from typing import (
|
|||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
overload,
|
||||
)
|
||||
|
||||
from ..compat import Literal
|
||||
from ..tokens import Doc, Span
|
||||
from ..vocab import Vocab
|
||||
|
||||
|
|
|
@ -20,6 +20,12 @@ from ..tokens.span cimport Span
|
|||
from ..tokens.token cimport Token
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
from ..errors import Errors, MatchPatternError, Warnings
|
||||
from ..schemas import validate_token_pattern
|
||||
from .levenshtein import levenshtein_compare
|
||||
|
||||
from ..strings cimport get_string_id
|
||||
|
||||
from ..attrs import IDS
|
||||
from ..errors import Errors, MatchPatternError, Warnings
|
||||
from ..schemas import validate_token_pattern
|
||||
|
@ -113,9 +119,9 @@ cdef class Matcher:
|
|||
"""
|
||||
errors = {}
|
||||
if on_match is not None and not hasattr(on_match, "__call__"):
|
||||
raise ValueError(Errors.E171.format(arg_type=type(on_match)))
|
||||
if patterns is None or not isinstance(patterns, List): # old API
|
||||
raise ValueError(Errors.E948.format(arg_type=type(patterns)))
|
||||
raise ValueError(Errors.E171.format(name="Matcher", arg_type=type(on_match)))
|
||||
if patterns is None or not isinstance(patterns, List):
|
||||
raise ValueError(Errors.E948.format(name="Matcher", arg_type=type(patterns)))
|
||||
if greedy is not None and greedy not in ["FIRST", "LONGEST"]:
|
||||
raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy))
|
||||
for i, pattern in enumerate(patterns):
|
||||
|
@ -275,6 +281,10 @@ cdef class Matcher:
|
|||
# non-overlapping ones this `match` can be either (start, end) or
|
||||
# (start, end, alignments) depending on `with_alignments=` option.
|
||||
for key, *match in matches:
|
||||
# Adjust span matches to doc offsets
|
||||
if isinstance(doclike, Span):
|
||||
match[0] += doclike.start
|
||||
match[1] += doclike.start
|
||||
span_filter = self._filter.get(key)
|
||||
if span_filter is not None:
|
||||
pairs = pairs_by_id.get(key, [])
|
||||
|
@ -305,9 +315,6 @@ cdef class Matcher:
|
|||
if as_spans:
|
||||
final_results = []
|
||||
for key, start, end, *_ in final_matches:
|
||||
if isinstance(doclike, Span):
|
||||
start += doclike.start
|
||||
end += doclike.start
|
||||
final_results.append(Span(doc, start, end, label=key))
|
||||
elif with_alignments:
|
||||
# convert alignments List[Dict[str, int]] --> List[int]
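A usage sketch of the behavior change above: when the `Matcher` is called on a `Span`, the returned match offsets are now adjusted to the parent `Doc` for plain tuple matches as well, not only when `as_spans=True` is passed.

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("FOX", [[{"LOWER": "fox"}]])

doc = nlp("The quick brown fox jumps over the lazy dog.")
span = doc[2:6]  # "brown fox jumps over"
for match_id, start, end in matcher(span):
    print(doc[start:end].text)  # offsets are relative to `doc`, not to `span`
```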
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload
|
||||
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, overload
|
||||
|
||||
from ..compat import Literal
|
||||
from ..tokens import Doc, Span
|
||||
from ..vocab import Vocab
|
||||
from .matcher import Matcher
|
||||
|
@ -21,6 +20,15 @@ class PhraseMatcher:
|
|||
Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
|
||||
] = ...,
|
||||
) -> None: ...
|
||||
def _add_from_arrays(
|
||||
self,
|
||||
key: str,
|
||||
specs: List[List[int]],
|
||||
*,
|
||||
on_match: Optional[
|
||||
Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
|
||||
] = ...,
|
||||
) -> None: ...
|
||||
def remove(self, key: str) -> None: ...
|
||||
@overload
|
||||
def __call__(
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
# cython: infer_types=True
|
||||
from collections import defaultdict
|
||||
from typing import List
|
||||
|
||||
from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
|
||||
|
||||
import warnings
|
||||
|
@ -39,7 +42,7 @@ cdef class PhraseMatcher:
|
|||
"""
|
||||
self.vocab = vocab
|
||||
self._callbacks = {}
|
||||
self._docs = {}
|
||||
self._docs = defaultdict(set)
|
||||
self._validate = validate
|
||||
|
||||
self.mem = Pool()
|
||||
|
@ -155,41 +158,69 @@ cdef class PhraseMatcher:
|
|||
del self._callbacks[key]
|
||||
del self._docs[key]
|
||||
|
||||
def add(self, key, docs, *_docs, on_match=None):
|
||||
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
|
||||
key, an on_match callback, and one or more patterns.
|
||||
|
||||
Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the
|
||||
second argument, with the on_match callback as an optional keyword
|
||||
argument.
|
||||
def _add_from_arrays(self, key, specs, *, on_match=None):
|
||||
"""Add a preprocessed list of specs, with an optional callback.
|
||||
|
||||
key (str): The match ID.
|
||||
docs (list): List of `Doc` objects representing match patterns.
|
||||
specs (List[List[int]]): A list of lists of hashes to match.
|
||||
on_match (callable): Callback executed on match.
|
||||
*_docs (Doc): For backwards compatibility: list of patterns to add
|
||||
as variable arguments. Will be ignored if a list of patterns is
|
||||
provided as the second argument.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher#add
|
||||
"""
|
||||
if docs is None or hasattr(docs, "__call__"): # old API
|
||||
on_match = docs
|
||||
docs = _docs
|
||||
|
||||
_ = self.vocab[key]
|
||||
self._callbacks[key] = on_match
|
||||
self._docs.setdefault(key, set())
|
||||
|
||||
cdef MapStruct* current_node
|
||||
cdef MapStruct* internal_node
|
||||
cdef void* result
|
||||
|
||||
self._callbacks[key] = on_match
|
||||
for spec in specs:
|
||||
self._docs[key].add(tuple(spec))
|
||||
|
||||
current_node = self.c_map
|
||||
for token in spec:
|
||||
if token == self._terminal_hash:
|
||||
warnings.warn(Warnings.W021)
|
||||
break
|
||||
result = <MapStruct*>map_get(current_node, token)
|
||||
if not result:
|
||||
internal_node = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
|
||||
map_init(self.mem, internal_node, 8)
|
||||
map_set(self.mem, current_node, token, internal_node)
|
||||
result = internal_node
|
||||
current_node = <MapStruct*>result
|
||||
result = <MapStruct*>map_get(current_node, self._terminal_hash)
|
||||
if not result:
|
||||
internal_node = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
|
||||
map_init(self.mem, internal_node, 8)
|
||||
map_set(self.mem, current_node, self._terminal_hash, internal_node)
|
||||
result = internal_node
|
||||
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
|
||||
|
||||
def add(self, key, docs, *, on_match=None):
|
||||
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
|
||||
key, a list of one or more patterns, and (optionally) an on_match callback.
|
||||
|
||||
key (str): The match ID.
|
||||
docs (list): List of `Doc` objects representing match patterns.
|
||||
on_match (callable): Callback executed on match.
|
||||
|
||||
If any of the input Docs are invalid, no internal state will be updated.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher#add
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
raise ValueError(Errors.E179.format(key=key))
|
||||
if docs is None or not isinstance(docs, List):
|
||||
raise ValueError(Errors.E948.format(name="PhraseMatcher", arg_type=type(docs)))
|
||||
if on_match is not None and not hasattr(on_match, "__call__"):
|
||||
raise ValueError(Errors.E171.format(name="PhraseMatcher", arg_type=type(on_match)))
|
||||
|
||||
_ = self.vocab[key]
|
||||
specs = []
|
||||
|
||||
for doc in docs:
|
||||
if len(doc) == 0:
|
||||
continue
|
||||
if isinstance(doc, Doc):
|
||||
if not isinstance(doc, Doc):
|
||||
raise ValueError(Errors.E4000.format(type=type(doc)))
|
||||
|
||||
attrs = (TAG, POS, MORPH, LEMMA, DEP)
|
||||
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||
for attr in attrs:
|
||||
|
@ -208,30 +239,9 @@ cdef class PhraseMatcher:
|
|||
and self.attr not in attrs:
|
||||
string_attr = self.vocab.strings[self.attr]
|
||||
warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
|
||||
keyword = self._convert_to_array(doc)
|
||||
else:
|
||||
keyword = doc
|
||||
self._docs[key].add(tuple(keyword))
|
||||
specs.append(self._convert_to_array(doc))
|
||||
|
||||
current_node = self.c_map
|
||||
for token in keyword:
|
||||
if token == self._terminal_hash:
|
||||
warnings.warn(Warnings.W021)
|
||||
break
|
||||
result = <MapStruct*>map_get(current_node, token)
|
||||
if not result:
|
||||
internal_node = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
|
||||
map_init(self.mem, internal_node, 8)
|
||||
map_set(self.mem, current_node, token, internal_node)
|
||||
result = internal_node
|
||||
current_node = <MapStruct*>result
|
||||
result = <MapStruct*>map_get(current_node, self._terminal_hash)
|
||||
if not result:
|
||||
internal_node = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
|
||||
map_init(self.mem, internal_node, 8)
|
||||
map_set(self.mem, current_node, self._terminal_hash, internal_node)
|
||||
result = internal_node
|
||||
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
|
||||
self._add_from_arrays(key, specs, on_match=on_match)
|
||||
|
||||
def __call__(self, object doclike, *, as_spans=False):
|
||||
"""Find all sequences matching the supplied patterns on the `Doc`.
|
||||
|
@ -345,7 +355,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
|
|||
matcher = PhraseMatcher(vocab, attr=attr)
|
||||
for key, specs in docs.items():
|
||||
callback = callbacks.get(key, None)
|
||||
matcher.add(key, specs, on_match=callback)
|
||||
matcher._add_from_arrays(key, specs, on_match=callback)
|
||||
return matcher
|
||||
|
||||
|
||||
|
|
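The reworked `add` shown above takes the patterns as a list and makes `on_match` keyword-only, with `_add_from_arrays` as the internal path used for unpickling. A minimal usage sketch (the key name, terms, and callback are illustrative, not taken from this diff):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

    def on_match(matcher, doc, i, matches):
        # matches holds (key, start, end) tuples; i is the current match index
        _, start, end = matches[i]
        print("matched:", doc[start:end].text)

    patterns = [nlp.make_doc(text) for text in ["Barack Obama", "Angela Merkel"]]
    matcher.add("PERSON", patterns, on_match=on_match)

    doc = nlp.make_doc("barack obama met angela merkel")
    spans = matcher(doc, as_spans=True)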
|
@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
|
|||
"update",
|
||||
"rehearse",
|
||||
"get_loss",
|
||||
"get_teacher_student_loss",
|
||||
"initialize",
|
||||
"begin_update",
|
||||
"finish_update",
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from pathlib import Path
|
||||
from typing import Callable, Iterable, List, Optional, Tuple
|
||||
from typing import Callable, Iterable, Iterator, List, Optional, Tuple
|
||||
|
||||
from thinc.api import (
|
||||
Linear,
|
||||
|
@ -15,18 +15,15 @@ from thinc.api import (
|
|||
from thinc.types import Floats2d
|
||||
|
||||
from ...errors import Errors
|
||||
from ...kb import (
|
||||
Candidate,
|
||||
InMemoryLookupKB,
|
||||
KnowledgeBase,
|
||||
get_candidates,
|
||||
get_candidates_batch,
|
||||
)
|
||||
from ...tokens import Doc, Span
|
||||
from ...kb import Candidate, InMemoryLookupKB, KnowledgeBase
|
||||
from ...tokens import Doc, Span, SpanGroup
|
||||
from ...util import registry
|
||||
from ...vocab import Vocab
|
||||
from ..extract_spans import extract_spans
|
||||
|
||||
CandidatesForMentionT = Iterable[Candidate]
|
||||
CandidatesForDocT = Iterable[CandidatesForMentionT]
|
||||
|
||||
|
||||
@registry.architectures("spacy.EntityLinker.v2")
|
||||
def build_nel_encoder(
|
||||
|
@ -123,12 +120,38 @@ def empty_kb(
|
|||
|
||||
|
||||
@registry.misc("spacy.CandidateGenerator.v1")
|
||||
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
|
||||
def create_get_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
|
||||
return get_candidates
|
||||
|
||||
|
||||
@registry.misc("spacy.CandidateBatchGenerator.v1")
|
||||
def create_candidates_batch() -> Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
@registry.misc("spacy.CandidateGenerator.v2")
|
||||
def create_get_candidates_v2() -> Callable[
|
||||
[KnowledgeBase, Iterator[SpanGroup]], Iterator[CandidatesForDocT]
|
||||
]:
|
||||
return get_candidates_batch
|
||||
return get_candidates_v2
|
||||
|
||||
|
||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for the given mention from the KB.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Span): Entity mention.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates for specified mention.
|
||||
"""
|
||||
cands_per_doc = next(
|
||||
get_candidates_v2(kb, iter([SpanGroup(mention.doc, spans=[mention])]))
|
||||
)
|
||||
assert isinstance(cands_per_doc, list)
|
||||
return next(cands_per_doc[0])
|
||||
|
||||
|
||||
def get_candidates_v2(
|
||||
kb: KnowledgeBase, mentions: Iterator[SpanGroup]
|
||||
) -> Iterator[Iterable[Iterable[Candidate]]]:
|
||||
"""
|
||||
Return candidate entities for the given mentions from the KB.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mentions (Iterator[SpanGroup]): Mentions per doc.
|
||||
RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per mentions in document/SpanGroup.
|
||||
"""
|
||||
return kb.get_candidates(mentions)
|
||||
|
|
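The v2 generator registered above consumes an iterator of per-document SpanGroups and yields, per document, one iterable of candidates per mention. A hedged sketch of a custom generator with the same shape; the registry name and the per-span `get_alias_candidates` lookup (the v3-style helper) are illustrative assumptions, not part of this diff:

    from typing import Callable, Iterable, Iterator

    from spacy.kb import Candidate, KnowledgeBase
    from spacy.tokens import SpanGroup
    from spacy.util import registry

    @registry.misc("my_candidate_generator.v1")
    def create_my_generator() -> Callable[
        [KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]
    ]:
        def get_candidates(kb: KnowledgeBase, mentions: Iterator[SpanGroup]):
            for doc_mentions in mentions:
                # one inner list of candidates per mention in this document
                yield [kb.get_alias_candidates(span.text) for span in doc_mentions]

        return get_candidates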
|
@ -1,9 +1,8 @@
|
|||
from typing import List, Optional, cast
|
||||
from typing import List, Literal, Optional
|
||||
|
||||
from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ...compat import Literal
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc
|
||||
from ...util import registry
|
||||
|
|
|
@ -19,6 +19,7 @@ from thinc.api import (
|
|||
clone,
|
||||
concatenate,
|
||||
list2ragged,
|
||||
noop,
|
||||
reduce_first,
|
||||
reduce_last,
|
||||
reduce_max,
|
||||
|
@ -148,55 +149,26 @@ def build_text_classifier_v2(
|
|||
linear_model: Model[List[Doc], Floats2d],
|
||||
nO: Optional[int] = None,
|
||||
) -> Model[List[Doc], Floats2d]:
|
||||
# TODO: build the model with _build_parametric_attention_with_residual_nonlinear
|
||||
# in spaCy v4. We don't do this in spaCy v3 to preserve model
|
||||
# compatibility.
|
||||
exclusive_classes = not linear_model.attrs["multi_label"]
|
||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||
width = tok2vec.maybe_get_dim("nO")
|
||||
attention_layer = ParametricAttention(width)
|
||||
maxout_layer = Maxout(nO=width, nI=width)
|
||||
norm_layer = LayerNorm(nI=width)
|
||||
cnn_model = (
|
||||
tok2vec
|
||||
>> list2ragged()
|
||||
>> attention_layer
|
||||
>> reduce_sum()
|
||||
>> residual(maxout_layer >> norm_layer >> Dropout(0.0))
|
||||
exclusive_classes = not linear_model.attrs["multi_label"]
|
||||
parametric_attention = _build_parametric_attention_with_residual_nonlinear(
|
||||
tok2vec=tok2vec,
|
||||
nonlinear_layer=Maxout(nI=width, nO=width),
|
||||
key_transform=noop(),
|
||||
)
|
||||
|
||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||
nO_double = nO * 2 if nO else None
|
||||
if exclusive_classes:
|
||||
output_layer = Softmax(nO=nO, nI=nO_double)
|
||||
else:
|
||||
output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
|
||||
model = (linear_model | cnn_model) >> output_layer
|
||||
model = (linear_model | parametric_attention) >> output_layer
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
if model.has_dim("nO") is not False and nO is not None:
|
||||
model.set_dim("nO", cast(int, nO))
|
||||
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
||||
model.set_ref("attention_layer", attention_layer)
|
||||
model.set_ref("maxout_layer", maxout_layer)
|
||||
model.set_ref("norm_layer", norm_layer)
|
||||
model.attrs["multi_label"] = not exclusive_classes
|
||||
|
||||
model.init = init_ensemble_textcat # type: ignore[assignment]
|
||||
return model
|
||||
|
||||
|
||||
def init_ensemble_textcat(model, X, Y) -> Model:
|
||||
# When tok2vec is lazily initialized, we need to initialize it before
|
||||
# the rest of the chain to ensure that we can get its width.
|
||||
tok2vec = model.get_ref("tok2vec")
|
||||
tok2vec.initialize(X)
|
||||
|
||||
tok2vec_width = get_tok2vec_width(model)
|
||||
model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
|
||||
model.get_ref("maxout_layer").set_dim("nO", tok2vec_width)
|
||||
model.get_ref("maxout_layer").set_dim("nI", tok2vec_width)
|
||||
model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
|
||||
model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
|
||||
init_chain(model, X, Y)
|
||||
return model
|
||||
|
||||
|
||||
|
@ -284,7 +256,9 @@ def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
|
|||
|
||||
tok2vec_width = get_tok2vec_width(model)
|
||||
model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
|
||||
if model.get_ref("key_transform").has_dim("nI") is None:
|
||||
model.get_ref("key_transform").set_dim("nI", tok2vec_width)
|
||||
if model.get_ref("key_transform").has_dim("nO") is None:
|
||||
model.get_ref("key_transform").set_dim("nO", tok2vec_width)
|
||||
model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
|
||||
model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
|
||||
|
|
|
@ -21,7 +21,7 @@ from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
|
|||
|
||||
from ...attrs import intify_attr
|
||||
from ...errors import Errors
|
||||
from ...ml import _character_embed
|
||||
from ...ml import character_embed
|
||||
from ...pipeline.tok2vec import Tok2VecListener
|
||||
from ...tokens import Doc
|
||||
from ...util import registry
|
||||
|
@ -241,7 +241,7 @@ def CharacterEmbed(
|
|||
if feature is None:
|
||||
raise ValueError(Errors.E911.format(feat=feature))
|
||||
char_embed = chain(
|
||||
_character_embed.CharacterEmbed(nM=nM, nC=nC),
|
||||
character_embed.CharacterEmbed(nM=nM, nC=nC),
|
||||
cast(Model[List[Floats2d], Ragged], list2ragged()),
|
||||
)
|
||||
feature_extractor: Model[List[Doc], Ragged] = chain(
|
||||
|
|
|
@ -40,16 +40,10 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
|
|||
|
||||
cdef void free_activations(const ActivationsC* A) nogil
|
||||
|
||||
cdef void predict_states(
|
||||
CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
|
||||
) nogil
|
||||
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
|
||||
|
||||
cdef void cpu_log_loss(
|
||||
float* d_scores,
|
||||
const float* costs,
|
||||
const int* is_valid,
|
||||
const float* scores,
|
||||
int O
|
||||
) nogil
|
||||
cdef void cpu_log_loss(float* d_scores, const float* costs,
|
||||
const int* is_valid, const float* scores, int O) nogil
|
||||
|
|
|
@ -5,11 +5,10 @@ from libc.math cimport exp
|
|||
from libc.stdlib cimport calloc, free, realloc
|
||||
from libc.string cimport memcpy, memset
|
||||
from thinc.backends.cblas cimport saxpy, sgemm
|
||||
from thinc.backends.linalg cimport Vec, VecVec
|
||||
|
||||
import numpy
|
||||
import numpy.random
|
||||
from thinc.api import CupyOps, Model, NumpyOps
|
||||
from thinc.api import CupyOps, Model, NumpyOps, get_ops
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
|
@ -79,66 +78,48 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
|
|||
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
else:
|
||||
A.token_ids = <int*>realloc(
|
||||
A.token_ids, n.states * n.feats * sizeof(A.token_ids[0])
|
||||
)
|
||||
A.scores = <float*>realloc(
|
||||
A.scores, n.states * n.classes * sizeof(A.scores[0])
|
||||
)
|
||||
A.unmaxed = <float*>realloc(
|
||||
A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])
|
||||
)
|
||||
A.hiddens = <float*>realloc(
|
||||
A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
|
||||
)
|
||||
A.is_valid = <int*>realloc(
|
||||
A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
|
||||
)
|
||||
A.token_ids = <int*>realloc(A.token_ids,
|
||||
n.states * n.feats * sizeof(A.token_ids[0]))
|
||||
A.scores = <float*>realloc(A.scores,
|
||||
n.states * n.classes * sizeof(A.scores[0]))
|
||||
A.unmaxed = <float*>realloc(A.unmaxed,
|
||||
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>realloc(A.hiddens,
|
||||
n.states * n.hiddens * sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>realloc(A.is_valid,
|
||||
n.states * n.classes * sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
A._curr_size = n.states
|
||||
|
||||
|
||||
cdef void predict_states(
|
||||
CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
|
||||
) nogil:
|
||||
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil:
|
||||
resize_activations(A, n)
|
||||
for i in range(n.states):
|
||||
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
|
||||
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
|
||||
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
|
||||
sum_state_features(
|
||||
cblas,
|
||||
A.unmaxed,
|
||||
W.feat_weights,
|
||||
A.token_ids,
|
||||
n.states,
|
||||
n.feats,
|
||||
n.hiddens * n.pieces
|
||||
)
|
||||
sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states,
|
||||
n.feats, n.hiddens * n.pieces)
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(
|
||||
&A.unmaxed[i*n.hiddens*n.pieces],
|
||||
W.feat_bias, 1.,
|
||||
n.hiddens * n.pieces
|
||||
)
|
||||
saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1,
|
||||
&A.unmaxed[i*n.hiddens*n.pieces], 1)
|
||||
for j in range(n.hiddens):
|
||||
index = i * n.hiddens * n.pieces + j * n.pieces
|
||||
which = Vec.arg_max(&A.unmaxed[index], n.pieces)
|
||||
which = _arg_max(&A.unmaxed[index], n.pieces)
|
||||
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
|
||||
memset(A.scores, 0, n.states * n.classes * sizeof(float))
|
||||
if W.hidden_weights == NULL:
|
||||
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
|
||||
else:
|
||||
# Compute hidden-to-output
|
||||
sgemm(cblas)(
|
||||
False, True, n.states, n.classes, n.hiddens,
|
||||
1.0, <const float *>A.hiddens, n.hiddens,
|
||||
<const float *>W.hidden_weights, n.hiddens,
|
||||
0.0, A.scores, n.classes
|
||||
)
|
||||
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, 1.0,
|
||||
<const float *>A.hiddens, n.hiddens,
|
||||
<const float *>W.hidden_weights, n.hiddens, 0.0,
|
||||
A.scores, n.classes)
|
||||
# Add bias
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes)
|
||||
saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
|
||||
# Set unseen classes to minimum value
|
||||
i = 0
|
||||
min_ = A.scores[0]
|
||||
|
@ -151,15 +132,8 @@ cdef void predict_states(
|
|||
A.scores[i*n.classes+j] = min_
|
||||
|
||||
|
||||
cdef void sum_state_features(
|
||||
CBlas cblas,
|
||||
float* output,
|
||||
const float* cached,
|
||||
const int* token_ids,
|
||||
int B,
|
||||
int F,
|
||||
int O
|
||||
) nogil:
|
||||
cdef void sum_state_features(CBlas cblas, float* output, const float* cached,
|
||||
const int* token_ids, int B, int F, int O) nogil:
|
||||
cdef int idx, b, f
|
||||
cdef const float* feature
|
||||
padding = cached
|
||||
|
@ -177,17 +151,13 @@ cdef void sum_state_features(
|
|||
token_ids += F
|
||||
|
||||
|
||||
cdef void cpu_log_loss(
|
||||
float* d_scores,
|
||||
const float* costs,
|
||||
const int* is_valid,
|
||||
const float* scores,
|
||||
int O
|
||||
) nogil:
|
||||
cdef void cpu_log_loss(float* d_scores, const float* costs, const int* is_valid,
|
||||
const float* scores, int O) nogil:
|
||||
"""Do multi-label log loss"""
|
||||
cdef double max_, gmax, Z, gZ
|
||||
best = arg_max_if_gold(scores, costs, is_valid, O)
|
||||
guess = Vec.arg_max(scores, O)
|
||||
guess = _arg_max(scores, O)
|
||||
|
||||
if best == -1 or guess == -1:
|
||||
# These shouldn't happen, but if they do, we want to make sure we don't
|
||||
# cause an OOB access.
|
||||
|
@ -207,9 +177,8 @@ cdef void cpu_log_loss(
|
|||
d_scores[i] = exp(scores[i]-max_) / Z
|
||||
|
||||
|
||||
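A reading of the loss above, assuming the usual softmax cross-entropy over valid classes with the gold mass restricted to zero-cost classes (only the model-probability term is visible in this hunk):

    \[ p_i = \frac{e^{s_i - \max_j s_j}}{Z}, \qquad Z = \sum_k e^{s_k - \max_j s_j}, \qquad \frac{\partial L}{\partial s_i} = p_i - g_i \]

where g_i is the analogous softmax computed over the zero-cost ("gold-compatible") classes only, and 0 for classes with non-zero cost.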
cdef int arg_max_if_gold(
|
||||
const weight_t* scores, const weight_t* costs, const int* is_valid, int n
|
||||
) nogil:
|
||||
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
|
||||
const int* is_valid, int n) nogil:
|
||||
# Find minimum cost
|
||||
cdef float cost = 1
|
||||
for i in range(n):
|
||||
|
@ -234,16 +203,8 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
|
|||
|
||||
|
||||
class ParserStepModel(Model):
|
||||
def __init__(
|
||||
self,
|
||||
docs,
|
||||
layers,
|
||||
*,
|
||||
has_upper,
|
||||
unseen_classes=None,
|
||||
train=True,
|
||||
dropout=0.1
|
||||
):
|
||||
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
|
||||
dropout=0.1):
|
||||
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
||||
self.attrs["has_upper"] = has_upper
|
||||
self.attrs["dropout_rate"] = dropout
|
||||
|
@ -304,10 +265,8 @@ class ParserStepModel(Model):
|
|||
return ids
|
||||
|
||||
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
|
||||
if (
|
||||
isinstance(self.state2vec.ops, CupyOps)
|
||||
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
|
||||
):
|
||||
if isinstance(self.state2vec.ops, CupyOps) \
|
||||
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
||||
# Move token_ids and d_vector to GPU, asynchronously
|
||||
self.backprops.append((
|
||||
util.get_async(self.cuda_stream, token_ids),
|
||||
|
@ -350,7 +309,7 @@ def step_forward(model: ParserStepModel, states, is_train):
|
|||
scores, get_d_vector = model.vec2scores(vector, is_train)
|
||||
else:
|
||||
scores = NumpyOps().asarray(vector)
|
||||
get_d_vector = lambda d_scores: d_scores # no-cython-lint: E731
|
||||
def get_d_vector(d_scores): return d_scores
|
||||
# If the class is unseen, make sure its score is minimum
|
||||
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
|
||||
|
||||
|
@ -386,6 +345,7 @@ cdef class precompute_hiddens:
|
|||
cdef bint _is_synchronized
|
||||
cdef public object ops
|
||||
cdef public object numpy_ops
|
||||
cdef public object _cpu_ops
|
||||
cdef np.ndarray _features
|
||||
cdef np.ndarray _cached
|
||||
cdef np.ndarray bias
|
||||
|
@ -416,6 +376,7 @@ cdef class precompute_hiddens:
|
|||
self.nO = cached.shape[2]
|
||||
self.ops = lower_model.ops
|
||||
self.numpy_ops = NumpyOps()
|
||||
self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
|
||||
assert activation in (None, "relu", "maxout")
|
||||
self.activation = activation
|
||||
self._is_synchronized = False
|
||||
|
@ -478,19 +439,13 @@ cdef class precompute_hiddens:
|
|||
# - Output from backward on GPU
|
||||
bp_hiddens = self._bp_hiddens
|
||||
|
||||
cdef CBlas cblas
|
||||
if isinstance(self.ops, CupyOps):
|
||||
cblas = NUMPY_OPS.cblas()
|
||||
else:
|
||||
cblas = self.ops.cblas()
|
||||
cdef CBlas cblas = self._cpu_ops.cblas()
|
||||
|
||||
feat_weights = self.get_feat_weights()
|
||||
cdef int[:, ::1] ids = token_ids
|
||||
sum_state_features(
|
||||
cblas, <float*>state_vector.data,
|
||||
feat_weights, &ids[0, 0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP
|
||||
)
|
||||
sum_state_features(cblas, <float*>state_vector.data,
|
||||
feat_weights, &ids[0, 0], token_ids.shape[0],
|
||||
self.nF, self.nO*self.nP)
|
||||
state_vector += self.bias
|
||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||
|
||||
|
@ -531,3 +486,15 @@ cdef class precompute_hiddens:
|
|||
return d_best.reshape((d_best.shape + (1,)))
|
||||
|
||||
return state_vector, backprop_relu
|
||||
|
||||
cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
|
||||
if n_classes == 2:
|
||||
return 0 if scores[0] > scores[1] else 1
|
||||
cdef int i
|
||||
cdef int best = 0
|
||||
cdef float mode = scores[0]
|
||||
for i in range(1, n_classes):
|
||||
if scores[i] > mode:
|
||||
mode = scores[i]
|
||||
best = i
|
||||
return best
|
||||
|
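For reference, a plain-Python rendering of the `_arg_max` helper introduced above (the Cython version adds a fast path for n_classes == 2):

    def arg_max(scores):
        # linear scan; ties keep the earliest index
        best, mode = 0, scores[0]
        for i, s in enumerate(scores[1:], start=1):
            if s > mode:
                mode, best = s, i
        return best

    assert arg_max([0.1, 0.9, 0.3]) == 1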
|
|
@ -1,27 +1,41 @@
|
|||
cimport numpy as np
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdint cimport uint64_t
|
||||
from preshed.maps cimport PreshMap
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libcpp.memory cimport shared_ptr
|
||||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from .strings cimport StringStore
|
||||
from .structs cimport MorphAnalysisC
|
||||
from .typedefs cimport attr_t, hash_t
|
||||
|
||||
|
||||
cdef cppclass Feature:
|
||||
hash_t field
|
||||
hash_t value
|
||||
|
||||
__init__():
|
||||
this.field = 0
|
||||
this.value = 0
|
||||
|
||||
|
||||
cdef cppclass MorphAnalysisC:
|
||||
hash_t key
|
||||
vector[Feature] features
|
||||
|
||||
__init__():
|
||||
this.key = 0
|
||||
|
||||
cdef class Morphology:
|
||||
cdef readonly Pool mem
|
||||
cdef readonly StringStore strings
|
||||
cdef PreshMap tags # Keyed by hash, value is pointer to tag
|
||||
cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
|
||||
|
||||
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
|
||||
cdef int insert(self, MorphAnalysisC tag) except -1
|
||||
cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
|
||||
cdef void _intern_morph_tag(self, hash_t tag_key, feats)
|
||||
cdef hash_t _add(self, features)
|
||||
cdef str _normalize_features(self, features)
|
||||
cdef str get_morph_str(self, hash_t morph_key)
|
||||
cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
|
||||
|
||||
|
||||
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
|
||||
cdef list list_features(const MorphAnalysisC* morph)
|
||||
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
|
||||
cdef int get_n_by_field(
|
||||
attr_t* results,
|
||||
const MorphAnalysisC* morph,
|
||||
attr_t field,
|
||||
) nogil
|
||||
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
|
||||
cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
|
||||
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
|
||||
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
|
||||
|
|
|
@ -1,14 +1,15 @@
|
|||
# cython: infer_types
|
||||
# cython: profile=False
|
||||
import warnings
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy
|
||||
|
||||
from .attrs cimport POS
|
||||
from cython.operator cimport dereference as deref
|
||||
from libcpp.memory cimport shared_ptr
|
||||
|
||||
from . import symbols
|
||||
from .errors import Warnings
|
||||
from .parts_of_speech import IDS as POS_IDS
|
||||
|
||||
|
||||
cdef class Morphology:
|
||||
|
@ -26,135 +27,185 @@ cdef class Morphology:
|
|||
EMPTY_MORPH = symbols.NAMES[symbols._]
|
||||
|
||||
def __init__(self, StringStore strings):
|
||||
self.mem = Pool()
|
||||
self.strings = strings
|
||||
self.tags = PreshMap()
|
||||
|
||||
def __reduce__(self):
|
||||
tags = set([self.get(self.strings[s]) for s in self.strings])
|
||||
tags -= set([""])
|
||||
return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
|
||||
|
||||
def add(self, features):
|
||||
cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
|
||||
match = self.tags.find(tag_hash)
|
||||
if match != self.tags.const_end():
|
||||
return deref(match).second
|
||||
else:
|
||||
return shared_ptr[MorphAnalysisC]()
|
||||
|
||||
def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
|
||||
if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
|
||||
attr_key = self.strings.as_string(attr_key)
|
||||
attr_value = self.strings.as_string(attr_value)
|
||||
|
||||
# Preserve multiple values as a list
|
||||
if self.VALUE_SEP in attr_value:
|
||||
values = attr_value.split(self.VALUE_SEP)
|
||||
values.sort()
|
||||
attr_value = values
|
||||
else:
|
||||
warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
|
||||
return None
|
||||
|
||||
return attr_key, attr_value
|
||||
|
||||
def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
|
||||
if not feats or feats == self.EMPTY_MORPH:
|
||||
return {}
|
||||
|
||||
out = []
|
||||
for feat in feats.split(self.FEATURE_SEP):
|
||||
field, values = feat.split(self.FIELD_SEP, 1)
|
||||
normalized_attr = self._normalize_attr(field, values)
|
||||
if normalized_attr is None:
|
||||
continue
|
||||
out.append((normalized_attr[0], normalized_attr[1]))
|
||||
out.sort(key=lambda x: x[0])
|
||||
return dict(out)
|
||||
|
||||
def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
|
||||
out = []
|
||||
for field, values in feats.items():
|
||||
normalized_attr = self._normalize_attr(field, values)
|
||||
if normalized_attr is None:
|
||||
continue
|
||||
out.append((normalized_attr[0], normalized_attr[1]))
|
||||
out.sort(key=lambda x: x[0])
|
||||
return dict(out)
|
||||
|
||||
def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
|
||||
norm_feats_string = self.FEATURE_SEP.join([
|
||||
self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
|
||||
for field, values in feats.items()
|
||||
])
|
||||
return norm_feats_string or self.EMPTY_MORPH
|
||||
|
||||
cdef hash_t _add(self, features):
|
||||
"""Insert a morphological analysis in the morphology table, if not
|
||||
already present. The morphological analysis may be provided in the UD
|
||||
FEATS format as a string or in the tag map dict format.
|
||||
Returns the hash of the new analysis.
|
||||
"""
|
||||
cdef MorphAnalysisC* tag_ptr
|
||||
cdef hash_t tag_hash = 0
|
||||
cdef shared_ptr[MorphAnalysisC] tag
|
||||
if isinstance(features, str):
|
||||
if features == "":
|
||||
features = self.EMPTY_MORPH
|
||||
tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
|
||||
if tag_ptr != NULL:
|
||||
return tag_ptr.key
|
||||
features = self.feats_to_dict(features)
|
||||
if not isinstance(features, dict):
|
||||
|
||||
tag_hash = self.strings[features]
|
||||
tag = self._lookup_tag(tag_hash)
|
||||
if tag:
|
||||
return deref(tag).key
|
||||
|
||||
features = self._str_to_normalized_feat_dict(features)
|
||||
elif isinstance(features, dict):
|
||||
features = self._dict_to_normalized_feat_dict(features)
|
||||
else:
|
||||
warnings.warn(Warnings.W100.format(feature=features))
|
||||
features = {}
|
||||
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
|
||||
# intified ("Field", "Field=Value") pairs
|
||||
field_feature_pairs = []
|
||||
for field in sorted(string_features):
|
||||
values = string_features[field]
|
||||
for value in values.split(self.VALUE_SEP):
|
||||
field_feature_pairs.append((
|
||||
self.strings.add(field),
|
||||
self.strings.add(field + self.FIELD_SEP + value),
|
||||
))
|
||||
cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
|
||||
|
||||
# the hash key for the tag is either the hash of the normalized UFEATS
|
||||
# string or the hash of an empty placeholder
|
||||
norm_feats_string = self.normalize_features(features)
|
||||
tag.key = self.strings.add(norm_feats_string)
|
||||
self.insert(tag)
|
||||
return tag.key
|
||||
norm_feats_string = self._normalized_feat_dict_to_str(features)
|
||||
tag_hash = self.strings.add(norm_feats_string)
|
||||
tag = self._lookup_tag(tag_hash)
|
||||
if tag:
|
||||
return deref(tag).key
|
||||
|
||||
def normalize_features(self, features):
|
||||
self._intern_morph_tag(tag_hash, features)
|
||||
return tag_hash
|
||||
|
||||
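A hedged usage sketch of the `add`/`_add` path described above: the same analysis given as a UD FEATS string or as an unordered dict should intern to the same tag hash (built on a bare StringStore purely for illustration):

    from spacy.morphology import Morphology
    from spacy.strings import StringStore

    morphology = Morphology(StringStore())
    key_from_str = morphology.add("Case=Nom|Number=Sing")
    key_from_dict = morphology.add({"Number": "Sing", "Case": "Nom"})
    assert key_from_str == key_from_dict
    assert morphology.get(key_from_str) == "Case=Nom|Number=Sing"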
cdef void _intern_morph_tag(self, hash_t tag_key, feats):
|
||||
# intified ("Field", "Field=Value") pairs where fields with multiple values have
|
||||
# been split into individual tuples, e.g.:
|
||||
# [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
|
||||
# ("Field2", "Field2=Value3")]
|
||||
field_feature_pairs = []
|
||||
|
||||
# Feat dict is normalized at this point.
|
||||
for field, values in feats.items():
|
||||
field_key = self.strings.add(field)
|
||||
if isinstance(values, list):
|
||||
for value in values:
|
||||
value_key = self.strings.add(field + self.FIELD_SEP + value)
|
||||
field_feature_pairs.append((field_key, value_key))
|
||||
else:
|
||||
# We could box scalar values into a list and use a common
|
||||
# code path to generate features but that incurs a small
|
||||
# but measurable allocation/iteration overhead (as this
|
||||
# branch is taken often enough).
|
||||
value_key = self.strings.add(field + self.FIELD_SEP + values)
|
||||
field_feature_pairs.append((field_key, value_key))
|
||||
|
||||
num_features = len(field_feature_pairs)
|
||||
cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
|
||||
deref(tag).key = tag_key
|
||||
deref(tag).features.resize(num_features)
|
||||
|
||||
for i in range(num_features):
|
||||
deref(tag).features[i].field = field_feature_pairs[i][0]
|
||||
deref(tag).features[i].value = field_feature_pairs[i][1]
|
||||
|
||||
self.tags[tag_key] = tag
|
||||
|
||||
cdef str get_morph_str(self, hash_t morph_key):
|
||||
cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
|
||||
if not tag:
|
||||
return ""
|
||||
else:
|
||||
return self.strings[deref(tag).key]
|
||||
|
||||
cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
|
||||
return self._lookup_tag(morph_key)
|
||||
|
||||
cdef str _normalize_features(self, features):
|
||||
"""Create a normalized FEATS string from a features string or dict.
|
||||
|
||||
features (Union[dict, str]): Features as dict or UFEATS string.
|
||||
RETURNS (str): Features as normalized UFEATS string.
|
||||
"""
|
||||
if isinstance(features, str):
|
||||
features = self.feats_to_dict(features)
|
||||
if not isinstance(features, dict):
|
||||
features = self._str_to_normalized_feat_dict(features)
|
||||
elif isinstance(features, dict):
|
||||
features = self._dict_to_normalized_feat_dict(features)
|
||||
else:
|
||||
warnings.warn(Warnings.W100.format(feature=features))
|
||||
features = {}
|
||||
features = self.normalize_attrs(features)
|
||||
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
|
||||
# normalized UFEATS string with sorted fields and values
|
||||
norm_feats_string = self.FEATURE_SEP.join(
|
||||
sorted(
|
||||
[self.FIELD_SEP.join([field, values]) for field, values in string_features.items()]
|
||||
)
|
||||
)
|
||||
return norm_feats_string or self.EMPTY_MORPH
|
||||
|
||||
def normalize_attrs(self, attrs):
|
||||
"""Convert attrs dict so that POS is always by ID, other features are
|
||||
by string. Values separated by VALUE_SEP are sorted.
|
||||
"""
|
||||
out = {}
|
||||
attrs = dict(attrs)
|
||||
for key, value in attrs.items():
|
||||
# convert POS value to ID
|
||||
if key == POS or (isinstance(key, str) and key.upper() == "POS"):
|
||||
if isinstance(value, str) and value.upper() in POS_IDS:
|
||||
value = POS_IDS[value.upper()]
|
||||
elif isinstance(value, int) and value not in POS_IDS.values():
|
||||
warnings.warn(Warnings.W100.format(feature={key: value}))
|
||||
continue
|
||||
out[POS] = value
|
||||
# accept any string or ID fields and values and convert to strings
|
||||
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
|
||||
key = self.strings.as_string(key)
|
||||
value = self.strings.as_string(value)
|
||||
# sort values
|
||||
if self.VALUE_SEP in value:
|
||||
value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
|
||||
out[key] = value
|
||||
else:
|
||||
warnings.warn(Warnings.W100.format(feature={key: value}))
|
||||
return out
|
||||
return self._normalized_feat_dict_to_str(features)
|
||||
|
||||
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
|
||||
"""Creates a MorphAnalysisC from a list of intified
|
||||
("Field", "Field=Value") tuples where fields with multiple values have
|
||||
been split into individual tuples, e.g.:
|
||||
[("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
|
||||
("Field2", "Field2=Value3")]
|
||||
"""
|
||||
cdef MorphAnalysisC tag
|
||||
tag.length = len(field_feature_pairs)
|
||||
if tag.length > 0:
|
||||
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
|
||||
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
|
||||
for i, (field, feature) in enumerate(field_feature_pairs):
|
||||
tag.fields[i] = field
|
||||
tag.features[i] = feature
|
||||
return tag
|
||||
def add(self, features):
|
||||
return self._add(features)
|
||||
|
||||
cdef int insert(self, MorphAnalysisC tag) except -1:
|
||||
cdef hash_t key = tag.key
|
||||
if self.tags.get(key) == NULL:
|
||||
tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||
tag_ptr[0] = tag
|
||||
self.tags.set(key, <void*>tag_ptr)
|
||||
def get(self, morph_key):
|
||||
return self.get_morph_str(morph_key)
|
||||
|
||||
def get(self, hash_t morph):
|
||||
tag = <MorphAnalysisC*>self.tags.get(morph)
|
||||
if tag == NULL:
|
||||
return ""
|
||||
else:
|
||||
return self.strings[tag.key]
|
||||
def normalize_features(self, features):
|
||||
return self._normalize_features(features)
|
||||
|
||||
@staticmethod
|
||||
def feats_to_dict(feats):
|
||||
def feats_to_dict(feats, *, sort_values=True):
|
||||
if not feats or feats == Morphology.EMPTY_MORPH:
|
||||
return {}
|
||||
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
|
||||
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
|
||||
|
||||
out = {}
|
||||
for feat in feats.split(Morphology.FEATURE_SEP):
|
||||
field, values = feat.split(Morphology.FIELD_SEP, 1)
|
||||
if sort_values:
|
||||
values = values.split(Morphology.VALUE_SEP)
|
||||
values.sort()
|
||||
values = Morphology.VALUE_SEP.join(values)
|
||||
|
||||
out[field] = values
|
||||
return out
|
||||
|
||||
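A quick illustration of `feats_to_dict` as rewritten above: with the default sort_values=True multi-values are reordered, with sort_values=False they are kept as given:

    from spacy.morphology import Morphology

    assert Morphology.feats_to_dict("Case=Nom|PronType=Rel,Prs") == {
        "Case": "Nom",
        "PronType": "Prs,Rel",
    }
    assert Morphology.feats_to_dict("PronType=Rel,Prs", sort_values=False) == {
        "PronType": "Rel,Prs",
    }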
@staticmethod
|
||||
def dict_to_feats(feats_dict):
|
||||
|
@ -163,34 +214,34 @@ cdef class Morphology:
|
|||
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
|
||||
|
||||
|
||||
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
|
||||
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
|
||||
cdef int i
|
||||
for i in range(morph.length):
|
||||
if morph.features[i] == feature:
|
||||
for i in range(deref(morph).features.size()):
|
||||
if deref(morph).features[i].value == feature:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
cdef list list_features(const MorphAnalysisC* morph):
|
||||
cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
|
||||
cdef int i
|
||||
features = []
|
||||
for i in range(morph.length):
|
||||
features.append(morph.features[i])
|
||||
for i in range(deref(morph).features.size()):
|
||||
features.append(deref(morph).features[i].value)
|
||||
return features
|
||||
|
||||
|
||||
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
|
||||
cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
|
||||
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
|
||||
cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
|
||||
n = get_n_by_field(<uint64_t*>results.data, morph, field)
|
||||
return results[:n]
|
||||
|
||||
|
||||
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
|
||||
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
|
||||
cdef int n_results = 0
|
||||
cdef int i
|
||||
for i in range(morph.length):
|
||||
if morph.fields[i] == field:
|
||||
results[n_results] = morph.features[i]
|
||||
for i in range(deref(morph).features.size()):
|
||||
if deref(morph).features[i].field == field:
|
||||
results[n_results] = deref(morph).features[i].value
|
||||
n_results += 1
|
||||
return n_results
|
||||
|
||||
|
|
|
@ -4,22 +4,22 @@ from . cimport symbols
|
|||
cpdef enum univ_pos_t:
|
||||
NO_TAG = 0
|
||||
ADJ = symbols.ADJ
|
||||
ADP
|
||||
ADV
|
||||
AUX
|
||||
CONJ
|
||||
CCONJ # U20
|
||||
DET
|
||||
INTJ
|
||||
NOUN
|
||||
NUM
|
||||
PART
|
||||
PRON
|
||||
PROPN
|
||||
PUNCT
|
||||
SCONJ
|
||||
SYM
|
||||
VERB
|
||||
X
|
||||
EOL
|
||||
SPACE
|
||||
ADP = symbols.ADP
|
||||
ADV = symbols.ADV
|
||||
AUX = symbols.AUX
|
||||
CONJ = symbols.CONJ
|
||||
CCONJ = symbols.CCONJ # U20
|
||||
DET = symbols.DET
|
||||
INTJ = symbols.INTJ
|
||||
NOUN = symbols.NOUN
|
||||
NUM = symbols.NUM
|
||||
PART = symbols.PART
|
||||
PRON = symbols.PRON
|
||||
PROPN = symbols.PROPN
|
||||
PUNCT = symbols.PUNCT
|
||||
SCONJ = symbols.SCONJ
|
||||
SYM = symbols.SYM
|
||||
VERB = symbols.VERB
|
||||
X = symbols.X
|
||||
EOL = symbols.EOL
|
||||
SPACE = symbols.SPACE
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
from .attributeruler import AttributeRuler
|
||||
from .attribute_ruler import AttributeRuler
|
||||
from .dep_parser import DependencyParser
|
||||
from .edit_tree_lemmatizer import EditTreeLemmatizer
|
||||
from .entity_linker import EntityLinker
|
||||
from .entityruler import EntityRuler
|
||||
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .morphologizer import Morphologizer
|
||||
|
@ -25,7 +24,6 @@ __all__ = [
|
|||
"EditTreeLemmatizer",
|
||||
"EntityLinker",
|
||||
"EntityRecognizer",
|
||||
"EntityRuler",
|
||||
"Morphologizer",
|
||||
"Lemmatizer",
|
||||
"MultiLabel_TextCategorizer",
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from ...typedefs cimport class_t, hash_t
|
||||
|
||||
|
||||
# These are passed as callbacks to thinc.search.Beam
|
||||
# These are passed as callbacks to .search.Beam
|
||||
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
|
||||
|
||||
cdef int check_final_state(void* _state, void* extra_args) except -1
|
||||
|
|
|
@ -1,21 +1,18 @@
|
|||
# cython: infer_types=True
|
||||
import numpy
|
||||
|
||||
from thinc.extra.search cimport Beam
|
||||
|
||||
from thinc.extra.search import MaxViolation
|
||||
|
||||
from thinc.extra.search cimport MaxViolation
|
||||
|
||||
from ...typedefs cimport class_t
|
||||
from .transition_system cimport Transition, TransitionSystem
|
||||
|
||||
from ...errors import Errors
|
||||
from .search cimport Beam, MaxViolation
|
||||
|
||||
from .search import MaxViolation
|
||||
|
||||
from .stateclass cimport StateC, StateClass
|
||||
|
||||
|
||||
# These are passed as callbacks to thinc.search.Beam
|
||||
# These are passed as callbacks to .search.Beam
|
||||
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||
dest = <StateC*>_dest
|
||||
src = <StateC*>_src
|
||||
|
|
|
@ -19,7 +19,7 @@ from .stateclass cimport StateClass
|
|||
|
||||
from ...errors import Errors
|
||||
|
||||
from thinc.extra.search cimport Beam
|
||||
from .search cimport Beam
|
||||
|
||||
|
||||
cdef weight_t MIN_SCORE = -90000
|
||||
|
|
|
@ -1,11 +1,10 @@
|
|||
# cython: profile=False
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdint cimport int32_t
|
||||
from libcpp.memory cimport shared_ptr
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from collections import Counter
|
||||
|
||||
from thinc.extra.search cimport Beam
|
||||
|
||||
from ...tokens.doc cimport Doc
|
||||
|
||||
from ...tokens.span import Span
|
||||
|
@ -20,6 +19,7 @@ from ...training import split_bilu_label
|
|||
|
||||
from ...training.example cimport Example
|
||||
from ._state cimport StateC
|
||||
from .search cimport Beam
|
||||
from .stateclass cimport StateClass
|
||||
from .transition_system cimport Transition, do_func_t
|
||||
|
||||
|
@ -47,9 +47,7 @@ MOVE_NAMES[OUT] = 'O'
|
|||
|
||||
cdef struct GoldNERStateC:
|
||||
Transition* ner
|
||||
SpanC* negs
|
||||
int32_t length
|
||||
int32_t nr_neg
|
||||
vector[shared_ptr[SpanC]] negs
|
||||
|
||||
|
||||
cdef class BiluoGold:
|
||||
|
@ -82,8 +80,6 @@ cdef GoldNERStateC create_gold_state(
|
|||
negs = []
|
||||
assert example.x.length > 0
|
||||
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
|
||||
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
|
||||
gs.nr_neg = len(negs)
|
||||
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
|
||||
for i, ner_tag in enumerate(ner_tags):
|
||||
gs.ner[i] = moves.lookup_transition(ner_tag)
|
||||
|
@ -97,8 +93,8 @@ cdef GoldNERStateC create_gold_state(
|
|||
# In order to handle negative samples, we need to maintain the full
|
||||
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
|
||||
# thing, we'll get blocked if there's an incorrect prefix.
|
||||
for i, neg in enumerate(negs):
|
||||
gs.negs[i] = neg.c
|
||||
for neg in negs:
|
||||
gs.negs.push_back(neg.c)
|
||||
return gs
|
||||
|
||||
|
||||
|
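An illustration of the point in the comment above (token indices and label are made up): keeping the full negative triple avoids over-blocking prefixes that could still lead to a correct longer entity.

    # "doc[2:5] is NOT a LOC" cannot be reduced to "token 2 isn't B-LOC":
    negative_span = (2, 5, "LOC")   # this exact span is known to be wrong
    still_allowed = (2, 6, "LOC")   # a longer LOC starting at token 2 may be right,
                                    # so B-LOC at token 2 must stay available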
@ -413,6 +409,8 @@ cdef class Begin:
|
|||
cdef int g_act = gold.ner[b0].move
|
||||
cdef attr_t g_tag = gold.ner[b0].label
|
||||
|
||||
cdef shared_ptr[SpanC] span
|
||||
|
||||
if g_act == MISSING:
|
||||
pass
|
||||
elif g_act == BEGIN:
|
||||
|
@ -430,8 +428,8 @@ cdef class Begin:
|
|||
# be correct or not. However, we can at least tell whether we're
|
||||
# going to be opening an entity where there's only one possible
|
||||
# L.
|
||||
for span in gold.negs[:gold.nr_neg]:
|
||||
if span.label == label and span.start == b0:
|
||||
for span in gold.negs:
|
||||
if span.get().label == label and span.get().start == b0:
|
||||
cost += 1
|
||||
break
|
||||
return cost
|
||||
|
@ -572,8 +570,9 @@ cdef class Last:
|
|||
# If we have negative-example entities, integrate them into the objective,
|
||||
# by marking actions that close an entity that we know is incorrect
|
||||
# as costly.
|
||||
for span in gold.negs[:gold.nr_neg]:
|
||||
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
|
||||
cdef shared_ptr[SpanC] span
|
||||
for span in gold.negs:
|
||||
if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
|
||||
cost += 1
|
||||
break
|
||||
return cost
|
||||
|
@ -637,8 +636,9 @@ cdef class Unit:
|
|||
# This is fairly straight-forward for U- entities, as we have a single
|
||||
# action
|
||||
cdef int b0 = s.B(0)
|
||||
for span in gold.negs[:gold.nr_neg]:
|
||||
if span.label == label and span.start == b0 and span.end == (b0+1):
|
||||
cdef shared_ptr[SpanC] span
|
||||
for span in gold.negs:
|
||||
if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
|
||||
cost += 1
|
||||
break
|
||||
return cost
|
||||
|
|
86
spacy/pipeline/_parser_internals/search.pxd
Normal file
|
@@ -0,0 +1,86 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libcpp.pair cimport pair
|
||||
from libcpp.queue cimport priority_queue
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from ...typedefs cimport class_t, hash_t, weight_t
|
||||
|
||||
ctypedef pair[weight_t, size_t] Entry
|
||||
ctypedef priority_queue[Entry] Queue
|
||||
|
||||
|
||||
ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1
|
||||
|
||||
ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL
|
||||
|
||||
ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1
|
||||
|
||||
ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1
|
||||
|
||||
ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0
|
||||
|
||||
|
||||
cdef struct _State:
|
||||
void* content
|
||||
class_t* hist
|
||||
weight_t score
|
||||
weight_t loss
|
||||
int i
|
||||
int t
|
||||
bint is_done
|
||||
|
||||
|
||||
cdef class Beam:
|
||||
cdef Pool mem
|
||||
cdef class_t nr_class
|
||||
cdef class_t width
|
||||
cdef class_t size
|
||||
cdef public weight_t min_density
|
||||
cdef int t
|
||||
cdef readonly bint is_done
|
||||
cdef list histories
|
||||
cdef list _parent_histories
|
||||
cdef weight_t** scores
|
||||
cdef int** is_valid
|
||||
cdef weight_t** costs
|
||||
cdef _State* _parents
|
||||
cdef _State* _states
|
||||
cdef del_func_t del_func
|
||||
|
||||
cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1
|
||||
|
||||
cdef inline void* at(self, int i) nogil:
|
||||
return self._states[i].content
|
||||
|
||||
cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1
|
||||
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
|
||||
void* extra_args) except -1
|
||||
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
|
||||
|
||||
cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
|
||||
self.scores[i][j] = score
|
||||
self.is_valid[i][j] = is_valid
|
||||
self.costs[i][j] = cost
|
||||
|
||||
cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
|
||||
const weight_t* costs) except -1
|
||||
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1
|
||||
|
||||
|
||||
cdef class MaxViolation:
|
||||
cdef Pool mem
|
||||
cdef weight_t cost
|
||||
cdef weight_t delta
|
||||
cdef readonly weight_t p_score
|
||||
cdef readonly weight_t g_score
|
||||
cdef readonly double Z
|
||||
cdef readonly double gZ
|
||||
cdef class_t n
|
||||
cdef readonly list p_hist
|
||||
cdef readonly list g_hist
|
||||
cdef readonly list p_probs
|
||||
cdef readonly list g_probs
|
||||
|
||||
cpdef int check(self, Beam pred, Beam gold) except -1
|
||||
cpdef int check_crf(self, Beam pred, Beam gold) except -1
|
303
spacy/pipeline/_parser_internals/search.pyx
Normal file
|
@@ -0,0 +1,303 @@
|
|||
# cython: experimental_cpp_class_def=True, cdivision=True, infer_types=True
|
||||
cimport cython
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.math cimport exp
|
||||
from libc.string cimport memcpy, memset
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
|
||||
cdef class Beam:
|
||||
def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0):
|
||||
assert nr_class != 0
|
||||
assert width != 0
|
||||
self.nr_class = nr_class
|
||||
self.width = width
|
||||
self.min_density = min_density
|
||||
self.size = 1
|
||||
self.t = 0
|
||||
self.mem = Pool()
|
||||
self.del_func = NULL
|
||||
self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State))
|
||||
self._states = <_State*>self.mem.alloc(self.width, sizeof(_State))
|
||||
cdef int i
|
||||
self.histories = [[] for i in range(self.width)]
|
||||
self._parent_histories = [[] for i in range(self.width)]
|
||||
|
||||
self.scores = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
self.is_valid = <int**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
self.costs = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
for i in range(self.width):
|
||||
self.scores[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
|
||||
self.is_valid[i] = <int*>self.mem.alloc(self.nr_class, sizeof(int))
|
||||
self.costs[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
|
||||
|
||||
def __len__(self):
|
||||
return self.size
|
||||
|
||||
property score:
|
||||
def __get__(self):
|
||||
return self._states[0].score
|
||||
|
||||
property min_score:
|
||||
def __get__(self):
|
||||
return self._states[self.size-1].score
|
||||
|
||||
property loss:
|
||||
def __get__(self):
|
||||
return self._states[0].loss
|
||||
|
||||
property probs:
|
||||
def __get__(self):
|
||||
return _softmax([self._states[i].score for i in range(self.size)])
|
||||
|
||||
property scores:
|
||||
def __get__(self):
|
||||
return [self._states[i].score for i in range(self.size)]
|
||||
|
||||
property histories:
|
||||
def __get__(self):
|
||||
return self.histories
|
||||
|
||||
cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
|
||||
const weight_t* costs) except -1:
|
||||
cdef int j
|
||||
for j in range(self.nr_class):
|
||||
self.scores[i][j] = scores[j]
|
||||
self.is_valid[i][j] = is_valid[j]
|
||||
self.costs[i][j] = costs[j]
|
||||
|
||||
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
|
||||
cdef int i
|
||||
for i in range(self.width):
|
||||
memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
|
||||
memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
|
||||
memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class)
|
||||
|
||||
cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1:
|
||||
for i in range(self.width):
|
||||
self._states[i].content = init_func(self.mem, n, extra_args)
|
||||
self._parents[i].content = init_func(self.mem, n, extra_args)
|
||||
self.del_func = del_func
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.del_func == NULL:
|
||||
return
|
||||
|
||||
for i in range(self.width):
|
||||
self.del_func(self.mem, self._states[i].content, NULL)
|
||||
self.del_func(self.mem, self._parents[i].content, NULL)
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
|
||||
void* extra_args) except -1:
|
||||
cdef weight_t** scores = self.scores
|
||||
cdef int** is_valid = self.is_valid
|
||||
cdef weight_t** costs = self.costs
|
||||
|
||||
cdef Queue* q = new Queue()
|
||||
self._fill(q, scores, is_valid)
|
||||
# For a beam of width k, we only ever need 2k state objects. How?
|
||||
# Each transition takes a parent and a class and produces a new state.
|
||||
# So, we don't need the whole history --- just the parent. So at
|
||||
# each step, we take a parent, and apply one or more extensions to
|
||||
# it.
|
||||
self._parents, self._states = self._states, self._parents
|
||||
self._parent_histories, self.histories = self.histories, self._parent_histories
|
||||
cdef weight_t score
|
||||
cdef int p_i
|
||||
cdef int i = 0
|
||||
cdef class_t clas
|
||||
cdef _State* parent
|
||||
cdef _State* state
|
||||
cdef hash_t key
|
||||
cdef PreshMap seen_states = PreshMap(self.width)
|
||||
cdef uint64_t is_seen
|
||||
cdef uint64_t one = 1
|
||||
while i < self.width and not q.empty():
|
||||
data = q.top()
|
||||
p_i = data.second / self.nr_class
|
||||
clas = data.second % self.nr_class
|
||||
score = data.first
|
||||
q.pop()
|
||||
parent = &self._parents[p_i]
|
||||
# Indicates terminal state reached; i.e. state is done
|
||||
if parent.is_done:
|
||||
# Now parent will not be changed, so we don't have to copy.
|
||||
# Once finished, should also be unbranching.
|
||||
self._states[i], parent[0] = parent[0], self._states[i]
|
||||
parent.i = self._states[i].i
|
||||
parent.t = self._states[i].t
|
||||
parent.is_done = self._states[i].t
|
||||
self._states[i].score = score
|
||||
self.histories[i] = list(self._parent_histories[p_i])
|
||||
i += 1
|
||||
else:
|
||||
state = &self._states[i]
|
||||
# The supplied transition function should adjust the destination
|
||||
# state to be the result of applying the class to the source state
|
||||
transition_func(state.content, parent.content, clas, extra_args)
|
||||
key = hash_func(state.content, extra_args) if hash_func is not NULL else 0
|
||||
is_seen = <uint64_t>seen_states.get(key)
|
||||
if key == 0 or key == 1 or not is_seen:
|
||||
if key != 0 and key != 1:
|
||||
seen_states.set(key, <void*>one)
|
||||
state.score = score
|
||||
state.loss = parent.loss + costs[p_i][clas]
|
||||
self.histories[i] = list(self._parent_histories[p_i])
|
||||
self.histories[i].append(clas)
|
||||
i += 1
|
||||
del q
|
||||
self.size = i
|
||||
assert self.size >= 1
|
||||
for i in range(self.width):
|
||||
memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class)
|
||||
memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class)
|
||||
memset(self.is_valid[i], 0, sizeof(int) * self.nr_class)
|
||||
self.t += 1
|
||||
|
||||
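A hedged pure-Python sketch of the double-buffering idea in `advance` above: with beam width k, only 2k state buffers are ever needed, because each step reads from the parent pool, writes into the state pool, and then the two pools swap roles. The toy `apply_transition` here is a stand-in for the supplied transition function, not the real one.

    def apply_transition(parent_state, action):
        # toy transition: a state is just the tuple of actions taken so far
        return parent_state + (action,)

    def beam_step(parents, scored_expansions, k):
        # scored_expansions: (score, parent_index, action) tuples, best-first
        states, scores = [], []
        for score, p_i, action in scored_expansions[:k]:
            states.append(apply_transition(parents[p_i], action))
            scores.append(score)
        # the old parent buffers are reused as scratch space on the next step
        return states, scores

    parents = [(), ()]
    states, scores = beam_step(parents, [(1.5, 0, "SHIFT"), (0.9, 1, "REDUCE")], k=2)
    assert states == [("SHIFT",), ("REDUCE",)]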
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1:
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
if not self._states[i].is_done:
|
||||
self._states[i].is_done = finish_func(self._states[i].content, extra_args)
|
||||
for i in range(self.size):
|
||||
if not self._states[i].is_done:
|
||||
self.is_done = False
|
||||
break
|
||||
else:
|
||||
self.is_done = True
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1:
|
||||
"""Populate the queue from a k * n matrix of scores, where k is the
|
||||
beam-width, and n is the number of classes.
|
||||
"""
|
||||
cdef Entry entry
|
||||
cdef _State* s
|
||||
cdef int i, j, move_id
|
||||
assert self.size >= 1
|
||||
cdef vector[Entry] entries
|
||||
for i in range(self.size):
|
||||
s = &self._states[i]
|
||||
move_id = i * self.nr_class
|
||||
if s.is_done:
|
||||
# Update score by path average, following TACL '13 paper.
|
||||
if self.histories[i]:
|
||||
entry.first = s.score + (s.score / self.t)
|
||||
else:
|
||||
entry.first = s.score
|
||||
entry.second = move_id
|
||||
entries.push_back(entry)
|
||||
else:
|
||||
for j in range(self.nr_class):
|
||||
if is_valid[i][j]:
|
||||
entry.first = s.score + scores[i][j]
|
||||
entry.second = move_id + j
|
||||
entries.push_back(entry)
|
||||
cdef double max_, Z, cutoff
|
||||
if self.min_density == 0.0:
|
||||
for i in range(entries.size()):
|
||||
q.push(entries[i])
|
||||
elif not entries.empty():
|
||||
max_ = entries[0].first
|
||||
Z = 0.
|
||||
cutoff = 0.
|
||||
# Softmax into probabilities, so we can prune
|
||||
for i in range(entries.size()):
|
||||
if entries[i].first > max_:
|
||||
max_ = entries[i].first
|
||||
for i in range(entries.size()):
|
||||
Z += exp(entries[i].first-max_)
|
||||
cutoff = (1. / Z) * self.min_density
|
||||
for i in range(entries.size()):
|
||||
prob = exp(entries[i].first-max_) / Z
|
||||
if prob >= cutoff:
|
||||
q.push(entries[i])
|
||||
|
||||
|
||||
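The pruning arithmetic in `_fill` above simplifies nicely: with cutoff = min_density / Z and prob = exp(s_i - s_max) / Z, an entry is kept iff

    \[ \frac{e^{s_i - s_{\max}}}{Z} \ge \frac{\text{min\_density}}{Z}
       \iff e^{s_i - s_{\max}} \ge \text{min\_density}
       \iff s_{\max} - s_i \le -\ln(\text{min\_density}), \]

so min_density effectively caps the score gap allowed between a candidate and the current best.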
cdef class MaxViolation:
    def __init__(self):
        self.p_score = 0.0
        self.g_score = 0.0
        self.Z = 0.0
        self.gZ = 0.0
        self.delta = -1
        self.cost = 0
        self.p_hist = []
        self.g_hist = []
        self.p_probs = []
        self.g_probs = []

    cpdef int check(self, Beam pred, Beam gold) except -1:
        cdef _State* p = &pred._states[0]
        cdef _State* g = &gold._states[0]
        cdef weight_t d = p.score - g.score
        if p.loss >= 1 and (self.cost == 0 or d > self.delta):
            self.cost = p.loss
            self.delta = d
            self.p_hist = list(pred.histories[0])
            self.g_hist = list(gold.histories[0])
            self.p_score = p.score
            self.g_score = g.score
            self.Z = 1e-10
            self.gZ = 1e-10
            for i in range(pred.size):
                if pred._states[i].loss > 0:
                    self.Z += exp(pred._states[i].score)
            for i in range(gold.size):
                if gold._states[i].loss == 0:
                    prob = exp(gold._states[i].score)
                    self.Z += prob
                    self.gZ += prob

    cpdef int check_crf(self, Beam pred, Beam gold) except -1:
        d = pred.score - gold.score
        seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)])
        if pred.loss > 0 and (self.cost == 0 or d > self.delta):
            p_hist = []
            p_scores = []
            g_hist = []
            g_scores = []
            for i in range(pred.size):
                if pred._states[i].loss > 0:
                    p_scores.append(pred._states[i].score)
                    p_hist.append(list(pred.histories[i]))
                # This can happen from non-monotonic actions
                # If we find a better gold analysis this way, be sure to keep it.
                elif pred._states[i].loss <= 0 \
                        and tuple(pred.histories[i]) not in seen_golds:
                    g_scores.append(pred._states[i].score)
                    g_hist.append(list(pred.histories[i]))
            for i in range(gold.size):
                if gold._states[i].loss == 0:
                    g_scores.append(gold._states[i].score)
                    g_hist.append(list(gold.histories[i]))

            all_probs = _softmax(p_scores + g_scores)
            p_probs = all_probs[:len(p_scores)]
            g_probs_all = all_probs[len(p_scores):]
            g_probs = _softmax(g_scores)

            self.cost = pred.loss
            self.delta = d
            self.p_hist = p_hist
            self.g_hist = g_hist
            # TODO: These variables are misnamed! These are the gradients of the loss.
            self.p_probs = p_probs
            # Intuition here:
            # The gradient of the loss is:
            # P(model) - P(truth)
            # Normally, P(truth) is 1 for the gold
            # But, if we want to do the "partial credit" scheme, we want
            # to create a distribution over the gold, proportional to the scores
            # awarded.
            self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)]


def _softmax(nums):
    if not nums:
        return []
    max_ = max(nums)
    nums = [(exp(n-max_) if n is not None else None) for n in nums]
    Z = sum(n for n in nums if n is not None)
    return [(n/Z if n is not None else None) for n in nums]

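# Illustrative only (not part of the diff): the "partial credit" gradients that
# check_crf assembles, written out with made-up beam scores and the _softmax
# helper above.
p_scores = [1.2, 0.7, 0.1]                    # hypothetical wrong (pred) beams
g_scores = [1.5, 0.4]                         # hypothetical gold beams

all_probs = _softmax(p_scores + g_scores)     # P(model) over every candidate
p_probs = all_probs[:len(p_scores)]           # gradient for the wrong beams
g_probs_all = all_probs[len(p_scores):]       # model mass on the gold beams
g_probs = _softmax(g_scores)                  # partial-credit target distribution

# Gradient of the loss is P(model) - P(truth); for the gold beams the target is
# the partial-credit distribution rather than a one-hot vector.
g_grad = [x - y for x, y in zip(g_probs_all, g_probs)]
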
@@ -9,7 +9,6 @@ from collections import Counter
|
|||
import srsly
|
||||
|
||||
from ...structs cimport TokenC
|
||||
from ...typedefs cimport attr_t, weight_t
|
||||
from .stateclass cimport StateClass
|
||||
|
||||
from ... import util
|
||||
|
|
|
@@ -10,7 +10,7 @@ from ..matcher import Matcher
|
|||
from ..scorer import Scorer
|
||||
from ..symbols import IDS
|
||||
from ..tokens import Doc, Span
|
||||
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
|
||||
from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
|
||||
from ..training import Example
|
||||
from ..util import SimpleFrozenList, registry
|
||||
from ..vocab import Vocab
|
|
@@ -14,8 +14,11 @@ from ..scorer import Scorer
|
|||
from ..training import remove_bilu_prefix
|
||||
from ..util import registry
|
||||
from ._parser_internals import nonproj
|
||||
from ._parser_internals.arc_eager import ArcEager
|
||||
from ._parser_internals.nonproj import DELIMITER
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
from .functions import merge_subtokens
|
||||
from .transition_parser import Parser
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
|
|
@@ -1,11 +1,11 @@
|
|||
from collections import Counter
|
||||
from itertools import islice
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
|
||||
|
||||
import numpy as np
|
||||
import srsly
|
||||
from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints2d
|
||||
from thinc.types import ArrayXd, Floats2d, Ints1d
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
|
@@ -18,6 +18,7 @@ from ._edit_tree_internals.schemas import validate_edit_tree
|
|||
from .lemmatizer import lemmatizer_score
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||
# The cutoff value of *top_k* above which an alternative method is used to process guesses.
|
||||
TOP_K_GUARDRAIL = 20
|
||||
|
||||
|
@@ -50,6 +51,7 @@ DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["mo
|
|||
"overwrite": False,
|
||||
"top_k": 1,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
|
@@ -62,6 +64,7 @@ def make_edit_tree_lemmatizer(
|
|||
overwrite: bool,
|
||||
top_k: int,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
):
|
||||
"""Construct an EditTreeLemmatizer component."""
|
||||
return EditTreeLemmatizer(
|
||||
|
@@ -73,6 +76,7 @@ def make_edit_tree_lemmatizer(
|
|||
overwrite=overwrite,
|
||||
top_k=top_k,
|
||||
scorer=scorer,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@@ -92,6 +96,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
overwrite: bool = False,
|
||||
top_k: int = 1,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
"""
|
||||
Construct an edit tree lemmatizer.
|
||||
|
@@ -103,6 +108,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
frequency in the training data.
|
||||
overwrite (bool): overwrite existing lemma annotations.
|
||||
top_k (int): try to apply at most the k most probable edit trees.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@@ -117,6 +123,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
self.cfg: Dict[str, Any] = {"labels": []}
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
self.numpy_ops = NumpyOps()
|
||||
|
||||
def get_loss(
|
||||
|
@@ -146,7 +153,26 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
return float(loss), d_scores
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
|
||||
def get_teacher_student_loss(
|
||||
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
|
||||
"""
|
||||
loss_func = SequenceCategoricalCrossentropy(normalize=False)
|
||||
d_scores, loss = loss_func(student_scores, teacher_scores)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
return float(loss), d_scores
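# Illustrative usage sketch (not part of the diff): how a distillation step might
# call get_teacher_student_loss, assuming `teacher` and `student` are two
# EditTreeLemmatizer components and `docs` is a batch of Doc objects. The names
# are hypothetical; only the method call itself comes from the code above.
#
#     teacher_scores = teacher.model.predict(docs)
#     student_scores, backprop = student.model.begin_update(docs)
#     loss, d_scores = student.get_teacher_student_loss(teacher_scores, student_scores)
#     backprop(d_scores)  # gradient of the distillation loss flows into the student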
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
if self.top_k == 1:
|
||||
scores2guesses = self._scores2guesses_top_k_equals_1
|
||||
elif self.top_k <= TOP_K_GUARDRAIL:
|
||||
|
@@ -163,14 +189,19 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
n_labels = len(self.cfg["labels"])
|
||||
guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
|
||||
guesses: List[Ints1d] = [
|
||||
self.model.ops.alloc((0,), dtype="i") for doc in docs
|
||||
]
|
||||
scores: List[Floats2d] = [
|
||||
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
|
||||
]
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
return {"probabilities": scores, "tree_ids": guesses}
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == n_docs
|
||||
guesses = scores2guesses(docs, scores)
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
return {"probabilities": scores, "tree_ids": guesses}
|
||||
|
||||
def _scores2guesses_top_k_equals_1(self, docs, scores):
|
||||
guesses = []
|
||||
|
@@ -230,8 +261,13 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
|
||||
batch_tree_ids = activations["tree_ids"]
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
doc.activations[self.name][act_name] = acts[i]
|
||||
doc_tree_ids = batch_tree_ids[i]
|
||||
if hasattr(doc_tree_ids, "get"):
|
||||
doc_tree_ids = doc_tree_ids.get()
|
||||
|
|
|
@@ -1,27 +1,40 @@
|
|||
import random
|
||||
from itertools import islice
|
||||
import warnings
|
||||
from itertools import islice, tee
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
import srsly
|
||||
from numpy import dtype
|
||||
from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
|
||||
from thinc.types import Floats2d
|
||||
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
from ..errors import Errors, Warnings
|
||||
from ..kb import Candidate, KnowledgeBase
|
||||
from ..language import Language
|
||||
from ..scorer import Scorer
|
||||
from ..tokens import Doc, Span
|
||||
from ..tokens import Doc, Span, SpanGroup
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..util import SimpleFrozenList, registry
|
||||
from ..vocab import Vocab
|
||||
from .legacy.entity_linker import EntityLinker_v1
|
||||
from .pipe import deserialize_config
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
|
||||
|
||||
KNOWLEDGE_BASE_IDS = "kb_ids"
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
@@ -51,14 +64,13 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"incl_prior": True,
|
||||
"incl_context": True,
|
||||
"entity_vector_length": 64,
|
||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v2"},
|
||||
"overwrite": False,
|
||||
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
|
||||
"overwrite": True,
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
"use_gold_ents": True,
|
||||
"candidates_batch_size": 1,
|
||||
"threshold": None,
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
"nel_micro_f": 1.0,
|
||||
|
@@ -76,16 +88,15 @@ def make_entity_linker(
|
|||
incl_prior: bool,
|
||||
incl_context: bool,
|
||||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
get_candidates: Callable[
|
||||
[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
use_gold_ents: bool,
|
||||
candidates_batch_size: int,
|
||||
threshold: Optional[float] = None,
|
||||
save_activations: bool,
|
||||
):
|
||||
"""Construct an EntityLinker component.
|
||||
|
||||
|
@@ -97,35 +108,20 @@ def make_entity_linker(
|
|||
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
||||
incl_context (bool): Whether or not to include the local context in the model.
|
||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
get_candidates (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]):
|
||||
Function producing a list of candidates per document, given a certain knowledge base and several textual
|
||||
documents with textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
|
||||
component must provide entity annotations.
|
||||
candidates_batch_size (int): Size of batches for entity candidate generation.
|
||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
|
||||
prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
|
||||
if not model.attrs.get("include_span_maker", False):
|
||||
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
|
||||
return EntityLinker_v1(
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
labels_discard=labels_discard,
|
||||
n_sents=n_sents,
|
||||
incl_prior=incl_prior,
|
||||
incl_context=incl_context,
|
||||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
)
|
||||
raise ValueError(Errors.E4005)
|
||||
|
||||
return EntityLinker(
|
||||
nlp.vocab,
|
||||
model,
|
||||
|
@@ -136,13 +132,12 @@ def make_entity_linker(
|
|||
incl_context=incl_context,
|
||||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
get_candidates_batch=get_candidates_batch,
|
||||
generate_empty_kb=generate_empty_kb,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
use_gold_ents=use_gold_ents,
|
||||
candidates_batch_size=candidates_batch_size,
|
||||
threshold=threshold,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@@ -174,16 +169,16 @@ class EntityLinker(TrainablePipe):
|
|||
incl_prior: bool,
|
||||
incl_context: bool,
|
||||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
get_candidates: Callable[
|
||||
[KnowledgeBase, Iterator[SpanGroup]],
|
||||
Iterator[Iterable[Iterable[Candidate]]],
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
use_gold_ents: bool,
|
||||
candidates_batch_size: int,
|
||||
threshold: Optional[float] = None,
|
||||
save_activations: bool = False,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
|
@@ -196,19 +191,17 @@ class EntityLinker(TrainablePipe):
|
|||
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
||||
incl_context (bool): Whether or not to include the local context in the model.
|
||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||
Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
get_candidates (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]):
|
||||
Function producing a list of candidates per document, given a certain knowledge base and several textual
|
||||
documents with textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
overwrite (bool): Whether to overwrite existing non-empty annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
candidates_batch_size (int): Size of batches for entity candidate generation.
|
||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
|
||||
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
|
||||
|
@@ -230,16 +223,15 @@ class EntityLinker(TrainablePipe):
|
|||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
self.get_candidates = get_candidates
|
||||
self.get_candidates_batch = get_candidates_batch
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
||||
self.use_gold_ents = use_gold_ents
|
||||
self.candidates_batch_size = candidates_batch_size
|
||||
self.threshold = threshold
|
||||
self.save_activations = save_activations
|
||||
|
||||
if candidates_batch_size < 1:
|
||||
raise ValueError(Errors.E1044)
|
||||
if self.incl_prior and not self.kb.supports_prior_probs:
|
||||
warnings.warn(Warnings.W401)
|
||||
|
||||
def _score_with_ents_set(examples: Iterable[Example], **kwargs):
|
||||
# Because of how spaCy works, we can't just score immediately, because Language.evaluate
|
||||
|
@@ -346,11 +338,12 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
If one isn't present, then the update step needs to be skipped.
|
||||
"""
|
||||
|
||||
for eg in examples:
|
||||
for ent in eg.predicted.ents:
|
||||
candidates = list(self.get_candidates(self.kb, ent))
|
||||
if candidates:
|
||||
for candidates_for_doc in self.get_candidates(
|
||||
self.kb,
|
||||
(SpanGroup(doc=eg.predicted, spans=eg.predicted.ents) for eg in examples),
|
||||
):
|
||||
for candidates_for_mention in candidates_for_doc:
|
||||
if list(candidates_for_mention):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
@@ -442,7 +435,7 @@ class EntityLinker(TrainablePipe):
|
|||
loss = loss / len(entity_encodings)
|
||||
return float(loss), out
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[str]:
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
Returns the KB IDs for each entity in each doc, including NIL if there is
|
||||
no prediction.
|
||||
|
@@ -455,40 +448,47 @@ class EntityLinker(TrainablePipe):
|
|||
self.validate_kb()
|
||||
entity_count = 0
|
||||
final_kb_ids: List[str] = []
|
||||
xp = self.model.ops.xp
|
||||
ops = self.model.ops
|
||||
xp = ops.xp
|
||||
docs_ents: List[Ragged] = []
|
||||
docs_scores: List[Ragged] = []
|
||||
if not docs:
|
||||
return final_kb_ids
|
||||
return {
|
||||
KNOWLEDGE_BASE_IDS: final_kb_ids,
|
||||
"ents": docs_ents,
|
||||
"scores": docs_scores,
|
||||
}
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
for i, doc in enumerate(docs):
|
||||
if len(doc) == 0:
|
||||
|
||||
docs_iters = tee(docs, 2)
|
||||
|
||||
# Call candidate generator.
|
||||
all_ent_cands = self.get_candidates(
|
||||
self.kb,
|
||||
(
|
||||
SpanGroup(
|
||||
doc,
|
||||
spans=[
|
||||
ent for ent in doc.ents if ent.label_ not in self.labels_discard
|
||||
],
|
||||
)
|
||||
for doc in docs_iters[0]
|
||||
),
|
||||
)
|
||||
|
||||
for doc in docs_iters[1]:
|
||||
doc_ents: List[Ints1d] = []
|
||||
doc_scores: List[Floats1d] = []
|
||||
if len(doc) == 0 or len(doc.ents) == 0:
|
||||
docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
|
||||
docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
|
||||
continue
|
||||
sentences = [s for s in doc.sents]
|
||||
doc_ent_cands = list(next(all_ent_cands))
|
||||
|
||||
# Loop over entities in batches.
|
||||
for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
|
||||
ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
|
||||
|
||||
# Look up candidate entities.
|
||||
valid_ent_idx = [
|
||||
idx
|
||||
for idx in range(len(ent_batch))
|
||||
if ent_batch[idx].label_ not in self.labels_discard
|
||||
]
|
||||
|
||||
batch_candidates = list(
|
||||
self.get_candidates_batch(
|
||||
self.kb, [ent_batch[idx] for idx in valid_ent_idx]
|
||||
)
|
||||
if self.candidates_batch_size > 1
|
||||
else [
|
||||
self.get_candidates(self.kb, ent_batch[idx])
|
||||
for idx in valid_ent_idx
|
||||
]
|
||||
)
|
||||
|
||||
# Looping through each entity in batch (TODO: rewrite)
|
||||
for j, ent in enumerate(ent_batch):
|
||||
# Looping over candidate entities for this doc. (TODO: rewrite)
|
||||
for ent_cand_idx, ent in enumerate(doc.ents):
|
||||
assert hasattr(ent, "sents")
|
||||
sents = list(ent.sents)
|
||||
sent_indices = (
|
||||
|
@@ -506,7 +506,6 @@ class EntityLinker(TrainablePipe):
|
|||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
sent_doc = doc[start_token:end_token].as_doc()
|
||||
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
|
@@ -515,21 +514,41 @@ class EntityLinker(TrainablePipe):
|
|||
if ent.label_ in self.labels_discard:
|
||||
# ignoring this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[0.0],
|
||||
ents=[0],
|
||||
)
|
||||
else:
|
||||
candidates = list(batch_candidates[j])
|
||||
candidates = list(doc_ent_cands[ent_cand_idx])
|
||||
if not candidates:
|
||||
# no prediction possible for this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[0.0],
|
||||
ents=[0],
|
||||
)
|
||||
elif len(candidates) == 1 and self.threshold is None:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
final_kb_ids.append(candidates[0].entity_id_)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[1.0],
|
||||
ents=[candidates[0].entity_id],
|
||||
)
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
# set all prior probabilities to 0 if incl_prior=False
|
||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
||||
if not self.incl_prior:
|
||||
prior_probs = xp.asarray([0.0 for _ in candidates])
|
||||
scores = prior_probs
|
||||
scores = prior_probs = xp.asarray(
|
||||
[
|
||||
c.prior_prob if self.incl_prior else 0.0
|
||||
for c in candidates
|
||||
]
|
||||
)
|
||||
# add in similarity from the context
|
||||
if self.incl_context:
|
||||
entity_encodings = xp.asarray(
|
||||
|
@@ -551,33 +570,58 @@ class EntityLinker(TrainablePipe):
|
|||
raise ValueError(Errors.E161)
|
||||
scores = prior_probs + sims - (prior_probs * sims)
|
||||
final_kb_ids.append(
|
||||
candidates[scores.argmax().item()].entity_
|
||||
if self.threshold is None
|
||||
or scores.max() >= self.threshold
|
||||
candidates[scores.argmax().item()].entity_id_
|
||||
if self.threshold is None or scores.max() >= self.threshold
|
||||
else EntityLinker.NIL
|
||||
)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=scores,
|
||||
ents=[c.entity_id for c in candidates],
|
||||
)
|
||||
|
||||
self._add_doc_activations(
|
||||
docs_scores=docs_scores,
|
||||
docs_ents=docs_ents,
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
)
|
||||
if not (len(final_kb_ids) == entity_count):
|
||||
err = Errors.E147.format(
|
||||
method="predict", msg="result variables not of equal length"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
return final_kb_ids
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
|
||||
return {
|
||||
KNOWLEDGE_BASE_IDS: final_kb_ids,
|
||||
"ents": docs_ents,
|
||||
"scores": docs_scores,
|
||||
}
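# Illustrative only (not part of the diff): the combined candidate score above,
# prior_probs + sims - (prior_probs * sims), is a probabilistic OR of the KB
# prior and the context similarity, so either strong signal can carry a candidate.
# With made-up numbers:
prior, sim = 0.3, 0.8
score = prior + sim - prior * sim  # 0.3 + 0.8 - 0.24 = 0.86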
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
||||
activations (ActivationsT): The activations used for setting annotations, produced
|
||||
by EntityLinker.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#set_annotations
|
||||
"""
|
||||
kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
|
||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||
if count_ents != len(kb_ids):
|
||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||
i = 0
|
||||
overwrite = self.cfg["overwrite"]
|
||||
for doc in docs:
|
||||
for j, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
if act_name != KNOWLEDGE_BASE_IDS:
|
||||
# We only copy activations that are Ragged.
|
||||
doc.activations[self.name][act_name] = cast(Ragged, acts[j])
|
||||
|
||||
for ent in doc.ents:
|
||||
kb_id = kb_ids[i]
|
||||
i += 1
|
||||
|
@ -676,3 +720,32 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
def add_label(self, label):
|
||||
raise NotImplementedError
|
||||
|
||||
def _add_doc_activations(
|
||||
self,
|
||||
*,
|
||||
docs_scores: List[Ragged],
|
||||
docs_ents: List[Ragged],
|
||||
doc_scores: List[Floats1d],
|
||||
doc_ents: List[Ints1d],
|
||||
):
|
||||
if not self.save_activations:
|
||||
return
|
||||
ops = self.model.ops
|
||||
lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
|
||||
docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
|
||||
docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
|
||||
|
||||
def _add_activations(
|
||||
self,
|
||||
*,
|
||||
doc_scores: List[Floats1d],
|
||||
doc_ents: List[Ints1d],
|
||||
scores: Sequence[float],
|
||||
ents: Sequence[int],
|
||||
):
|
||||
if not self.save_activations:
|
||||
return
|
||||
ops = self.model.ops
|
||||
doc_scores.append(ops.asarray1f(scores))
|
||||
doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
|
||||
|
|
|
@@ -1,541 +0,0 @@
|
|||
import warnings
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
|
||||
|
||||
import srsly
|
||||
|
||||
from ..errors import Errors, Warnings
|
||||
from ..language import Language
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..matcher.levenshtein import levenshtein_compare
|
||||
from ..scorer import get_ner_prf
|
||||
from ..tokens import Doc, Span
|
||||
from ..training import Example
|
||||
from ..util import SimpleFrozenList, ensure_path, from_disk, registry, to_disk
|
||||
from .pipe import Pipe
|
||||
|
||||
DEFAULT_ENT_ID_SEP = "||"
|
||||
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"entity_ruler",
|
||||
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
|
||||
default_config={
|
||||
"phrase_matcher_attr": None,
|
||||
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||
"validate": False,
|
||||
"overwrite_ents": False,
|
||||
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
"ents_p": 0.0,
|
||||
"ents_r": 0.0,
|
||||
"ents_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_entity_ruler(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
phrase_matcher_attr: Optional[Union[int, str]],
|
||||
matcher_fuzzy_compare: Callable,
|
||||
validate: bool,
|
||||
overwrite_ents: bool,
|
||||
ent_id_sep: str,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return EntityRuler(
|
||||
nlp,
|
||||
name,
|
||||
phrase_matcher_attr=phrase_matcher_attr,
|
||||
matcher_fuzzy_compare=matcher_fuzzy_compare,
|
||||
validate=validate,
|
||||
overwrite_ents=overwrite_ents,
|
||||
ent_id_sep=ent_id_sep,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def entity_ruler_score(examples, **kwargs):
|
||||
return get_ner_prf(examples)
|
||||
|
||||
|
||||
@registry.scorers("spacy.entity_ruler_scorer.v1")
|
||||
def make_entity_ruler_scorer():
|
||||
return entity_ruler_score
|
||||
|
||||
|
||||
class EntityRuler(Pipe):
|
||||
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
|
||||
rules or exact phrase matches. It can be combined with the statistical
|
||||
`EntityRecognizer` to boost accuracy, or used on its own to implement a
|
||||
purely rule-based entity recognition system. After initialization, the
|
||||
component is typically added to the pipeline using `nlp.add_pipe`.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler
|
||||
USAGE: https://spacy.io/usage/rule-based-matching#entityruler
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
nlp: Language,
|
||||
name: str = "entity_ruler",
|
||||
*,
|
||||
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
||||
matcher_fuzzy_compare: Callable = levenshtein_compare,
|
||||
validate: bool = False,
|
||||
overwrite_ents: bool = False,
|
||||
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
||||
patterns: Optional[List[PatternType]] = None,
|
||||
scorer: Optional[Callable] = entity_ruler_score,
|
||||
) -> None:
|
||||
"""Initialize the entity ruler. If patterns are supplied here, they
|
||||
need to be a list of dictionaries with a `"label"` and `"pattern"`
|
||||
key. A pattern can either be a token pattern (list) or a phrase pattern
|
||||
(string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
|
||||
|
||||
nlp (Language): The shared nlp object to pass the vocab to the matchers
|
||||
and process phrase patterns.
|
||||
name (str): Instance name of the current pipeline component. Typically
|
||||
passed in automatically from the factory when the component is
|
||||
added. Used to disable the current entity ruler while creating
|
||||
phrase patterns with the nlp object.
|
||||
phrase_matcher_attr (int / str): Token attribute to match on, passed
|
||||
to the internal PhraseMatcher as `attr`.
|
||||
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
|
||||
internal Matcher. Defaults to
|
||||
spacy.matcher.levenshtein.levenshtein_compare.
|
||||
validate (bool): Whether patterns should be validated, passed to
|
||||
Matcher and PhraseMatcher as `validate`
|
||||
patterns (iterable): Optional patterns to load in.
|
||||
overwrite_ents (bool): If existing entities are present, e.g. entities
|
||||
added by the model, overwrite them by matches if necessary.
|
||||
ent_id_sep (str): Separator used internally for entity IDs.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
spacy.scorer.get_ner_prf.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#init
|
||||
"""
|
||||
self.nlp = nlp
|
||||
self.name = name
|
||||
self.overwrite = overwrite_ents
|
||||
self.token_patterns = defaultdict(list) # type: ignore
|
||||
self.phrase_patterns = defaultdict(list) # type: ignore
|
||||
self._validate = validate
|
||||
self.matcher_fuzzy_compare = matcher_fuzzy_compare
|
||||
self.matcher = Matcher(
|
||||
nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
|
||||
)
|
||||
self.phrase_matcher_attr = phrase_matcher_attr
|
||||
self.phrase_matcher = PhraseMatcher(
|
||||
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
|
||||
)
|
||||
self.ent_id_sep = ent_id_sep
|
||||
self._ent_ids = defaultdict(tuple) # type: ignore
|
||||
if patterns is not None:
|
||||
self.add_patterns(patterns)
|
||||
self.scorer = scorer
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""The number of all patterns added to the entity ruler."""
|
||||
n_token_patterns = sum(len(p) for p in self.token_patterns.values())
|
||||
n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
|
||||
return n_token_patterns + n_phrase_patterns
|
||||
|
||||
def __contains__(self, label: str) -> bool:
|
||||
"""Whether a label is present in the patterns."""
|
||||
return label in self.token_patterns or label in self.phrase_patterns
|
||||
|
||||
def __call__(self, doc: Doc) -> Doc:
|
||||
"""Find matches in document and add them as entities.
|
||||
|
||||
doc (Doc): The Doc object in the pipeline.
|
||||
RETURNS (Doc): The Doc with added entities, if available.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#call
|
||||
"""
|
||||
error_handler = self.get_error_handler()
|
||||
try:
|
||||
matches = self.match(doc)
|
||||
self.set_annotations(doc, matches)
|
||||
return doc
|
||||
except Exception as e:
|
||||
return error_handler(self.name, self, [doc], e)
|
||||
|
||||
def match(self, doc: Doc):
|
||||
self._require_patterns()
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", message="\\[W036")
|
||||
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
||||
|
||||
final_matches = set(
|
||||
[(m_id, start, end) for m_id, start, end in matches if start != end]
|
||||
)
|
||||
get_sort_key = lambda m: (m[2] - m[1], -m[1])
|
||||
final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
|
||||
return final_matches
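# Illustrative only (not part of the diff): the sort key above orders matches by
# length (longest first) and breaks ties by start position (earliest first).
# E.g. spans (start, end) of (1, 4), (0, 2), (3, 5) sort to:
#     (1, 4)   length 3
#     (0, 2)   length 2, starts earlier than (3, 5)
#     (3, 5)   length 2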
|
||||
|
||||
def set_annotations(self, doc, matches):
|
||||
"""Modify the document in place"""
|
||||
entities = list(doc.ents)
|
||||
new_entities = []
|
||||
seen_tokens = set()
|
||||
for match_id, start, end in matches:
|
||||
if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
|
||||
continue
|
||||
# check for end - 1 here because boundaries are inclusive
|
||||
if start not in seen_tokens and end - 1 not in seen_tokens:
|
||||
if match_id in self._ent_ids:
|
||||
label, ent_id = self._ent_ids[match_id]
|
||||
span = Span(doc, start, end, label=label, span_id=ent_id)
|
||||
else:
|
||||
span = Span(doc, start, end, label=match_id)
|
||||
new_entities.append(span)
|
||||
entities = [
|
||||
e for e in entities if not (e.start < end and e.end > start)
|
||||
]
|
||||
seen_tokens.update(range(start, end))
|
||||
doc.ents = entities + new_entities
|
||||
|
||||
@property
|
||||
def labels(self) -> Tuple[str, ...]:
|
||||
"""All labels present in the match patterns.
|
||||
|
||||
RETURNS (set): The string labels.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#labels
|
||||
"""
|
||||
keys = set(self.token_patterns.keys())
|
||||
keys.update(self.phrase_patterns.keys())
|
||||
all_labels = set()
|
||||
|
||||
for l in keys:
|
||||
if self.ent_id_sep in l:
|
||||
label, _ = self._split_label(l)
|
||||
all_labels.add(label)
|
||||
else:
|
||||
all_labels.add(l)
|
||||
return tuple(sorted(all_labels))
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
patterns: Optional[Sequence[PatternType]] = None,
|
||||
):
|
||||
"""Initialize the pipe for training.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
patterns Optional[Iterable[PatternType]]: The list of patterns.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#initialize
|
||||
"""
|
||||
self.clear()
|
||||
if patterns:
|
||||
self.add_patterns(patterns) # type: ignore[arg-type]
|
||||
|
||||
@property
|
||||
def ent_ids(self) -> Tuple[Optional[str], ...]:
|
||||
"""All entity ids present in the match patterns `id` properties
|
||||
|
||||
RETURNS (set): The string entity ids.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#ent_ids
|
||||
"""
|
||||
keys = set(self.token_patterns.keys())
|
||||
keys.update(self.phrase_patterns.keys())
|
||||
all_ent_ids = set()
|
||||
|
||||
for l in keys:
|
||||
if self.ent_id_sep in l:
|
||||
_, ent_id = self._split_label(l)
|
||||
all_ent_ids.add(ent_id)
|
||||
return tuple(all_ent_ids)
|
||||
|
||||
@property
|
||||
def patterns(self) -> List[PatternType]:
|
||||
"""Get all patterns that were added to the entity ruler.
|
||||
|
||||
RETURNS (list): The original patterns, one dictionary per pattern.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#patterns
|
||||
"""
|
||||
all_patterns = []
|
||||
for label, patterns in self.token_patterns.items():
|
||||
for pattern in patterns:
|
||||
ent_label, ent_id = self._split_label(label)
|
||||
p = {"label": ent_label, "pattern": pattern}
|
||||
if ent_id:
|
||||
p["id"] = ent_id
|
||||
all_patterns.append(p)
|
||||
for label, patterns in self.phrase_patterns.items():
|
||||
for pattern in patterns:
|
||||
ent_label, ent_id = self._split_label(label)
|
||||
p = {"label": ent_label, "pattern": pattern.text}
|
||||
if ent_id:
|
||||
p["id"] = ent_id
|
||||
all_patterns.append(p)
|
||||
return all_patterns
|
||||
|
||||
def add_patterns(self, patterns: List[PatternType]) -> None:
|
||||
"""Add patterns to the entity ruler. A pattern can either be a token
|
||||
pattern (list of dicts) or a phrase pattern (string). For example:
|
||||
{'label': 'ORG', 'pattern': 'Apple'}
|
||||
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
|
||||
|
||||
patterns (list): The patterns to add.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#add_patterns
|
||||
"""
|
||||
|
||||
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
|
||||
try:
|
||||
current_index = -1
|
||||
for i, (name, pipe) in enumerate(self.nlp.pipeline):
|
||||
if self == pipe:
|
||||
current_index = i
|
||||
break
|
||||
subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
|
||||
except ValueError:
|
||||
subsequent_pipes = []
|
||||
with self.nlp.select_pipes(disable=subsequent_pipes):
|
||||
token_patterns = []
|
||||
phrase_pattern_labels = []
|
||||
phrase_pattern_texts = []
|
||||
phrase_pattern_ids = []
|
||||
for entry in patterns:
|
||||
if isinstance(entry["pattern"], str):
|
||||
phrase_pattern_labels.append(entry["label"])
|
||||
phrase_pattern_texts.append(entry["pattern"])
|
||||
phrase_pattern_ids.append(entry.get("id"))
|
||||
elif isinstance(entry["pattern"], list):
|
||||
token_patterns.append(entry)
|
||||
phrase_patterns = []
|
||||
for label, pattern, ent_id in zip(
|
||||
phrase_pattern_labels,
|
||||
self.nlp.pipe(phrase_pattern_texts),
|
||||
phrase_pattern_ids,
|
||||
):
|
||||
phrase_pattern = {"label": label, "pattern": pattern}
|
||||
if ent_id:
|
||||
phrase_pattern["id"] = ent_id
|
||||
phrase_patterns.append(phrase_pattern)
|
||||
for entry in token_patterns + phrase_patterns: # type: ignore[operator]
|
||||
label = entry["label"] # type: ignore
|
||||
if "id" in entry:
|
||||
ent_label = label
|
||||
label = self._create_label(label, entry["id"])
|
||||
key = self.matcher._normalize_key(label)
|
||||
self._ent_ids[key] = (ent_label, entry["id"])
|
||||
pattern = entry["pattern"] # type: ignore
|
||||
if isinstance(pattern, Doc):
|
||||
self.phrase_patterns[label].append(pattern)
|
||||
self.phrase_matcher.add(label, [pattern]) # type: ignore
|
||||
elif isinstance(pattern, list):
|
||||
self.token_patterns[label].append(pattern)
|
||||
self.matcher.add(label, [pattern])
|
||||
else:
|
||||
raise ValueError(Errors.E097.format(pattern=pattern))
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Reset all patterns."""
|
||||
self.token_patterns = defaultdict(list)
|
||||
self.phrase_patterns = defaultdict(list)
|
||||
self._ent_ids = defaultdict(tuple)
|
||||
self.matcher = Matcher(
|
||||
self.nlp.vocab,
|
||||
validate=self._validate,
|
||||
fuzzy_compare=self.matcher_fuzzy_compare,
|
||||
)
|
||||
self.phrase_matcher = PhraseMatcher(
|
||||
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
|
||||
)
|
||||
|
||||
def remove(self, ent_id: str) -> None:
|
||||
"""Remove a pattern by its ent_id if a pattern with this ent_id was added before
|
||||
|
||||
ent_id (str): id of the pattern to be removed
|
||||
RETURNS: None
|
||||
DOCS: https://spacy.io/api/entityruler#remove
|
||||
"""
|
||||
label_id_pairs = [
|
||||
(label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
|
||||
]
|
||||
if not label_id_pairs:
|
||||
raise ValueError(
|
||||
Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
|
||||
)
|
||||
created_labels = [
|
||||
self._create_label(label, eid) for (label, eid) in label_id_pairs
|
||||
]
|
||||
# remove the patterns from self.phrase_patterns
|
||||
self.phrase_patterns = defaultdict(
|
||||
list,
|
||||
{
|
||||
label: val
|
||||
for (label, val) in self.phrase_patterns.items()
|
||||
if label not in created_labels
|
||||
},
|
||||
)
|
||||
# remove the patterns from self.token_pattern
|
||||
self.token_patterns = defaultdict(
|
||||
list,
|
||||
{
|
||||
label: val
|
||||
for (label, val) in self.token_patterns.items()
|
||||
if label not in created_labels
|
||||
},
|
||||
)
|
||||
# remove the patterns from self.token_pattern
|
||||
for label in created_labels:
|
||||
if label in self.phrase_matcher:
|
||||
self.phrase_matcher.remove(label)
|
||||
else:
|
||||
self.matcher.remove(label)
|
||||
|
||||
def _require_patterns(self) -> None:
|
||||
"""Raise a warning if this component has no patterns defined."""
|
||||
if len(self) == 0:
|
||||
warnings.warn(Warnings.W036.format(name=self.name))
|
||||
|
||||
def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
|
||||
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
|
||||
|
||||
label (str): The value of label in a pattern entry
|
||||
RETURNS (tuple): ent_label, ent_id
|
||||
"""
|
||||
if self.ent_id_sep in label:
|
||||
ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
|
||||
else:
|
||||
ent_label = label
|
||||
ent_id = None # type: ignore
|
||||
return ent_label, ent_id
|
||||
|
||||
def _create_label(self, label: Any, ent_id: Any) -> str:
|
||||
"""Join Entity label with ent_id if the pattern has an `id` attribute
|
||||
If ent_id is not a string, the label is returned as is.
|
||||
|
||||
label (str): The label to set for ent.label_
|
||||
ent_id (str): The label
|
||||
RETURNS (str): The ent_label joined with configured `ent_id_sep`
|
||||
"""
|
||||
if isinstance(ent_id, str):
|
||||
label = f"{label}{self.ent_id_sep}{ent_id}"
|
||||
return label
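# Illustrative only (not part of the diff): with the default separator "||",
# _create_label and _split_label round-trip a label/ID pair like this:
#   _create_label("ORG", "acme-co")  -> "ORG||acme-co"
#   _split_label("ORG||acme-co")     -> ("ORG", "acme-co")
# A non-string ent_id (e.g. None) leaves the label untouched.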
|
||||
|
||||
def from_bytes(
|
||||
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityRuler":
|
||||
"""Load the entity ruler from a bytestring.
|
||||
|
||||
patterns_bytes (bytes): The bytestring to load.
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#from_bytes
|
||||
"""
|
||||
cfg = srsly.msgpack_loads(patterns_bytes)
|
||||
self.clear()
|
||||
if isinstance(cfg, dict):
|
||||
self.add_patterns(cfg.get("patterns", cfg))
|
||||
self.overwrite = cfg.get("overwrite", False)
|
||||
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
|
||||
self.phrase_matcher = PhraseMatcher(
|
||||
self.nlp.vocab,
|
||||
attr=self.phrase_matcher_attr,
|
||||
)
|
||||
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
||||
else:
|
||||
self.add_patterns(cfg)
|
||||
return self
|
||||
|
||||
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||
"""Serialize the entity ruler patterns to a bytestring.
|
||||
|
||||
RETURNS (bytes): The serialized patterns.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#to_bytes
|
||||
"""
|
||||
serial = {
|
||||
"overwrite": self.overwrite,
|
||||
"ent_id_sep": self.ent_id_sep,
|
||||
"phrase_matcher_attr": self.phrase_matcher_attr,
|
||||
"patterns": self.patterns,
|
||||
}
|
||||
return srsly.msgpack_dumps(serial)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityRuler":
|
||||
"""Load the entity ruler from a file. Expects a file containing
|
||||
newline-delimited JSON (JSONL) with one entry per line.
|
||||
|
||||
path (str / Path): The JSONL file to load.
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#from_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
self.clear()
|
||||
depr_patterns_path = path.with_suffix(".jsonl")
|
||||
if path.suffix == ".jsonl": # user provides a jsonl
|
||||
if path.is_file:
|
||||
patterns = srsly.read_jsonl(path)
|
||||
self.add_patterns(patterns)
|
||||
else:
|
||||
raise ValueError(Errors.E1023.format(path=path))
|
||||
elif depr_patterns_path.is_file():
|
||||
patterns = srsly.read_jsonl(depr_patterns_path)
|
||||
self.add_patterns(patterns)
|
||||
elif path.is_dir(): # path is a valid directory
|
||||
cfg = {}
|
||||
deserializers_patterns = {
|
||||
"patterns": lambda p: self.add_patterns(
|
||||
srsly.read_jsonl(p.with_suffix(".jsonl"))
|
||||
)
|
||||
}
|
||||
deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
|
||||
from_disk(path, deserializers_cfg, {})
|
||||
self.overwrite = cfg.get("overwrite", False)
|
||||
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
|
||||
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
||||
|
||||
self.phrase_matcher = PhraseMatcher(
|
||||
self.nlp.vocab, attr=self.phrase_matcher_attr
|
||||
)
|
||||
from_disk(path, deserializers_patterns, {})
|
||||
else: # path is not a valid directory or file
|
||||
raise ValueError(Errors.E146.format(path=path))
|
||||
return self
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""Save the entity ruler patterns to a directory. The patterns will be
|
||||
saved as newline-delimited JSON (JSONL).
|
||||
|
||||
path (str / Path): The JSONL file to save.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#to_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
cfg = {
|
||||
"overwrite": self.overwrite,
|
||||
"phrase_matcher_attr": self.phrase_matcher_attr,
|
||||
"ent_id_sep": self.ent_id_sep,
|
||||
}
|
||||
serializers = {
|
||||
"patterns": lambda p: srsly.write_jsonl(
|
||||
p.with_suffix(".jsonl"), self.patterns
|
||||
),
|
||||
"cfg": lambda p: srsly.write_json(p, cfg),
|
||||
}
|
||||
if path.suffix == ".jsonl": # user wants to save only JSONL
|
||||
srsly.write_jsonl(path, self.patterns)
|
||||
else:
|
||||
to_disk(path, serializers, {})
|
|
@@ -1,3 +0,0 @@
|
|||
from .entity_linker import EntityLinker_v1
|
||||
|
||||
__all__ = ["EntityLinker_v1"]
|
|
@@ -1,422 +0,0 @@
|
|||
# This file is present to provide a prior version of the EntityLinker component
|
||||
# for backwards compatibility. For details see #9669.
|
||||
|
||||
import random
|
||||
import warnings
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
|
||||
|
||||
import srsly
|
||||
from thinc.api import CosineDistance, Model, Optimizer, set_dropout_rate
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ... import util
|
||||
from ...errors import Errors, Warnings
|
||||
from ...kb import Candidate, KnowledgeBase
|
||||
from ...language import Language
|
||||
from ...ml import empty_kb
|
||||
from ...scorer import Scorer
|
||||
from ...tokens import Doc, Span
|
||||
from ...training import Example, validate_examples, validate_get_examples
|
||||
from ...util import SimpleFrozenList
|
||||
from ...vocab import Vocab
|
||||
from ..pipe import deserialize_config
|
||||
from ..trainable_pipe import TrainablePipe
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
|
||||
|
||||
def entity_linker_score(examples, **kwargs):
|
||||
return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)
|
||||
|
||||
|
||||
class EntityLinker_v1(TrainablePipe):
|
||||
"""Pipeline component for named entity linking.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker
|
||||
"""
|
||||
|
||||
NIL = "NIL" # string used to refer to a non-existing link
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "entity_linker",
|
||||
*,
|
||||
labels_discard: Iterable[str],
|
||||
n_sents: int,
|
||||
incl_prior: bool,
|
||||
incl_context: bool,
|
||||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
|
||||
n_sents (int): The number of neighbouring sentences to take into account.
|
||||
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
||||
incl_context (bool): Whether or not to include the local context in the model.
|
||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.labels_discard = list(labels_discard)
|
||||
self.n_sents = n_sents
|
||||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
self.get_candidates = get_candidates
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.scorer = scorer
|
||||
|
||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||
"""Define the KB of this pipe by providing a function that will
|
||||
create it using this object's vocab."""
|
||||
if not callable(kb_loader):
|
||||
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
|
||||
|
||||
self.kb = kb_loader(self.vocab)
|
||||
|
||||
def validate_kb(self) -> None:
|
||||
# Raise an error if the knowledge base is not initialized.
|
||||
if self.kb is None:
|
||||
raise ValueError(Errors.E1018.format(name=self.name))
|
||||
if len(self.kb) == 0:
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
|
||||
Note that providing this argument, will overwrite all data accumulated in the current KB.
|
||||
Use this only when loading a KB as-such from file.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "EntityLinker_v1.initialize")
|
||||
if kb_loader is not None:
|
||||
self.set_kb(kb_loader)
|
||||
self.validate_kb()
|
||||
nO = self.kb.entity_vector_length
|
||||
doc_sample = []
|
||||
vector_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.x)
|
||||
vector_sample.append(self.model.ops.alloc1f(nO))
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(
|
||||
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
||||
)
|
||||
|
||||
def update(
|
||||
self,
|
||||
examples: Iterable[Example],
|
||||
*,
|
||||
drop: float = 0.0,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
) -> Dict[str, float]:
|
||||
"""Learn from a batch of documents and gold-standard information,
|
||||
updating the pipe's model. Delegates to predict and get_loss.
|
||||
|
||||
examples (Iterable[Example]): A batch of Example objects.
|
||||
drop (float): The dropout rate.
|
||||
sgd (thinc.api.Optimizer): The optimizer.
|
||||
losses (Dict[str, float]): Optional record of the loss during training.
|
||||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#update
|
||||
"""
|
||||
self.validate_kb()
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
if not examples:
|
||||
return losses
|
||||
validate_examples(examples, "EntityLinker_v1.update")
|
||||
sentence_docs = []
|
||||
for eg in examples:
|
||||
sentences = [s for s in eg.reference.sents]
|
||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||
for ent in eg.reference.ents:
|
||||
# KB ID of the first token is the same as the whole span
|
||||
kb_id = kb_ids[ent.start]
|
||||
if kb_id:
|
||||
try:
|
||||
# find the sentence in the list of sentences.
|
||||
sent_index = sentences.index(ent.sent)
|
||||
except AttributeError:
|
||||
# Catch the exception when ent.sent is None and provide a user-friendly warning
|
||||
raise RuntimeError(Errors.E030) from None
|
||||
# get n previous sentences, if there are any
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
# get n posterior sentences, or as many < n as there are
|
||||
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
|
||||
# get token positions
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
# append that span as a doc to training
|
||||
sent_doc = eg.predicted[start_token:end_token].as_doc()
|
||||
sentence_docs.append(sent_doc)
|
||||
set_dropout_rate(self.model, drop)
|
||||
if not sentence_docs:
|
||||
warnings.warn(Warnings.W093.format(name="Entity Linker"))
|
||||
return losses
|
||||
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
|
||||
loss, d_scores = self.get_loss(
|
||||
sentence_encodings=sentence_encodings, examples=examples
|
||||
)
|
||||
bp_context(d_scores)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
|
||||
validate_examples(examples, "EntityLinker_v1.get_loss")
|
||||
entity_encodings = []
|
||||
for eg in examples:
|
||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||
for ent in eg.reference.ents:
|
||||
kb_id = kb_ids[ent.start]
|
||||
if kb_id:
|
||||
entity_encoding = self.kb.get_vector(kb_id)
|
||||
entity_encodings.append(entity_encoding)
|
||||
entity_encodings = self.model.ops.asarray2f(entity_encodings)
|
||||
if sentence_encodings.shape != entity_encodings.shape:
|
||||
err = Errors.E147.format(
|
||||
method="get_loss", msg="gold entities do not match up"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
|
||||
loss = self.distance.get_loss(sentence_encodings, entity_encodings)
|
||||
loss = loss / len(entity_encodings)
|
||||
return float(loss), gradients
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[str]:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
Returns the KB IDs for each entity in each doc, including NIL if there is
|
||||
no prediction.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
RETURNS (List[str]): The model's prediction for each document.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#predict
|
||||
"""
|
||||
self.validate_kb()
|
||||
entity_count = 0
|
||||
final_kb_ids: List[str] = []
|
||||
if not docs:
|
||||
return final_kb_ids
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
for i, doc in enumerate(docs):
|
||||
sentences = [s for s in doc.sents]
|
||||
if len(doc) > 0:
|
||||
# Looping through each entity (TODO: rewrite)
|
||||
for ent in doc.ents:
|
||||
sent = ent.sent
|
||||
sent_index = sentences.index(sent)
|
||||
assert sent_index >= 0
|
||||
# get n_neighbour sentences, clipped to the length of the document
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
sent_doc = doc[start_token:end_token].as_doc()
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
xp = self.model.ops.xp
|
||||
if self.incl_context:
|
||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
sentence_norm = xp.linalg.norm(sentence_encoding_t)
|
||||
entity_count += 1
|
||||
if ent.label_ in self.labels_discard:
|
||||
# ignoring this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
else:
|
||||
candidates = list(self.get_candidates(self.kb, ent))
|
||||
if not candidates:
|
||||
# no prediction possible for this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
elif len(candidates) == 1:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
# set all prior probabilities to 0 if incl_prior=False
|
||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
||||
if not self.incl_prior:
|
||||
prior_probs = xp.asarray([0.0 for _ in candidates])
|
||||
scores = prior_probs
|
||||
# add in similarity from the context
|
||||
if self.incl_context:
|
||||
entity_encodings = xp.asarray(
|
||||
[c.entity_vector for c in candidates]
|
||||
)
|
||||
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
|
||||
if len(entity_encodings) != len(prior_probs):
|
||||
raise RuntimeError(
|
||||
Errors.E147.format(
|
||||
method="predict",
|
||||
msg="vectors not of equal length",
|
||||
)
|
||||
)
|
||||
# cosine similarity
|
||||
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
|
||||
sentence_norm * entity_norm
|
||||
)
|
||||
if sims.shape != prior_probs.shape:
|
||||
raise ValueError(Errors.E161)
|
||||
scores = prior_probs + sims - (prior_probs * sims)
|
||||
best_index = scores.argmax().item()
|
||||
best_candidate = candidates[best_index]
|
||||
final_kb_ids.append(best_candidate.entity_)
|
||||
if not (len(final_kb_ids) == entity_count):
|
||||
err = Errors.E147.format(
|
||||
method="predict", msg="result variables not of equal length"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
return final_kb_ids
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#set_annotations
|
||||
"""
|
||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||
if count_ents != len(kb_ids):
|
||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||
i = 0
|
||||
overwrite = self.cfg["overwrite"]
|
||||
for doc in docs:
|
||||
for ent in doc.ents:
|
||||
kb_id = kb_ids[i]
|
||||
i += 1
|
||||
for token in ent:
|
||||
if token.ent_kb_id == 0 or overwrite:
|
||||
token.ent_kb_id_ = kb_id
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#to_bytes
|
||||
"""
|
||||
self._validate_serialization_attrs()
|
||||
serialize = {}
|
||||
if hasattr(self, "cfg") and self.cfg is not None:
|
||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||
serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
|
||||
serialize["kb"] = self.kb.to_bytes
|
||||
serialize["model"] = self.model.to_bytes
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||
"""Load the pipe from a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (TrainablePipe): The loaded object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#from_bytes
|
||||
"""
|
||||
self._validate_serialization_attrs()
|
||||
|
||||
def load_model(b):
|
||||
try:
|
||||
self.model.from_bytes(b)
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize = {}
|
||||
if hasattr(self, "cfg") and self.cfg is not None:
|
||||
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
|
||||
deserialize["kb"] = lambda b: self.kb.from_bytes(b)
|
||||
deserialize["model"] = load_model
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""Serialize the pipe to disk.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#to_disk
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
|
||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||
serialize["kb"] = lambda p: self.kb.to_disk(p)
|
||||
serialize["model"] = lambda p: self.model.to_disk(p)
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityLinker_v1":
|
||||
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (EntityLinker): The modified EntityLinker object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#from_disk
|
||||
"""
|
||||
|
||||
def load_model(p):
|
||||
try:
|
||||
with p.open("rb") as infile:
|
||||
self.model.from_bytes(infile.read())
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize: Dict[str, Callable[[Any], Any]] = {}
|
||||
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
|
||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
|
||||
deserialize["kb"] = lambda p: self.kb.from_disk(p)
|
||||
deserialize["model"] = load_model
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def rehearse(self, examples, *, sgd=None, losses=None, **config):
|
||||
raise NotImplementedError
|
||||
|
||||
def add_label(self, label):
|
||||
raise NotImplementedError
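The candidate scoring in EntityLinker_v1.predict above merges the knowledge-base prior probability with the context similarity as scores = prior_probs + sims - (prior_probs * sims), i.e. a probabilistic OR of the two signals. A minimal sketch of that combination with invented candidate values (not taken from the diff):

import numpy as np

prior_probs = np.array([0.70, 0.20, 0.05])  # hypothetical KB priors for three candidates
sims = np.array([0.10, 0.60, 0.90])         # hypothetical cosine similarity to the sentence context

scores = prior_probs + sims - (prior_probs * sims)
print(scores)           # [0.73  0.68  0.905]
print(scores.argmax())  # 2 -> the candidate with the strongest context support wins here

A candidate therefore ranks highly if either its prior or its context similarity is high, and the score stays at most 1.0 as long as both inputs lie in [0, 1].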
|
|
@@ -2,6 +2,7 @@ import warnings
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

import srsly
from thinc.api import Model

from .. import util

@@ -155,8 +156,24 @@ class Lemmatizer(Pipe):
        """
        required_tables, optional_tables = self.get_lookups_config(self.mode)
        if lookups is None:
            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
            logger.debug(
                "Lemmatizer: no lemmatizer lookups tables provided, "
                "trying to load tables from registered lookups (usually "
                "spacy-lookups-data)"
            )
            lookups = load_lookups(
                lang=self.vocab.lang, tables=required_tables, strict=False
            )
            missing_tables = set(required_tables) - set(lookups.tables)
            if len(missing_tables) > 0:
                raise ValueError(
                    Errors.E4010.format(
                        missing_tables=list(missing_tables),
                        pipe_name=self.name,
                        required_tables=srsly.json_dumps(required_tables),
                        tables=srsly.json_dumps(required_tables + optional_tables),
                    )
                )
            optional_lookups = load_lookups(
                lang=self.vocab.lang, tables=optional_tables, strict=False
            )

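The new initialization path above loads the lookup tables with strict=False and then checks for missing required tables itself, so it can raise the new E4010 error with a more complete message. A small sketch of that loading behaviour; the table names are illustrative and spacy-lookups-data may or may not be installed:

from spacy.lookups import load_lookups

required_tables = ["lemma_rules", "lemma_exc", "lemma_index"]  # illustrative required set
lookups = load_lookups(lang="en", tables=required_tables, strict=False)
# With strict=False, tables that cannot be found are skipped rather than raising,
# so the caller can report exactly which required tables are missing.
missing_tables = set(required_tables) - set(lookups.tables)
if missing_tables:
    print("Missing lemmatizer tables:", sorted(missing_tables))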
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, binding=True
|
||||
from itertools import islice
|
||||
from typing import Callable, Dict, Optional, Union
|
||||
from typing import Callable, Dict, Iterable, Optional, Union
|
||||
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
|
||||
|
@ -15,7 +15,7 @@ from ..parts_of_speech import IDS as POS_IDS
|
|||
from ..scorer import Scorer
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
from .tagger import Tagger
|
||||
from .tagger import ActivationsT, Tagger
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
|
@ -50,8 +50,14 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"morphologizer",
|
||||
assigns=["token.morph", "token.pos"],
|
||||
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
|
||||
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
|
||||
default_config={
|
||||
"model": DEFAULT_MORPH_MODEL,
|
||||
"overwrite": True,
|
||||
"extend": False,
|
||||
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
|
||||
"label_smoothing": 0.0,
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||
)
|
||||
def make_morphologizer(
|
||||
|
@ -62,8 +68,10 @@ def make_morphologizer(
|
|||
extend: bool,
|
||||
label_smoothing: float,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
):
|
||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
|
||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer,
|
||||
save_activations=save_activations)
|
||||
|
||||
|
||||
def morphologizer_score(examples, **kwargs):
|
||||
|
@ -99,6 +107,7 @@ class Morphologizer(Tagger):
|
|||
extend: bool = BACKWARD_EXTEND,
|
||||
label_smoothing: float = 0.0,
|
||||
scorer: Optional[Callable] = morphologizer_score,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
"""Initialize a morphologizer.
|
||||
|
||||
|
@ -106,9 +115,12 @@ class Morphologizer(Tagger):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
extend (bool): Whether to extend existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#init
|
||||
"""
|
||||
|
@ -129,10 +141,11 @@ class Morphologizer(Tagger):
|
|||
}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
"""RETURNS (Tuple[str]): The labels currently added to the component."""
|
||||
"""RETURNS (Iterable[str]): The labels currently added to the component."""
|
||||
return tuple(self.cfg["labels_morph"].keys())
|
||||
|
||||
@property
|
||||
|
@ -156,7 +169,7 @@ class Morphologizer(Tagger):
|
|||
# normalize label
|
||||
norm_label = self.vocab.morphology.normalize_features(label)
|
||||
# extract separate POS and morph tags
|
||||
label_dict = Morphology.feats_to_dict(label)
|
||||
label_dict = Morphology.feats_to_dict(label, sort_values=False)
|
||||
pos = label_dict.get(self.POS_FEAT, "")
|
||||
if self.POS_FEAT in label_dict:
|
||||
label_dict.pop(self.POS_FEAT)
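Several hunks in this file add sort_values=False to Morphology.feats_to_dict, presumably so feature values are kept as provided instead of being re-sorted. For reference, the helper itself just splits a FEATS string into a dict; a tiny sketch (the keyword is new in this branch, so it is not passed here):

from spacy.morphology import Morphology

print(Morphology.feats_to_dict("Case=Nom|Number=Sing"))
# {'Case': 'Nom', 'Number': 'Sing'}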
|
||||
|
@ -194,7 +207,7 @@ class Morphologizer(Tagger):
|
|||
continue
|
||||
morph = str(token.morph)
|
||||
# create and add the combined morph+POS label
|
||||
morph_dict = Morphology.feats_to_dict(morph)
|
||||
morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
|
||||
if pos:
|
||||
morph_dict[self.POS_FEAT] = pos
|
||||
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
|
||||
|
@ -211,7 +224,7 @@ class Morphologizer(Tagger):
|
|||
for i, token in enumerate(example.reference):
|
||||
pos = token.pos_
|
||||
morph = str(token.morph)
|
||||
morph_dict = Morphology.feats_to_dict(morph)
|
||||
morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
|
||||
if pos:
|
||||
morph_dict[self.POS_FEAT] = pos
|
||||
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
|
||||
|
@ -222,39 +235,47 @@ class Morphologizer(Tagger):
|
|||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
|
||||
activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#set_annotations
|
||||
"""
|
||||
batch_tag_ids = activations["label_ids"]
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
cdef bint extend = self.cfg["extend"]
|
||||
labels = self.labels
|
||||
|
||||
# We require random access for the upcoming ops, so we need
|
||||
# to allocate a compatible container out of the iterable.
|
||||
labels = tuple(self.labels)
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
doc.activations[self.name][act_name] = acts[i]
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
morph = labels[tag_id]
|
||||
morph = labels[int(tag_id)]
|
||||
# set morph
|
||||
if doc.c[j].morph == 0 or overwrite or extend:
|
||||
if overwrite and extend:
|
||||
# morphologizer morph overwrites any existing features
|
||||
# while extending
|
||||
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
|
||||
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
|
||||
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)
|
||||
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False))
|
||||
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
|
||||
elif extend:
|
||||
# existing features are preserved and any new features
|
||||
# are added
|
||||
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
|
||||
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
|
||||
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)
|
||||
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False))
|
||||
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
|
||||
else:
|
||||
# clobber
|
||||
|
@ -296,7 +317,7 @@ class Morphologizer(Tagger):
|
|||
label = None
|
||||
# Otherwise, generate the combined label
|
||||
else:
|
||||
label_dict = Morphology.feats_to_dict(morph)
|
||||
label_dict = Morphology.feats_to_dict(morph, sort_values=False)
|
||||
if pos:
|
||||
label_dict[self.POS_FEAT] = pos
|
||||
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
|
||||
|
|
|
@ -1,215 +0,0 @@
|
|||
# cython: infer_types=True, binding=True
|
||||
from typing import Optional
|
||||
|
||||
import numpy
|
||||
from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
|
||||
|
||||
from ..attrs import ID
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..training import validate_examples
|
||||
from .tagger import Tagger
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.MultiTask.v1"
|
||||
maxout_pieces = 3
|
||||
token_vector_width = 96
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
maxout_pieces = 2
|
||||
subword_features = true
|
||||
"""
|
||||
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"nn_labeller",
|
||||
default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
|
||||
)
|
||||
def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
|
||||
return MultitaskObjective(nlp.vocab, model, name)
|
||||
|
||||
|
||||
class MultitaskObjective(Tagger):
|
||||
"""Experimental: Assist training of a parser or tagger, by training a
|
||||
side-objective.
|
||||
"""
|
||||
|
||||
def __init__(self, vocab, model, name="nn_labeller", *, target):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
if target == "dep":
|
||||
self.make_label = self.make_dep
|
||||
elif target == "tag":
|
||||
self.make_label = self.make_tag
|
||||
elif target == "ent":
|
||||
self.make_label = self.make_ent
|
||||
elif target == "dep_tag_offset":
|
||||
self.make_label = self.make_dep_tag_offset
|
||||
elif target == "ent_tag":
|
||||
self.make_label = self.make_ent_tag
|
||||
elif target == "sent_start":
|
||||
self.make_label = self.make_sent_start
|
||||
elif hasattr(target, "__call__"):
|
||||
self.make_label = target
|
||||
else:
|
||||
raise ValueError(Errors.E016)
|
||||
cfg = {"labels": {}, "target": target}
|
||||
self.cfg = dict(cfg)
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
return self.cfg.setdefault("labels", {})
|
||||
|
||||
@labels.setter
|
||||
def labels(self, value):
|
||||
self.cfg["labels"] = value
|
||||
|
||||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def initialize(self, get_examples, nlp=None, labels=None):
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
if labels is not None:
|
||||
self.labels = labels
|
||||
else:
|
||||
for example in get_examples():
|
||||
for token in example.y:
|
||||
label = self.make_label(token)
|
||||
if label is not None and label not in self.labels:
|
||||
self.labels[label] = len(self.labels)
|
||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||
|
||||
def predict(self, docs):
|
||||
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||
scores = self.model.get_ref("softmax")(tokvecs)
|
||||
return tokvecs, scores
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
||||
guesses = scores.argmax(axis=1)
|
||||
for i, eg in enumerate(examples):
|
||||
# Handles alignment for tokenization differences
|
||||
_doc_annots = eg.get_aligned() # TODO
|
||||
for j in range(len(eg.predicted)):
|
||||
tok_annots = {key: values[j] for key, values in tok_annots.items()}
|
||||
label = self.make_label(j, tok_annots)
|
||||
if label is None or label not in self.labels:
|
||||
correct[idx] = guesses[idx]
|
||||
else:
|
||||
correct[idx] = self.labels[label]
|
||||
idx += 1
|
||||
correct = self.model.ops.xp.array(correct, dtype="i")
|
||||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
||||
loss = (d_scores**2).sum()
|
||||
return float(loss), d_scores
|
||||
|
||||
@staticmethod
|
||||
def make_dep(token):
|
||||
return token.dep_
|
||||
|
||||
@staticmethod
|
||||
def make_tag(token):
|
||||
return token.tag_
|
||||
|
||||
@staticmethod
|
||||
def make_ent(token):
|
||||
if token.ent_iob_ == "O":
|
||||
return "O"
|
||||
else:
|
||||
return token.ent_iob_ + "-" + token.ent_type_
|
||||
|
||||
@staticmethod
|
||||
def make_dep_tag_offset(token):
|
||||
dep = token.dep_
|
||||
tag = token.tag_
|
||||
offset = token.head.i - token.i
|
||||
offset = min(offset, 2)
|
||||
offset = max(offset, -2)
|
||||
return f"{dep}-{tag}:{offset}"
|
||||
|
||||
@staticmethod
|
||||
def make_ent_tag(token):
|
||||
if token.ent_iob_ == "O":
|
||||
ent = "O"
|
||||
else:
|
||||
ent = token.ent_iob_ + "-" + token.ent_type_
|
||||
tag = token.tag_
|
||||
return f"{tag}-{ent}"
|
||||
|
||||
@staticmethod
|
||||
def make_sent_start(token):
|
||||
"""A multi-task objective for representing sentence boundaries,
|
||||
using BILU scheme. (O is impossible)
|
||||
"""
|
||||
if token.is_sent_start and token.is_sent_end:
|
||||
return "U-SENT"
|
||||
elif token.is_sent_start:
|
||||
return "B-SENT"
|
||||
else:
|
||||
return "I-SENT"
|
||||
|
||||
|
||||
class ClozeMultitask(TrainablePipe):
|
||||
def __init__(self, vocab, model, **cfg):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.cfg = cfg
|
||||
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
|
||||
|
||||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def initialize(self, get_examples, nlp=None):
|
||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||
self.model.output_layer.initialize(X)
|
||||
|
||||
def predict(self, docs):
|
||||
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||
vectors = self.model.get_ref("output_layer")(tokvecs)
|
||||
return tokvecs, vectors
|
||||
|
||||
def get_loss(self, examples, vectors, prediction):
|
||||
validate_examples(examples, "ClozeMultitask.get_loss")
|
||||
# The simplest way to implement this would be to vstack the
|
||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
||||
# and look them up all at once. This prevents data copying.
|
||||
ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
|
||||
target = vectors[ids]
|
||||
gradient = self.distance.get_grad(prediction, target)
|
||||
loss = self.distance.get_loss(prediction, target)
|
||||
return float(loss), gradient
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||
pass
|
||||
|
||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
set_dropout_rate(self.model, drop)
|
||||
validate_examples(examples, "ClozeMultitask.rehearse")
|
||||
predictions, bp_predictions = self.model.begin_update()
|
||||
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
|
||||
bp_predictions(d_predictions)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
if losses is not None:
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
def add_label(self, label):
|
||||
raise NotImplementedError
|
|
@ -4,6 +4,11 @@ from typing import Callable, Optional
|
|||
|
||||
from thinc.api import Config, Model
|
||||
|
||||
from ..language import Language
|
||||
from ..scorer import get_ner_prf
|
||||
from ..training import remove_bilu_prefix
|
||||
from ..util import registry
|
||||
from ._parser_internals.ner import BiluoPushDown
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
|
||||
from ._parser_internals.ner cimport BiluoPushDown
|
||||
|
@ -245,8 +250,11 @@ cdef class EntityRecognizer(Parser):
|
|||
def labels(self):
|
||||
# Get the labels from the model by looking at the available moves, e.g.
|
||||
# B-PERSON, I-PERSON, L-PERSON, U-PERSON
|
||||
labels = set(remove_bilu_prefix(move) for move in self.move_names
|
||||
if move[0] in ("B", "I", "L", "U"))
|
||||
labels = set(
|
||||
remove_bilu_prefix(move)
|
||||
for move in self.move_names
|
||||
if move[0] in ("B", "I", "L", "U")
|
||||
)
|
||||
return tuple(sorted(labels))
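The rewritten labels property above is only reformatted; it still strips the BILUO action prefix from each move name and de-duplicates the result. A tiny sketch with invented move names:

from spacy.training import remove_bilu_prefix

move_names = ["B-PERSON", "I-PERSON", "L-PERSON", "U-PERSON", "B-ORG", "O"]
labels = set(
    remove_bilu_prefix(move) for move in move_names if move[0] in ("B", "I", "L", "U")
)
print(tuple(sorted(labels)))  # ('ORG', 'PERSON')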
|
||||
|
||||
def scored_ents(self, beams):
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
# cython: infer_types=True, binding=True
|
||||
import warnings
|
||||
from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
|
||||
|
||||
import srsly
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from ..errors import Errors, Warnings
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..training import Example
|
||||
from ..util import raise_error
|
||||
|
@ -21,13 +20,6 @@ cdef class Pipe:
|
|||
DOCS: https://spacy.io/api/pipe
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def __init_subclass__(cls, **kwargs):
|
||||
"""Raise a warning if an inheriting class implements 'begin_training'
|
||||
(from v2) instead of the new 'initialize' method (from v3)"""
|
||||
if hasattr(cls, "begin_training"):
|
||||
warnings.warn(Warnings.W088.format(name=cls.__name__))
|
||||
|
||||
def __call__(self, Doc doc) -> Doc:
|
||||
"""Apply the pipe to one document. The document is modified in place,
|
||||
and returned. This usually happens under the hood when the nlp object
|
||||
|
@ -96,6 +88,10 @@ cdef class Pipe:
|
|||
return self.scorer(examples, **scorer_kwargs)
|
||||
return {}
|
||||
|
||||
@property
|
||||
def is_distillable(self) -> bool:
|
||||
return False
|
||||
|
||||
@property
|
||||
def is_trainable(self) -> bool:
|
||||
return False
|
||||
|
|
|
@ -10,9 +10,6 @@ from ..language import Language
|
|||
from .pipe import Pipe
|
||||
from .senter import senter_score
|
||||
|
||||
# see #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"sentencizer",
|
||||
|
@ -55,13 +52,14 @@ class Sentencizer(Pipe):
|
|||
name="sentencizer",
|
||||
*,
|
||||
punct_chars=None,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
overwrite=False,
|
||||
scorer=senter_score,
|
||||
):
|
||||
"""Initialize the sentencizer.
|
||||
|
||||
punct_chars (list): Punctuation characters to split on. Will be
|
||||
serialized with the nlp object.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the attribute "sents".
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, binding=True
|
||||
from itertools import islice
|
||||
from typing import Callable, Optional
|
||||
from typing import Callable, Iterable, Optional
|
||||
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
|
||||
|
@ -12,10 +12,7 @@ from ..language import Language
|
|||
from ..scorer import Scorer
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
from .tagger import Tagger
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
from .tagger import ActivationsT, Tagger
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
@ -37,11 +34,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"senter",
|
||||
assigns=["token.is_sent_start"],
|
||||
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
||||
default_config={
|
||||
"model": DEFAULT_SENTER_MODEL,
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||
)
|
||||
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
|
||||
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
|
||||
def make_senter(nlp: Language,
|
||||
name: str,
|
||||
model: Model,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool):
|
||||
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
|
||||
|
||||
|
||||
def senter_score(examples, **kwargs):
|
||||
|
@ -69,8 +76,9 @@ class SentenceRecognizer(Tagger):
|
|||
model,
|
||||
name="senter",
|
||||
*,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
overwrite=False,
|
||||
scorer=senter_score,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
"""Initialize a sentence recognizer.
|
||||
|
||||
|
@ -78,8 +86,10 @@ class SentenceRecognizer(Tagger):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the attribute "sents".
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#init
|
||||
"""
|
||||
|
@ -89,6 +99,7 @@ class SentenceRecognizer(Tagger):
|
|||
self._rehearsal_model = None
|
||||
self.cfg = {"overwrite": overwrite}
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -106,19 +117,24 @@ class SentenceRecognizer(Tagger):
|
|||
def label_data(self):
|
||||
return None
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
|
||||
activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
|
||||
"""
|
||||
batch_tag_ids = activations["label_ids"]
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
doc.activations[self.name][act_name] = acts[i]
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
|
|
|
@ -22,7 +22,7 @@ from ..errors import Errors, Warnings
|
|||
from ..language import Language
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..matcher.levenshtein import levenshtein_compare
|
||||
from ..scorer import Scorer
|
||||
from ..scorer import Scorer, get_ner_prf
|
||||
from ..tokens import Doc, Span
|
||||
from ..training import Example
|
||||
from ..util import SimpleFrozenList, ensure_path, registry
|
||||
|
@ -33,7 +33,7 @@ DEFAULT_SPANS_KEY = "ruler"
|
|||
|
||||
|
||||
@Language.factory(
|
||||
"future_entity_ruler",
|
||||
"entity_ruler",
|
||||
assigns=["doc.ents"],
|
||||
default_config={
|
||||
"phrase_matcher_attr": None,
|
||||
|
@ -79,6 +79,15 @@ def make_entity_ruler(
|
|||
)
|
||||
|
||||
|
||||
def entity_ruler_score(examples, **kwargs):
|
||||
return get_ner_prf(examples)
|
||||
|
||||
|
||||
@registry.scorers("spacy.entity_ruler_scorer.v1")
|
||||
def make_entity_ruler_scorer():
|
||||
return entity_ruler_score
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"span_ruler",
|
||||
assigns=["doc.spans"],
|
||||
|
@ -136,7 +145,7 @@ def prioritize_new_ents_filter(
|
|||
) -> List[Span]:
|
||||
"""Merge entities and spans into one list without overlaps by allowing
|
||||
spans to overwrite any entities that they overlap with. Intended to
|
||||
replicate the overwrite_ents=True behavior from the EntityRuler.
|
||||
replicate the overwrite_ents=True behavior from the v3 EntityRuler.
|
||||
|
||||
entities (Iterable[Span]): The entities, already filtered for overlaps.
|
||||
spans (Iterable[Span]): The spans to merge, may contain overlaps.
|
||||
|
@ -167,7 +176,7 @@ def prioritize_existing_ents_filter(
|
|||
) -> List[Span]:
|
||||
"""Merge entities and spans into one list without overlaps by prioritizing
|
||||
existing entities. Intended to replicate the overwrite_ents=False behavior
|
||||
from the EntityRuler.
|
||||
from the v3 EntityRuler.
|
||||
|
||||
entities (Iterable[Span]): The entities, already filtered for overlaps.
|
||||
spans (Iterable[Span]): The spans to merge, may contain overlaps.
|
||||
|
|
|
@ -1,12 +1,23 @@
|
|||
from dataclasses import dataclass
|
||||
from functools import partial
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Protocol,
|
||||
Tuple,
|
||||
Union,
|
||||
cast,
|
||||
runtime_checkable,
|
||||
)
|
||||
|
||||
import numpy
|
||||
from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
|
||||
from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
|
||||
|
||||
from ..compat import Protocol, runtime_checkable
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..scorer import Scorer
|
||||
|
@ -16,6 +27,9 @@ from ..util import registry
|
|||
from ..vocab import Vocab
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
||||
ActivationsT = Dict[str, Union[Floats2d, Ragged]]
|
||||
|
||||
|
||||
spancat_default_config = """
|
||||
[model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
@ -170,6 +184,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
|
|||
"model": DEFAULT_SPANCAT_MODEL,
|
||||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||
)
|
||||
|
@ -182,6 +197,7 @@ def make_spancat(
|
|||
scorer: Optional[Callable],
|
||||
threshold: float,
|
||||
max_positive: Optional[int],
|
||||
save_activations: bool,
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component and configure it for multi-label
|
||||
classification to be able to assign multiple labels for each span.
|
||||
|
@ -209,6 +225,7 @@ def make_spancat(
|
|||
0.5.
|
||||
max_positive (Optional[int]): Maximum number of labels to consider positive
|
||||
per span. Defaults to None, indicating no limit.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
return SpanCategorizer(
|
||||
nlp.vocab,
|
||||
|
@ -222,6 +239,7 @@ def make_spancat(
|
|||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
add_negative_label=False,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@ -235,6 +253,7 @@ def make_spancat(
|
|||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||
"allow_overlap": True,
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||
)
|
||||
|
@ -247,6 +266,7 @@ def make_spancat_singlelabel(
|
|||
negative_weight: float,
|
||||
allow_overlap: bool,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component and configure it for multi-class
|
||||
classification. With this configuration each span can get at most one
|
||||
|
@ -274,6 +294,7 @@ def make_spancat_singlelabel(
|
|||
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||
higher assigned label scores.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
return SpanCategorizer(
|
||||
nlp.vocab,
|
||||
|
@ -287,6 +308,7 @@ def make_spancat_singlelabel(
|
|||
add_negative_label=True,
|
||||
threshold=None,
|
||||
scorer=scorer,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
|
@ -349,6 +371,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
max_positive: Optional[int] = None,
|
||||
threshold: Optional[float] = 0.5,
|
||||
scorer: Optional[Callable] = spancat_score,
|
||||
save_activations: bool = False,
|
||||
) -> None:
|
||||
"""Initialize the multi-label or multi-class span categorizer.
|
||||
|
||||
|
@ -398,6 +421,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
self.add_negative_label = add_negative_label
|
||||
if not allow_overlap and max_positive is not None and max_positive > 1:
|
||||
raise ValueError(Errors.E1051.format(max_positive=max_positive))
|
||||
|
@ -479,7 +503,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
else:
|
||||
return None
|
||||
|
||||
def predict(self, docs: Iterable[Doc]):
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
|
@ -492,7 +516,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
scores = self.model.ops.alloc2f(0, 0)
|
||||
else:
|
||||
scores = self.model.predict((docs, indices)) # type: ignore
|
||||
return indices, scores
|
||||
return {"indices": indices, "scores": scores}
|
||||
|
||||
def set_candidates(
|
||||
self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
|
||||
|
@ -512,18 +536,27 @@ class SpanCategorizer(TrainablePipe):
|
|||
for index in candidates.dataXd:
|
||||
doc.spans[candidates_key].append(doc[index[0] : index[1]])
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
|
||||
"""Modify a batch of Doc objects, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
scores: The scores to set, produced by SpanCategorizer.predict.
|
||||
activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
||||
"""
|
||||
indices, scores = indices_scores
|
||||
indices = activations["indices"]
|
||||
assert isinstance(indices, Ragged)
|
||||
scores = cast(Floats2d, activations["scores"])
|
||||
offset = 0
|
||||
for i, doc in enumerate(docs):
|
||||
indices_i = indices[i].dataXd
|
||||
indices_i = cast(Ints2d, indices[i].dataXd)
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
doc.activations[self.name]["indices"] = indices_i
|
||||
doc.activations[self.name]["scores"] = scores[
|
||||
offset : offset + indices.lengths[i]
|
||||
]
|
||||
|
||||
allow_overlap = cast(bool, self.cfg["allow_overlap"])
|
||||
if self.cfg["max_positive"] == 1:
|
||||
doc.spans[self.key] = self._make_span_group_singlelabel(
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
# cython: infer_types=True, binding=True
|
||||
from itertools import islice
|
||||
from typing import Callable, Optional
|
||||
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import numpy
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
|
@ -15,8 +16,7 @@ from ..training import validate_examples, validate_get_examples
|
|||
from ..util import registry
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
@ -38,7 +38,14 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"tagger",
|
||||
assigns=["token.tag"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
|
||||
default_config={
|
||||
"model": DEFAULT_TAGGER_MODEL,
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
|
||||
"neg_prefix": "!",
|
||||
"label_smoothing": 0.0,
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={"tag_acc": 1.0},
|
||||
)
|
||||
def make_tagger(
|
||||
|
@ -49,6 +56,7 @@ def make_tagger(
|
|||
scorer: Optional[Callable],
|
||||
neg_prefix: str,
|
||||
label_smoothing: float,
|
||||
save_activations: bool,
|
||||
):
|
||||
"""Construct a part-of-speech tagger component.
|
||||
|
||||
|
@ -57,7 +65,8 @@ def make_tagger(
|
|||
in size, and be normalized as probabilities (all scores between 0 and 1,
|
||||
with the rows summing to 1).
|
||||
"""
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
|
||||
label_smoothing=label_smoothing, save_activations=save_activations)
|
||||
|
||||
|
||||
def tagger_score(examples, **kwargs):
|
||||
|
@ -80,10 +89,11 @@ class Tagger(TrainablePipe):
|
|||
model,
|
||||
name="tagger",
|
||||
*,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
overwrite=False,
|
||||
scorer=tagger_score,
|
||||
neg_prefix="!",
|
||||
label_smoothing=0.0,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
"""Initialize a part-of-speech tagger.
|
||||
|
||||
|
@ -91,8 +101,10 @@ class Tagger(TrainablePipe):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attribute "tag".
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#init
|
||||
"""
|
||||
|
@ -100,9 +112,15 @@ class Tagger(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing}
|
||||
cfg = {
|
||||
"labels": [],
|
||||
"overwrite": overwrite,
|
||||
"neg_prefix": neg_prefix,
|
||||
"label_smoothing": label_smoothing
|
||||
}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -121,7 +139,7 @@ class Tagger(TrainablePipe):
|
|||
"""Data about the labels currently added to the component."""
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
def predict(self, docs):
|
||||
def predict(self, docs) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
|
@ -134,12 +152,12 @@ class Tagger(TrainablePipe):
|
|||
n_labels = len(self.labels)
|
||||
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
|
||||
assert len(guesses) == len(docs)
|
||||
return guesses
|
||||
return {"probabilities": guesses, "label_ids": guesses}
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == len(docs), (len(scores), len(docs))
|
||||
guesses = self._scores2guesses(scores)
|
||||
assert len(guesses) == len(docs)
|
||||
return guesses
|
||||
return {"probabilities": scores, "label_ids": guesses}
|
||||
|
||||
def _scores2guesses(self, scores):
|
||||
guesses = []
|
||||
|
@ -150,20 +168,25 @@ class Tagger(TrainablePipe):
|
|||
guesses.append(doc_guesses)
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
batch_tag_ids: The IDs to set, produced by Tagger.predict.
|
||||
activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#set_annotations
|
||||
"""
|
||||
batch_tag_ids = activations["label_ids"]
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
labels = self.labels
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
for act_name, acts in activations.items():
|
||||
doc.activations[self.name][act_name] = acts[i]
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
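When save_activations is enabled, set_annotations above copies the raw model outputs onto the Doc before the usual token attributes are written. A rough usage sketch; it assumes a trained pipeline on this v4 dev branch (where Doc.activations exists), and the model name is hypothetical:

import spacy

nlp = spacy.load("en_core_web_sm")  # hypothetical trained pipeline containing a tagger
# Equivalent to setting save_activations = true under [components.tagger] in the config.
nlp.get_pipe("tagger").save_activations = True

doc = nlp("This is a sentence.")
acts = doc.activations["tagger"]
print(sorted(acts.keys()))  # ['label_ids', 'probabilities'], matching predict() above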
|
||||
|
@ -219,7 +242,6 @@ class Tagger(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger#rehearse
|
||||
"""
|
||||
loss_func = SequenceCategoricalCrossentropy()
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
@ -233,12 +255,32 @@ class Tagger(TrainablePipe):
|
|||
set_dropout_rate(self.model, drop)
|
||||
tag_scores, bp_tag_scores = self.model.begin_update(docs)
|
||||
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
|
||||
grads, loss = loss_func(tag_scores, tutor_tag_scores)
|
||||
loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
|
||||
bp_tag_scores(grads)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
def get_teacher_student_loss(
|
||||
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
|
||||
"""
|
||||
loss_func = SequenceCategoricalCrossentropy(normalize=False)
|
||||
d_scores, loss = loss_func(student_scores, teacher_scores)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
return float(loss), d_scores
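get_teacher_student_loss above scores the student's output directly against the teacher's predicted distribution rather than against gold labels. A small synthetic sketch of the same call pattern; shapes and values are invented and it assumes the thinc version pinned by this branch:

import numpy as np
from thinc.api import SequenceCategoricalCrossentropy

# One document, two tokens, two tags; each row is a per-token distribution.
teacher_scores = [np.asarray([[0.9, 0.1], [0.2, 0.8]], dtype="float32")]
student_scores = [np.asarray([[0.6, 0.4], [0.5, 0.5]], dtype="float32")]

loss_func = SequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
# d_scores mirrors student_scores and points the student towards the teacher's
# distribution; loss is the summed cross-entropy, returned to the caller as a float.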
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
"""Find the loss and gradient of loss for the batch of documents and
|
||||
their predicted scores.
|
||||
|
@ -250,7 +292,12 @@ class Tagger(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/tagger#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Tagger.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
|
||||
loss_func = SequenceCategoricalCrossentropy(
|
||||
names=self.labels,
|
||||
normalize=False,
|
||||
neg_prefix=self.cfg["neg_prefix"],
|
||||
label_smoothing=self.cfg["label_smoothing"]
|
||||
)
|
||||
# Convert empty tag "" to missing value None so that both misaligned
|
||||
# tokens and tokens with missing annotation have the default missing
|
||||
# value None.
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from itertools import islice
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import numpy
|
||||
from thinc.api import Config, Model, Optimizer, get_array_module, set_dropout_rate
|
||||
|
@ -14,6 +14,9 @@ from ..util import registry
|
|||
from ..vocab import Vocab
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
||||
ActivationsT = Dict[str, Floats2d]
|
||||
|
||||
|
||||
single_label_default_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
|
@ -81,6 +84,7 @@ subword_features = true
|
|||
"threshold": 0.0,
|
||||
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
|
@ -101,6 +105,7 @@ def make_textcat(
|
|||
model: Model[List[Doc], List[Floats2d]],
|
||||
threshold: float,
|
||||
scorer: Optional[Callable],
|
||||
save_activations: bool,
|
||||
) -> "TextCategorizer":
|
||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels are considered
|
||||
|
@ -110,8 +115,16 @@ def make_textcat(
|
|||
scores for each category.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
|
||||
return TextCategorizer(
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
save_activations=save_activations,
|
||||
)
|
||||
|
||||
|
||||
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
|
@ -142,6 +155,7 @@ class TextCategorizer(TrainablePipe):
|
|||
*,
|
||||
threshold: float,
|
||||
scorer: Optional[Callable] = textcat_score,
|
||||
save_activations: bool = False,
|
||||
) -> None:
|
||||
"""Initialize a text categorizer for single-label classification.
|
||||
|
||||
|
@ -167,6 +181,7 @@ class TextCategorizer(TrainablePipe):
|
|||
}
|
||||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
||||
@property
|
||||
def support_missing_values(self):
|
||||
|
@ -191,7 +206,7 @@ class TextCategorizer(TrainablePipe):
|
|||
"""
|
||||
return self.labels # type: ignore[return-value]
|
||||
|
||||
def predict(self, docs: Iterable[Doc]):
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
|
@ -204,12 +219,12 @@ class TextCategorizer(TrainablePipe):
|
|||
tensors = [doc.tensor for doc in docs]
|
||||
xp = self.model.ops.xp
|
||||
scores = xp.zeros((len(list(docs)), len(self.labels)))
|
||||
return scores
|
||||
return {"probabilities": scores}
|
||||
scores = self.model.predict(docs)
|
||||
scores = self.model.ops.asarray(scores)
|
||||
return scores
|
||||
return {"probabilities": scores}
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], scores) -> None:
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
|
||||
"""Modify a batch of Doc objects, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
|
@ -217,9 +232,13 @@ class TextCategorizer(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/textcategorizer#set_annotations
|
||||
"""
|
||||
probs = activations["probabilities"]
|
||||
for i, doc in enumerate(docs):
|
||||
if self.save_activations:
|
||||
doc.activations[self.name] = {}
|
||||
doc.activations[self.name]["probabilities"] = probs[i]
|
||||
for j, label in enumerate(self.labels):
|
||||
doc.cats[label] = float(scores[i, j])
|
||||
doc.cats[label] = float(probs[i, j])
|
||||
|
||||
def update(
|
||||
self,
|
||||
|
|
|
@@ -1,5 +1,5 @@
from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

from thinc.api import Config, Model
from thinc.types import Floats2d

@@ -79,6 +79,7 @@ subword_features = true
        "threshold": 0.5,
        "model": DEFAULT_MULTI_TEXTCAT_MODEL,
        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
        "save_activations": False,
    },
    default_score_weights={
        "cats_score": 1.0,

@@ -99,8 +100,9 @@ def make_multilabel_textcat(
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
    save_activations: bool,
) -> "MultiLabel_TextCategorizer":
    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
    """Create a TextCategorizer component. The text categorizer predicts categories
    over a whole document. It can learn one or more labels, and the labels are considered
    to be non-mutually exclusive, which means that there can be zero or more labels
    per doc).

@@ -111,7 +113,12 @@ def make_multilabel_textcat(
    scorer (Optional[Callable]): The scoring method.
    """
    return MultiLabel_TextCategorizer(
        nlp.vocab, model, name, threshold=threshold, scorer=scorer
        nlp.vocab,
        model,
        name,
        threshold=threshold,
        scorer=scorer,
        save_activations=save_activations,
    )


@@ -143,6 +150,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
        *,
        threshold: float,
        scorer: Optional[Callable] = textcat_multilabel_score,
        save_activations: bool = False,
    ) -> None:
        """Initialize a text categorizer for multi-label classification.

@@ -152,6 +160,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
            losses during training.
        threshold (float): Cutoff to consider a prediction "positive".
        scorer (Optional[Callable]): The scoring method.
        save_activations (bool): save model activations in Doc when annotating.

        DOCS: https://spacy.io/api/textcategorizer#init
        """

@@ -162,6 +171,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
        cfg = {"labels": [], "threshold": threshold}
        self.cfg = dict(cfg)
        self.scorer = scorer
        self.save_activations = save_activations

    @property
    def support_missing_values(self):
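The same switch can be passed through the factory config of the multi-label component. A small sketch (not part of the diff), using a freshly created pipeline:

import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe(
    "textcat_multilabel",
    config={"threshold": 0.5, "save_activations": True},
)
textcat.add_label("SPORTS")
textcat.add_label("POLITICS")
nlp.initialize()

doc = nlp("The match was debated in parliament.")
# Activations are stored under the component's name when save_activations is set.
print(doc.activations["textcat_multilabel"]["probabilities"])
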
@@ -1,7 +1,8 @@
from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple

from thinc.api import Config, Model, Optimizer, set_dropout_rate
from thinc.types import Floats2d

from ..errors import Errors
from ..language import Language

@@ -158,39 +159,9 @@ class Tok2Vec(TrainablePipe):

        DOCS: https://spacy.io/api/tok2vec#update
        """
        if losses is None:
            losses = {}
        validate_examples(examples, "Tok2Vec.update")
        docs = [eg.predicted for eg in examples]
        set_dropout_rate(self.model, drop)
        tokvecs, bp_tokvecs = self.model.begin_update(docs)
        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
        losses.setdefault(self.name, 0.0)

        def accumulate_gradient(one_d_tokvecs):
            """Accumulate tok2vec loss and gradient. This is passed as a callback
            to all but the last listener. Only the last one does the backprop.
            """
            nonlocal d_tokvecs
            for i in range(len(one_d_tokvecs)):
                d_tokvecs[i] += one_d_tokvecs[i]
                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]

        def backprop(one_d_tokvecs):
            """Callback to actually do the backprop. Passed to last listener."""
            accumulate_gradient(one_d_tokvecs)
            d_docs = bp_tokvecs(d_tokvecs)
            if sgd is not None:
                self.finish_update(sgd)
            return d_docs

        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners[:-1]:
            listener.receive(batch_id, tokvecs, accumulate_gradient)
        if self.listeners:
            self.listeners[-1].receive(batch_id, tokvecs, backprop)
        return losses
        return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses)

    def get_loss(self, examples, scores) -> None:
        pass
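The accumulate_gradient/backprop callbacks that update now delegates to treat the summed squared gradient as the tok2vec "loss". A tiny numeric sketch of that bookkeeping (plain numpy stands in for self.model.ops, and the gradient arrays are made up):

import numpy as np

# Pretend gradients sent back by two downstream listeners for one doc.
grads_from_listener_a = [np.array([[0.5, -0.5], [1.0, 0.0]], dtype="f")]
grads_from_listener_b = [np.array([[0.1, 0.2], [-0.3, 0.4]], dtype="f")]

d_tokvecs = [np.zeros((2, 2), dtype="f")]
loss = 0.0
for one_d_tokvecs in (grads_from_listener_a, grads_from_listener_b):
    for i, grad in enumerate(one_d_tokvecs):
        d_tokvecs[i] += grad                # accumulate into the shared buffer
        loss += float((grad ** 2).sum())    # the recorded "loss" is the squared gradient
print(round(loss, 2))   # 1.8
print(d_tokvecs[0])     # combined gradient handed to bp_tokvecs by the last listener
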
@@ -220,6 +191,96 @@ class Tok2Vec(TrainablePipe):
    def add_label(self, label):
        raise NotImplementedError

    def distill(
        self,
        teacher_pipe: Optional["TrainablePipe"],
        examples: Iterable["Example"],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Performs an update of the student pipe's model using the
        student's distillation examples and sets the annotations
        of the teacher's distillation examples using the teacher pipe.

        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
            for prediction.
        examples (Iterable[Example]): Distillation examples. The reference (teacher)
            and predicted (student) docs must have the same number of tokens and the
            same orthography.
        drop (float): dropout rate.
        sgd (Optional[Optimizer]): An optimizer. Will be created via
            create_optimizer if not set.
        losses (Optional[Dict[str, float]]): Optional record of loss during
            distillation.
        RETURNS: The updated losses dictionary.

        DOCS: https://spacy.io/api/tok2vec#distill
        """
        # By default we require a teacher pipe, but there are downstream
        # implementations that don't require a pipe.
        if teacher_pipe is None:
            raise ValueError(Errors.E4002.format(name=self.name))
        teacher_docs = [eg.reference for eg in examples]
        student_docs = [eg.predicted for eg in examples]
        teacher_preds = teacher_pipe.predict(teacher_docs)
        teacher_pipe.set_annotations(teacher_docs, teacher_preds)
        return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses)

    def _update_with_docs(
        self,
        docs: Iterable[Doc],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ):
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        set_dropout_rate(self.model, drop)

        tokvecs, accumulate_gradient, backprop = self._create_backprops(
            docs, losses, sgd=sgd
        )
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners[:-1]:
            listener.receive(batch_id, tokvecs, accumulate_gradient)
        if self.listeners:
            self.listeners[-1].receive(batch_id, tokvecs, backprop)
        return losses

    def _create_backprops(
        self,
        docs: Iterable[Doc],
        losses: Dict[str, float],
        *,
        sgd: Optional[Optimizer] = None,
    ) -> Tuple[Floats2d, Callable, Callable]:
        tokvecs, bp_tokvecs = self.model.begin_update(docs)
        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]

        def accumulate_gradient(one_d_tokvecs):
            """Accumulate tok2vec loss and gradient. This is passed as a callback
            to all but the last listener. Only the last one does the backprop.
            """
            nonlocal d_tokvecs
            for i in range(len(one_d_tokvecs)):
                d_tokvecs[i] += one_d_tokvecs[i]
                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]

        def backprop(one_d_tokvecs):
            """Callback to actually do the backprop. Passed to last listener."""
            accumulate_gradient(one_d_tokvecs)
            d_docs = bp_tokvecs(d_tokvecs)
            if sgd is not None:
                self.finish_update(sgd)
            return d_docs

        return tokvecs, accumulate_gradient, backprop


class Tok2VecListener(Model):
    """A layer that gets fed its answers from an upstream connection,
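A hedged sketch (not from the diff) of driving the new Tok2Vec.distill hook directly. Both pipelines are freshly initialized here just to show the call pattern; in practice the teacher would be a trained pipeline and downstream distillable components would pull gradients through the listeners.

import spacy
from spacy.training import Example

teacher_nlp = spacy.blank("en")
teacher_nlp.add_pipe("tok2vec")
teacher_nlp.initialize()

student_nlp = spacy.blank("en")
student_nlp.add_pipe("tok2vec")
student_nlp.initialize()

texts = ["Distillation keeps the student close to the teacher."]
# Reference (teacher) doc and predicted (student) doc must share tokenization.
examples = [Example(student_nlp.make_doc(t), teacher_nlp.make_doc(t)) for t in texts]

losses = {}
student_nlp.get_pipe("tok2vec").distill(
    teacher_nlp.get_pipe("tok2vec"), examples, losses=losses
)
print(losses)   # {'tok2vec': 0.0} here, since no downstream listeners contribute gradients
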
@@ -7,3 +7,4 @@ cdef class TrainablePipe(Pipe):
    cdef public object model
    cdef public object cfg
    cdef public object scorer
    cdef bint _save_activations
@@ -9,7 +9,7 @@ from ..tokens.doc cimport Doc
from .. import util
from ..errors import Errors
from ..language import Language
from ..training import Example, validate_examples
from ..training import Example, validate_distillation_examples, validate_examples
from ..vocab import Vocab
from .pipe import Pipe, deserialize_config

@@ -55,6 +55,53 @@ cdef class TrainablePipe(Pipe):
        except Exception as e:
            error_handler(self.name, self, [doc], e)

    def distill(self,
                teacher_pipe: Optional["TrainablePipe"],
                examples: Iterable["Example"],
                *,
                drop: float = 0.0,
                sgd: Optional[Optimizer] = None,
                losses: Optional[Dict[str, float]] = None
                ) -> Dict[str, float]:
        """Train a pipe (the student) on the predictions of another pipe
        (the teacher). The student is typically trained on the probability
        distribution of the teacher, but details may differ per pipe.

        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
            from.
        examples (Iterable[Example]): Distillation examples. The reference
            (teacher) and predicted (student) docs must have the same number of
            tokens and the same orthography.
        drop (float): dropout rate.
        sgd (Optional[Optimizer]): An optimizer. Will be created via
            create_optimizer if not set.
        losses (Optional[Dict[str, float]]): Optional record of loss during
            distillation.
        RETURNS: The updated losses dictionary.

        DOCS: https://spacy.io/api/pipe#distill
        """
        # By default we require a teacher pipe, but there are downstream
        # implementations that don't require a pipe.
        if teacher_pipe is None:
            raise ValueError(Errors.E4002.format(name=self.name))
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_distillation_examples(examples, "TrainablePipe.distill")
        set_dropout_rate(self.model, drop)
        for node in teacher_pipe.model.walk():
            if node.name == "softmax":
                node.attrs["softmax_normalize"] = True
        teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
        student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
        bp_student_scores(d_scores)
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += loss
        return losses

    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
        """Apply the pipe to a stream of documents. This usually happens under
        the hood when the nlp object is called on a text and all components are
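This default distill implementation is what concrete pipes such as the tagger inherit in this branch once they provide get_teacher_student_loss. A hedged end-to-end sketch (not from the diff) with two throwaway taggers; both are randomly initialized, so the loss value itself is meaningless:

import spacy
from spacy.training import Example

def make_tagger_pipeline():
    nlp = spacy.blank("en")
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("NOUN")
    tagger.add_label("VERB")
    nlp.initialize()
    return nlp

teacher_nlp = make_tagger_pipeline()
student_nlp = make_tagger_pipeline()

examples = [
    Example(student_nlp.make_doc(text), teacher_nlp.make_doc(text))
    for text in ["Dogs bark.", "Cats sleep."]
]

student_tagger = student_nlp.get_pipe("tagger")
optimizer = student_tagger.create_optimizer()
losses = {}
student_tagger.distill(
    teacher_nlp.get_pipe("tagger"), examples, sgd=optimizer, losses=losses
)
print(losses["tagger"])   # distillation loss for this batch
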
@@ -168,6 +215,19 @@ cdef class TrainablePipe(Pipe):
        """
        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))

    def get_teacher_student_loss(self, teacher_scores, student_scores):
        """Calculate the loss and its gradient for a batch of student
        scores, relative to teacher scores.

        teacher_scores: Scores representing the teacher model's predictions.
        student_scores: Scores representing the student model's predictions.

        RETURNS (Tuple[float, float]): The loss and the gradient.

        DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
        """
        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))

    def create_optimizer(self) -> Optimizer:
        """Create an optimizer for the pipeline component.
@@ -204,6 +264,14 @@ cdef class TrainablePipe(Pipe):
        """
        raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))

    @property
    def is_distillable(self) -> bool:
        # Normally a pipe overrides `get_teacher_student_loss` to implement
        # distillation. In more exceptional cases, a pipe can provide its
        # own `distill` implementation. If neither of these methods is
        # overridden, the pipe does not implement distillation.
        return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)

    @property
    def is_trainable(self) -> bool:
        return True

@@ -342,3 +410,11 @@ cdef class TrainablePipe(Pipe):
        deserialize["model"] = load_model
        util.from_disk(path, deserialize, exclude)
        return self

    @property
    def save_activations(self):
        return self._save_activations

    @save_activations.setter
    def save_activations(self, save_activations: bool):
        self._save_activations = save_activations
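A quick introspection sketch (not from the diff) of the is_distillable property added above; the pipeline composition is arbitrary and only chosen for illustration:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")      # provides its own distill implementation
nlp.add_pipe("tagger")       # implements get_teacher_student_loss
nlp.add_pipe("sentencizer")  # rule-based, not a TrainablePipe at all

for name, pipe in nlp.pipeline:
    print(name, getattr(pipe, "is_distillable", False))
# tok2vec True / tagger True / sentencizer False
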
@@ -12,19 +12,10 @@ cdef class Parser(TrainablePipe):
    cdef public object _rehearsal_model
    cdef readonly TransitionSystem moves
    cdef public object _multitasks
    cdef object _cpu_ops

    cdef void _parseC(
        self,
        CBlas cblas,
        StateC** states,
        WeightsC weights,
        SizesC sizes
    ) nogil
    cdef void _parseC(self, CBlas cblas, StateC** states,
                      WeightsC weights, SizesC sizes) nogil

    cdef void c_transition_batch(
        self,
        StateC** states,
        const float* scores,
        int nr_class,
        int batch_size
    ) nogil
    cdef void c_transition_batch(self, StateC** states, const float* scores,
                                 int nr_class, int batch_size) nogil
@@ -2,6 +2,8 @@
# cython: profile=False
from __future__ import print_function

from typing import Dict, Iterable, List, Optional, Tuple

cimport numpy as np
from cymem.cymem cimport Pool

@@ -16,7 +18,18 @@ import random
import numpy
import numpy.random
import srsly
from thinc.api import CupyOps, NumpyOps, set_dropout_rate
from thinc.api import (
    CupyOps,
    NumpyOps,
    Optimizer,
    SequenceCategoricalCrossentropy,
    chain,
    get_ops,
    set_dropout_rate,
    softmax_activation,
    use_ops,
)
from thinc.types import Floats2d

from ..ml.parser_model cimport (
    ActivationsC,

@@ -37,9 +50,22 @@ from .trainable_pipe import TrainablePipe

from ._parser_internals cimport _beam_utils

from ._parser_internals import _beam_utils

from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ._parser_internals cimport _beam_utils
from ._parser_internals.stateclass cimport StateC, StateClass
from ._parser_internals.transition_system cimport Transition
from .trainable_pipe cimport TrainablePipe

from .. import util
from ..errors import Errors
from ..training import validate_examples, validate_get_examples
from ..training import (
    validate_distillation_examples,
    validate_examples,
    validate_get_examples,
)
from ._parser_internals import _beam_utils

NUMPY_OPS = NumpyOps()
@@ -135,6 +161,7 @@ cdef class Parser(TrainablePipe):

        self._rehearsal_model = None
        self.scorer = scorer
        self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops

    def __getnewargs_ex__(self):
        """This allows pickling the Parser and its keyword-only init arguments"""

@@ -214,6 +241,121 @@ cdef class Parser(TrainablePipe):
        # Defined in subclasses, to avoid circular import
        raise NotImplementedError

    def distill(self,
                teacher_pipe: Optional[TrainablePipe],
                examples: Iterable["Example"],
                *,
                drop: float = 0.0,
                sgd: Optional[Optimizer] = None,
                losses: Optional[Dict[str, float]] = None
                ):
        """Train a pipe (the student) on the predictions of another pipe
        (the teacher). The student is trained on the transition probabilities
        of the teacher.

        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
            from.
        examples (Iterable[Example]): Distillation examples. The reference
            (teacher) and predicted (student) docs must have the same number of
            tokens and the same orthography.
        drop (float): dropout rate.
        sgd (Optional[Optimizer]): An optimizer. Will be created via
            create_optimizer if not set.
        losses (Optional[Dict[str, float]]): Optional record of loss during
            distillation.
        RETURNS: The updated losses dictionary.

        DOCS: https://spacy.io/api/dependencyparser#distill
        """
        if teacher_pipe is None:
            raise ValueError(Errors.E4002.format(name=self.name))
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)

        validate_distillation_examples(examples, "TransitionParser.distill")

        set_dropout_rate(self.model, drop)

        student_docs = [eg.predicted for eg in examples]

        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)

        # Add softmax activation, so that we can compute student losses
        # with cross-entropy loss.
        with use_ops("numpy"):
            teacher_model = chain(teacher_step_model, softmax_activation())
            student_model = chain(student_step_model, softmax_activation())

        max_moves = self.cfg["update_with_oracle_cut_size"]
        if max_moves >= 1:
            # Chop sequences into lengths of this many words, to make the
            # batch uniform length. Since we do not have a gold standard
            # sequence, we use the teacher's predictions as the gold
            # standard.
            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
            states = self._init_batch(teacher_step_model, student_docs, max_moves)
        else:
            states = self.moves.init_batch(student_docs)

        loss = 0.0
        n_moves = 0
        while states:
            # We do distillation as follows: (1) for every state, we compute the
            # transition softmax distributions: (2) we backpropagate the error of
            # the student (compared to the teacher) into the student model; (3)
            # for all states, we move to the next state using the student's
            # predictions.
            teacher_scores = teacher_model.predict(states)
            student_scores, backprop = student_model.begin_update(states)
            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
            backprop(d_scores)
            loss += state_loss
            self.transition_states(states, student_scores)
            states = [state for state in states if not state.is_final()]

            # Stop when we reach the maximum number of moves, otherwise we start
            # to process the remainder of cut sequences again.
            if max_moves >= 1 and n_moves >= max_moves:
                break
            n_moves += 1

        backprop_tok2vec(student_docs)

        if sgd is not None:
            self.finish_update(sgd)

        losses[self.name] += loss

        del backprop
        del backprop_tok2vec
        teacher_step_model.clear_memory()
        student_step_model.clear_memory()
        del teacher_model
        del student_model

        return losses

    def get_teacher_student_loss(
        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
        """Calculate the loss and its gradient for a batch of student
        scores, relative to teacher scores.

        teacher_scores: Scores representing the teacher model's predictions.
        student_scores: Scores representing the student model's predictions.

        RETURNS (Tuple[float, float]): The loss and the gradient.

        DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
        """
        loss_func = SequenceCategoricalCrossentropy(normalize=False)
        d_scores, loss = loss_func(student_scores, teacher_scores)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

    def init_multitask_objectives(self, get_examples, pipeline, **cfg):
        """Setup models for secondary objectives, to benefit from multi-task
        learning. This method is intended to be overridden by subclasses.
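A worked sketch (not from the diff) of the teacher-student loss used by the parser's distillation loop above, with made-up softmax outputs for one two-state sequence over three transition classes. It mirrors the SequenceCategoricalCrossentropy call in get_teacher_student_loss:

import numpy as np
from thinc.api import SequenceCategoricalCrossentropy

teacher_scores = [np.asarray([[0.7, 0.2, 0.1],
                              [0.1, 0.8, 0.1]], dtype="f")]
student_scores = [np.asarray([[0.5, 0.3, 0.2],
                              [0.2, 0.6, 0.2]], dtype="f")]

loss_func = SequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
print(float(loss))   # scalar loss for the whole sequence
print(d_scores[0])   # per-state gradient, roughly student minus teacher probabilities
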
@@ -273,12 +415,7 @@ cdef class Parser(TrainablePipe):
    def greedy_parse(self, docs, drop=0.):
        cdef vector[StateC*] states
        cdef StateClass state
        ops = self.model.ops
        cdef CBlas cblas
        if isinstance(ops, CupyOps):
            cblas = NUMPY_OPS.cblas()
        else:
            cblas = ops.cblas()
        cdef CBlas cblas = self._cpu_ops.cblas()
        self._ensure_labels_are_added(docs)
        set_dropout_rate(self.model, drop)
        batch = self.moves.init_batch(docs)

@@ -314,18 +451,16 @@ cdef class Parser(TrainablePipe):
        del model
        return list(batch)

    cdef void _parseC(
        self, CBlas cblas, StateC** states, WeightsC weights, SizesC sizes
    ) nogil:
    cdef void _parseC(self, CBlas cblas, StateC** states,
                      WeightsC weights, SizesC sizes) nogil:
        cdef int i
        cdef vector[StateC*] unfinished
        cdef ActivationsC activations = alloc_activations(sizes)
        while sizes.states >= 1:
            predict_states(cblas, &activations, states, &weights, sizes)
            # Validate actions, argmax, take action.
            self.c_transition_batch(
                states, activations.scores, sizes.classes, sizes.states
            )
            self.c_transition_batch(states, activations.scores,
                                    sizes.classes, sizes.states)
            for i in range(sizes.states):
                if not states[i].is_final():
                    unfinished.push_back(states[i])

@@ -353,13 +488,8 @@ cdef class Parser(TrainablePipe):
        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
        return [state for state in states if not state.c.is_final()]

    cdef void c_transition_batch(
        self,
        StateC** states,
        const float* scores,
        int nr_class,
        int batch_size
    ) nogil:
    cdef void c_transition_batch(self, StateC** states, const float* scores,
                                 int nr_class, int batch_size) nogil:
        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
        with gil:
            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@@ -497,16 +627,8 @@ cdef class Parser(TrainablePipe):
        del tutor
        return losses

    def update_beam(
        self,
        examples,
        *,
        beam_width,
        drop=0.,
        sgd=None,
        losses=None,
        beam_density=0.0
    ):
    def update_beam(self, examples, *, beam_width, drop=0., sgd=None,
                    losses=None, beam_density=0.0):
        states, golds, _ = self.moves.init_gold_batch(examples)
        if not states:
            return losses

@@ -536,9 +658,8 @@ cdef class Parser(TrainablePipe):

        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
        cdef np.ndarray d_scores = numpy.zeros(
            (len(states), self.moves.n_moves), dtype='f', order='C'
        )
        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
                                               dtype='f', order='C')
        c_d_scores = <float*>d_scores.data
        unseen_classes = self.model.attrs["unseen_classes"]
        for i, (state, gold) in enumerate(zip(states, golds)):

@@ -548,9 +669,8 @@ cdef class Parser(TrainablePipe):
            for j in range(self.moves.n_moves):
                if costs[j] <= 0.0 and j in unseen_classes:
                    unseen_classes.remove(j)
            cpu_log_loss(
                c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]
            )
            cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0],
                         d_scores.shape[1])
            c_d_scores += d_scores.shape[1]
        # Note that we don't normalize this. See comment in update() for why.
        if losses is not None:

@@ -652,6 +772,36 @@ cdef class Parser(TrainablePipe):
            raise ValueError(Errors.E149) from None
        return self

    def _init_batch(self, teacher_step_model, docs, max_length):
        """Make a square batch of length equal to the shortest transition
        sequence or a cap. A long
        doc will get multiple states. Let's say we have a doc of length 2*N,
        where N is the shortest doc. We'll make two states, one representing
        long_doc[:N], and another representing long_doc[N:]. In contrast to
        _init_gold_batch, this version uses a teacher model to generate the
        cut sequences."""
        cdef StateClass state
        all_states = self.moves.init_batch(docs)
        states = []
        to_cut = []
        for state, doc in zip(all_states, docs):
            if not state.is_final():
                if len(doc) < max_length:
                    states.append(state)
                else:
                    to_cut.append(state)
        while to_cut:
            states.extend(state.copy() for state in to_cut)
            # Move states forward max_length actions.
            length = 0
            while to_cut and length < max_length:
                teacher_scores = teacher_step_model.predict(to_cut)
                self.transition_states(to_cut, teacher_scores)
                # States that are completed do not need further cutting.
                to_cut = [state for state in to_cut if not state.is_final()]
                length += 1
        return states

    def _init_gold_batch(self, examples, max_length):
        """Make a square batch, of length equal to the shortest transition
        sequence or a cap. A long
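The "square batch" idea in _init_batch and _init_gold_batch can be pictured without any parser internals: long inputs are split into several segments of at most max_length steps so the batch stays roughly uniform. A toy stand-in (pure Python, not spaCy code):

def square_batch(doc_lengths, max_length):
    """Toy illustration: return (doc_index, start, end) segments of at most max_length."""
    segments = []
    for i, length in enumerate(doc_lengths):
        if length < max_length:
            segments.append((i, 0, length))
        else:
            for start in range(0, length, max_length):
                segments.append((i, start, min(start + max_length, length)))
    return segments

print(square_batch([3, 10], max_length=4))
# [(0, 0, 3), (1, 0, 4), (1, 4, 8), (1, 8, 10)]
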
@@ -9,6 +9,7 @@ from typing import (
    Dict,
    Iterable,
    List,
    Literal,
    Optional,
    Tuple,
    Type,

@@ -48,7 +49,6 @@ from thinc.api import ConfigValidationError, Model, Optimizer
from thinc.config import Promise

from .attrs import NAMES
from .compat import Literal
from .lookups import Lookups
from .util import is_cython_func

@@ -181,7 +181,7 @@ def validate_init_settings(

def validate_token_pattern(obj: list) -> List[str]:
    # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
    get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
    get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k
    if isinstance(obj, list):
        converted = []
        for pattern in obj:
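The validate_token_pattern fix above assumes that NAMES now maps integer attribute IDs to their string names (rather than being a positional list), so a membership check is the right test. A small sketch of what the corrected get_key does, under that assumption:

from spacy.attrs import NAMES, ORTH

get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k
print(get_key(ORTH))     # "ORTH" - integer attribute ID resolved to its name
print(get_key("LEMMA"))  # "LEMMA" - strings pass through unchanged
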
@@ -461,6 +461,27 @@ class ConfigSchemaInit(BaseModel):
        arbitrary_types_allowed = True


class ConfigSchemaDistillEmpty(BaseModel):
    class Config:
        extra = "forbid"


class ConfigSchemaDistill(BaseModel):
    # fmt: off
    batcher: Batcher = Field(..., title="Batcher for the training data")
    corpus: StrictStr = Field(..., title="Path in the config to the distillation data")
    dropout: StrictFloat = Field(..., title="Dropout rate")
    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for")
    max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for")
    optimizer: Optimizer = Field(..., title="The optimizer to use")
    student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe")
    # fmt: on

    class Config:
        extra = "forbid"
        arbitrary_types_allowed = True


class ConfigSchema(BaseModel):
    training: ConfigSchemaTraining
    nlp: ConfigSchemaNlp

@@ -468,6 +489,7 @@ class ConfigSchema(BaseModel):
    components: Dict[str, Dict[str, Any]]
    corpora: Dict[str, Reader]
    initialize: ConfigSchemaInit
    distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {}  # type: ignore[assignment]

    class Config:
        extra = "allow"

@@ -479,6 +501,7 @@ CONFIG_SCHEMAS = {
    "training": ConfigSchemaTraining,
    "pretraining": ConfigSchemaPretrain,
    "initialize": ConfigSchemaInit,
    "distillation": ConfigSchemaDistill,
}

# Recommendations for init config workflows
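ConfigSchemaDistill describes the shape of a [distillation] block in the training config. An illustrative sketch of such a block as a plain dict (the values and registered function names below are placeholders, not part of this diff):

distillation_config = {
    "corpus": "corpora.distillation",   # config path to the distillation data
    "dropout": 0.1,
    "max_epochs": 0,
    "max_steps": 20000,
    "student_to_teacher": {"tok2vec": "tok2vec", "tagger": "tagger"},
    "batcher": {"@batchers": "spacy.batch_by_words.v1", "size": 3000},
    "optimizer": {"@optimizers": "Adam.v1"},
}
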
Some files were not shown because too many files have changed in this diff.