Compare commits

...

165 Commits

Author SHA1 Message Date
Matthew Honnibal
f4c8fdfaad Update cli.package for removed spacy.vectors.name attr 2024-09-01 16:43:49 +02:00
Sofie Van Landeghem
818fdb537e
Merge pull request #13490 from svlandeg/feat/update_v4
Update v4 branch with latest from master
2024-05-14 22:41:17 +02:00
svlandeg
e32a394ff0 fix the fix for textcat init functionality 2024-05-14 18:45:51 +02:00
svlandeg
5992e927b9 fix textcat init functionality 2024-05-14 18:38:11 +02:00
svlandeg
c27679f210 Merge branch 'master' into feat/update_v4 2024-05-14 17:42:48 +02:00
Sofie Van Landeghem
287deee02c
remove empty file (#13458) 2024-04-26 10:04:16 +02:00
Daniël de Kok
b2ca7253d2
Document TrainablePipe.save_activations (#13452)
* Document `TrainablePipe.save_activations`

* Fully qualified links

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* prettier

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2024-04-23 09:21:23 +02:00
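A minimal sketch of how the `save_activations` flag documented above can be switched on, assuming the v4 trainable pipes expose it as a factory setting and store results in `Doc.activations` as described in the linked docs:

```python
import spacy

nlp = spacy.blank("en")
# enable activation storage for a trainable pipe (sketch; assumes the factory accepts this setting)
tagger = nlp.add_pipe("tagger", config={"save_activations": True})
# after initialization/training, processing a text keeps the raw activations around:
# doc = nlp("An example sentence")
# doc.activations["tagger"]  # e.g. probabilities and guesses, depending on the pipe
```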
Daniël de Kok
f5918d4353
Update to Thinc 9.0.0 and set version to 4.0.0.dev3 (#13448)
* Update to Thinc 9.0.0 and set version to 4.0.0.dev3

* Set minimum Python version to 3.9
2024-04-22 09:40:55 +02:00
Daniël de Kok
5bd141013b
Remove apple from extras (#13439)
Account for merging of `thinc-apple-ops` into `thinc`.
2024-04-17 13:43:27 +02:00
Daniël de Kok
8696861c8c
Update spacy-curated-transformers docs for spaCy 4 (#13440)
- Update model constructors to v2 and add `dtype` argument.
- Update `PyTorchCheckpointLoader` to `v2`.
- Add `transformer_discriminative.v1`.
2024-04-16 12:06:58 +02:00
Daniël de Kok
fbc14aea45
Add distill subcommand (#13431)
* Add distill subcommand

This subcommand distills a student model from a teacher model.

* Fixes from Sofie

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Type and doc fixes

* Wording

* distill: document missing `-o`

* Wording

* Small fix

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2024-04-11 19:33:46 +02:00
Raphael Mitsch
304b9331e6
Modify EL batching to doc-wise streaming approach (#12367)
* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change the type of the mentions to look up entity candidates for from Iterable[Span] to SpanGroup.

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refactor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Modify EL batching system.

* Update leftover get_candidates() mention in docs.

* Format docs.

* Format.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Updated error code.

* Simplify interface for int/str representations.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename 'alias' to 'mention'.

* Port Candidate and InMemoryCandidate to Cython.

* Remove redundant entry in setup.py.

* Add abstract class check.

* Drop storing mention.

* Update spacy/kb/candidate.pxd

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix entity_id refactoring problems in docstrings.

* Drop unused InMemoryCandidate._entity_hash.

* Update docstrings.

* Move attributes out of Candidate.

* Partially fix alias/mention terminology usage. Convert Candidate to interface.

* Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs().

* Update docstrings related to prior_prob.

* Update alias/mention usage in doc(strings).

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs.

* Update docstrings.

* Fix InMemoryCandidate attribute names.

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update W401 test.

* Update spacy/errors.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use Candidate output type for toy generators in the test suite to mimic best practices

* fix docs

* fix import

* Fix merge leftovers.

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/entitylinker.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb_in_memory.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/inmemorylookupkb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update get_candidates() docstring.

* Reformat imports in entity_linker.py.

* Drop valid_ent_idx_per_doc.

* Update docs.

* Format.

* Simplify doc loop in predict().

* Remove E1044 comment.

* Fix merge errors.

* Format.

* Format.

* Format.

* Fix merge error & tests.

* Format.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Use type alias.

* isort.

* isort.

* Lint.

* Add typedefs.pyx.

* Fix typedef import.

* Fix type aliases.

* Format.

* Update docstring and type usage.

* Add info on get_candidates(), get_candidates_batched().

* Readd get_candidates info to v3 changelog.

* Update website/docs/api/entitylinker.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update factory functions for backwards compatibility.

* Format.

* Ignore mypy error.

* Fix mypy error.

* Format.

* Add test for multiple docs with multiple entities.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: svlandeg <svlandeg@github.com>
2024-04-09 11:39:18 +02:00
Adriane Boyd
afb22ad491
Remove debug data normalization for span analysis (#13203)
* Remove debug data normalization for span analysis

As a result of this normalization, `debug data` could show a user tokens
that do not exist in their data.

* Update spacy/cli/debug_data.py

---------

Co-authored-by: svlandeg <svlandeg@github.com>
2024-02-06 14:14:55 +01:00
Daniël de Kok
1052cba9f3
Merge pull request #13299 from danieldk/copy/master
Sync main with latest changes from master (v3)
2024-02-04 15:40:55 +01:00
Daniël de Kok
2d4067d021 Test if closing explicitly solves recursive lock issues 2024-02-02 11:39:07 +01:00
Daniël de Kok
70e2f2a14a
Update spacy-legacy dependency to 4.0.0.dev1 (#13270)
This release is compatible with the parser refactor backout.
2024-01-25 18:24:22 +01:00
Daniël de Kok
ce9ea9629f
Set version to v4.0.0.dev2 (#13269) 2024-01-25 12:54:23 +01:00
Daniël de Kok
bbf38d4d0f
Merge pull request #13250 from danieldk/maintenance/v4-merge-master-20240119
Merge `master` into `v4`
2024-01-24 19:30:23 +01:00
Daniël de Kok
9e97c730be Fix up requirements test
To account for build dependencies being removed from `setup.cfg`.
2024-01-24 17:18:49 +01:00
Daniël de Kok
36ee709390 Remove setup_requires from setup.cfg 2024-01-24 15:02:02 +01:00
Daniël de Kok
e722284ff4 Construct TextCatEnsemble.v2 using helper function 2024-01-24 14:59:01 +01:00
Daniël de Kok
ce4ea5ffa7 Py_UNICODE is not compatible with 3.12 2024-01-24 13:08:56 +01:00
Daniël de Kok
c621e251b8 Typing fixes 2024-01-24 12:20:01 +01:00
Daniël de Kok
82ef6783a8 Merge remote-tracking branch 'upstream/master' into maintenance/v4-merge-master-20240119 2024-01-24 09:09:01 +01:00
Daniël de Kok
81beaea70e Merge remote-tracking branch 'upstream/master' into maintenance/v4-merge-master-20240119 2024-01-19 12:34:29 +01:00
Daniël de Kok
2891e27421
Merge pull request #13191 from explosion/maintenance/revert-parser-refactor
Revert the parser refactor
2024-01-18 17:06:41 +01:00
Daniël de Kok
9972333ef9 Temporarily xfail local remote storage test 2024-01-17 10:20:40 +01:00
Daniël de Kok
7351f6bbeb Update thinc dependency to 9.0.0.dev4 2024-01-16 15:56:09 +01:00
Daniël de Kok
7718886fa3
TransitionBasedParser.v2 in run example output
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2023-12-21 11:14:35 +01:00
Daniël de Kok
532225b955 Fix parser distillation test seed
The test would sometimes fail. Rather than making the test more reliable by
increasing the number of training iterations, use a known-good seed.
2023-12-21 10:06:28 +01:00
Daniël de Kok
7b689bde44 No need for Literal compat, since we only support >= 3.8 2023-12-21 09:47:38 +01:00
Daniël de Kok
57203fa0fc Fix TransitionBasedParser version in transformer embeddings docs 2023-12-19 09:28:20 +01:00
Daniël de Kok
5e8bafa5bb Bring back W401 2023-12-18 20:17:24 +01:00
Daniël de Kok
9b36729cbd Fix Cython lints 2023-12-18 20:02:15 +01:00
Daniël de Kok
42fe4edfd7 Add distillation tests with max cut size
And fix endless loop when the max cut size is 0 or 1.
2023-12-08 20:38:01 +01:00
Daniël de Kok
e2591cda36 isort 2023-12-08 20:24:09 +01:00
Daniël de Kok
e5ec45cb7e Revert "Merge the parser refactor into v4 (#10940)"
This reverts commit a183db3cef.
2023-12-08 20:23:08 +01:00
Daniël de Kok
05803cfe76 Revert "Reimplement distillation with oracle cut size (#12214)"
This reverts commit e27c60a702.
2023-12-08 14:38:05 +01:00
Adriane Boyd
1b2d66f98e
Switch zh tokenizer default pkuseg_model to spacy_ontonotes (#12896)
So that users can use `copy_from_base_model` for other segmenters
without having to override an irrelevant `pkuseg_model` setting, switch
the default `pkuseg_model` to `spacy_ontonotes`.
2023-08-09 10:55:52 +02:00
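For context, a short sketch of how the new default is picked up when initializing a Chinese pipeline with the pkuseg segmenter (the `spacy_ontonotes` value is the default described above; the snippet assumes the standard Chinese tokenizer config):

```python
from spacy.lang.zh import Chinese

# configure the tokenizer to use the pkuseg segmenter
nlp = Chinese.from_config({"nlp": {"tokenizer": {"segmenter": "pkuseg"}}})
# with this change, leaving pkuseg_model out is equivalent to the line below
nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes")
```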
Paul O'Leary McCann
b4e457d9fe
Accept multiple code files in all CLI commands (#12101)
* Add support for multiple code files to all relevant commands

Prior to this, only the package command supported multiple code files.

* Update docs

* Add debug data test, plus generic fixtures

One tricky thing here: it's tempting to create the config by creating a
pipeline in code, but that requires declaring the custom components
here. However the CliRunner appears to be run in the same process or
otherwise have access to our registry, so it works even without any
code arguments. So it's necessary to avoid declaring the components in
the tests.

* Add debug config test and restructure

The code argument imports the provided file. If it adds items to the
registry, that affects global state, which CliRunner doesn't isolate.
Since there's no standard way to remove things from the registry, this
instead uses subprocess.run to run commands.

* Use a more generic, parametrized test

* Add output arg for assemble and pretrain

Assemble and pretrain require an output argument. This commit adds
assemble testing, but not pretrain, as that requires an actual trainable
component, which is not currently in the test config.

* Add evaluate test and some cleanup

* Mark tests as slow

* Revert argument name change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Format API CLI docs

* isort

* Fix imports in tests

* isort

* Undo changes to package CLI help

* Fix python executable and lang code in test

* Fix executable in another test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
2023-08-01 15:24:02 +02:00
Adriane Boyd
2702db9fef
Recommend lookups tables from URLs or other loaders (#12283)
* Recommend lookups tables from URLs or other loaders

Shift away from the `lookups` extra (which isn't removed, just no longer
mentioned) and recommend loading data from the `spacy-lookups-data` repo
or other sources rather than the `spacy-lookups-data` package.

If the tables can't be loaded from the `lookups` registry in the
lemmatizer, show how to specify the tables in `[initialize]` rather than
recommending the `spacy-lookups-data` package.

* Add tests for some rule-based lemmatizers

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2023-07-31 15:54:35 +02:00
Sofie Van Landeghem
eaaac5a08c
Merge pull request #12842 from svlandeg/sync_v4
Sync v4 with latest from master and develop
2023-07-24 12:13:04 +02:00
Sofie Van Landeghem
f293386d3e
remove unnecessary line
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2023-07-20 14:08:29 +02:00
Adriane Boyd
4f37e4031c
Update spacy/ml/tb_framework.pyx
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
2023-07-20 09:59:19 +02:00
svlandeg
96f2e30c4b cython fixes and cleanup 2023-07-19 17:41:29 +02:00
svlandeg
846472129c merge fixes (2) 2023-07-19 16:38:37 +02:00
svlandeg
47a82c6164 merge fixes 2023-07-19 16:38:29 +02:00
svlandeg
0e3b6a87d6 Merge branch 'upstream_master' into sync_v4 2023-07-19 16:37:31 +02:00
Sofie Van Landeghem
536798f9e3
Disallow False for first/last arguments of add_pipe (#12793)
* Literal True for first/last options

* add test case

* update docs

* remove old redundant test case

* black formatting

* use Optional typing in docstrings

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
2023-07-06 15:20:13 +02:00
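A small sketch of what the stricter `add_pipe` arguments look like in practice (hypothetical pipeline; only the `first=False`/`last=False` case is newly disallowed):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.add_pipe("entity_ruler", first=True)   # explicit positions still work
nlp.add_pipe("tagger")                     # defaults to appending at the end
# nlp.add_pipe("ner", first=False)         # now raises an error instead of being silently ignored
```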
Sofie Van Landeghem
b615964be7
Merge pull request #12752 from danieldk/maintenance/sync-v4-master-20230626
Sync `master` into `v4`
2023-06-28 08:56:54 +01:00
Daniël de Kok
8b2732e276 Fix training.callbacks <-> language import cycle 2023-06-26 12:43:45 +02:00
Daniël de Kok
122f3b32ad Fix span <-> underscore import cycle 2023-06-26 12:43:21 +02:00
Daniël de Kok
bf92ca4f10 Merge remote-tracking branch 'upstream/master' into v4-isort 2023-06-26 12:43:00 +02:00
Daniël de Kok
2468742cb8 isort all the things 2023-06-26 11:41:03 +02:00
Daniël de Kok
68089f65cd Configure isort to use the Black profile, recursively isort the spacy module 2023-06-26 11:40:32 +02:00
Daniël de Kok
17c4a3d646
Set version to v4.0.0.dev1 (#12748) 2023-06-23 09:43:41 +02:00
Sofie Van Landeghem
95619b6736
Merge pull request #12717 from danieldk/sync-v4-master-20230612
Merge master into v4
2023-06-22 17:44:57 +01:00
Daniël de Kok
096794dd74 Account for differences between Span.sents in spaCy 3/4 2023-06-22 15:38:22 +02:00
Daniël de Kok
4990cfefb4 spancat type fixes 2023-06-12 16:43:11 +02:00
Daniël de Kok
d82e167aea Remove Python 3.7 builds 2023-06-12 16:16:03 +02:00
Daniël de Kok
50c5e9a2dd Merge remote-tracking branch 'upstream/master' into sync-v4-master-20230612 2023-06-12 15:57:10 +02:00
Daniël de Kok
8a5814bf2c
Add distillation loop (#12542)
* Add distillation initialization and loop

* Fix up configuration keys

* Add docstring

* Type annotations

* init_nlp_distill -> init_nlp_student

* Do not resolve dot name distill corpus in initialization

(Since we don't use it.)

* student: do not request use of optimizer in student pipe

We finish up the updates once in the training loop instead.

Also add the necessary logic to `Language.distill` to mirror
`Language.update`.

* Correctly determine sort key in subdivide_batch

* Fix _distill_loop docstring wrt. stopping condition

* _distill_loop: fix distill_data docstring

Make similar changes in train_while_improving, since it also had
incorrect types and missing type annotations.

* Move `set_{gpu_allocator,seed}_from_config` to spacy.util

* Update Language.update docs for the sgd argument

* Type annotation

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
2023-04-21 13:49:40 +02:00
Adriane Boyd
5d0f48fe69
Enforce that Span.start/end(_char) remain valid and in sync (#12268)
* Enforce that Span.start/end(_char) remain valid and in sync

Allowing span attributes to be writable starting in v3 has made it
possible for the internal `Span.start/end/start_char/end_char` to get
out-of-sync or have invalid values.

This checks that the values are valid and syncs the token and char
offsets if any attributes are modified directly. It does not yet handle
the case where the underlying doc is modified.

* Format
2023-04-06 16:01:59 +02:00
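A small sketch of the behaviour described above, assuming a plain blank English pipeline; modifying a token offset directly keeps the character offsets consistent:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is big")
span = doc[0:2]            # "New York"
span.end = 3               # extend the span to include "is"
# the character offsets are kept in sync with the token offsets
assert span.end_char == doc[2].idx + len(doc[2])
```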
Daniël de Kok
b734e5314d
Avoid TrainablePipe.finish_update getting called twice during training (#12450)
* Avoid `TrainablePipe.finish_update` getting called twice during training

PR #12136 fixed an issue where the tok2vec pipe was updated before
gradient were accumulated. However, it introduced a new bug that cause
`finish_update` to be called twice when using the training loop. This
causes a fairly large slowdown.

The `Language.update` method accepts the `sgd` argument for passing an
optimizer. This argument has three possible values:

- `Optimizer`: use the given optimizer to finish pipe updates.
- `None`: use a default optimizer to finish pipe updates.
- `False`: do not finish pipe updates.

However, the latter option was not documented and not valid with the
existing type of `sgd`. I assumed that this was a remnant of earlier
spaCy versions and removed handling of `False`.

However, with that change, we are passing `None` to `Language.update`.
As a result, we were calling `finish_update` in both `Language.update`
and in the training loop after all subbatches are processed.

This change restores proper handling/use of `False`. Moreover, the role
of `False` is now documented and added to the type to avoid future
accidents.

* Fix typo

* Document defaults for `Language.update`
2023-03-30 09:30:42 +02:00
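A minimal sketch of the three `sgd` modes described above (assumes `nlp` and a batch of `Example` objects called `examples` already exist):

```python
from thinc.api import Adam

optimizer = Adam(0.001)
nlp.update(examples, sgd=optimizer)  # finish pipe updates with this optimizer
nlp.update(examples, sgd=None)       # finish pipe updates with a default optimizer
nlp.update(examples, sgd=False)      # only accumulate gradients; the caller finishes updates later
```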
Edward
a653dec654
Add info that Vocab and StringStore are not static in docs (#12427)
* Add size increase info about vocab and stringstore

* Update website/docs/api/stringstore.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Update website/docs/api/vocab.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Change wording

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
2023-03-27 09:18:23 +02:00
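A quick illustration of the point being documented: processing text can grow the vocab and string store.

```python
import spacy

nlp = spacy.blank("en")
before = len(nlp.vocab.strings)
nlp("Zygomorphic quokkas vocalize melodiously")
after = len(nlp.vocab.strings)
assert after >= before  # the stores are not static; unseen strings get added
```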
Raphael Mitsch
3102e2e27a
Entity linking: use SpanGroup instead of Iterable[Span] for mentions (#12344)
* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change the type of the mentions to look up entity candidates for from Iterable[Span] to SpanGroup.

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refactor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Reverse erroneous changes during merge.

* Update return type in EL tests.

* Re-add Candidate to setup.py.

* Format updated docs.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2023-03-20 12:25:18 +01:00
Raphael Mitsch
9340eb8ad2
Introduce hierarchy for EL Candidate objects (#12341)
* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refactor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Updated error code.

* Simplify interface for int/str representations.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename 'alias' to 'mention'.

* Port Candidate and InMemoryCandidate to Cython.

* Remove redundant entry in setup.py.

* Add abstract class check.

* Drop storing mention.

* Update spacy/kb/candidate.pxd

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix entity_id refactoring problems in docstrings.

* Drop unused InMemoryCandidate._entity_hash.

* Update docstrings.

* Move attributes out of Candidate.

* Partially fix alias/mention terminology usage. Convert Candidate to interface.

* Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs().

* Update docstrings related to prior_prob.

* Update alias/mention usage in doc(strings).

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs.

* Update docstrings.

* Fix InMemoryCandidate attribute names.

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update W401 test.

* Update spacy/errors.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use Candidate output type for toy generators in the test suite to mimic best practices

* fix docs

* fix import

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2023-03-20 00:34:35 +01:00
Adriane Boyd
6ae7618418
Clean up Vocab constructor (#12290)
* Clean up Vocab constructor

* Change effective type of `strings` from `Iterable[str]` to `Optional[StringStore]`
  * Don't automatically add strings to vocab
* Change default values to `None`
* Remove `**deprecated_kwargs`

* Format
2023-03-19 23:41:20 +01:00
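A sketch of the cleaned-up constructor, assuming the `strings` argument now takes an optional `StringStore` rather than an iterable of raw strings:

```python
from spacy.strings import StringStore
from spacy.vocab import Vocab

strings = StringStore(["apple", "orange"])
vocab = Vocab(strings=strings)   # strings is Optional[StringStore]; None is also fine
```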
Madeesh Kannan
520279ff7c
Tok2Vec: Add distill method (#12108)
* `Tok2Vec`: Add `distill` method

* `Tok2Vec`: Refactor `update`

* Add `Tok2Vec.distill` test

* Update `distill` signature to accept `Example`s instead of separate teacher and student docs

* Add docs

* Remove docstring

* Update test

* Remove `update` calls from test

* Update `Tok2Vec.distill` docstring
2023-03-09 09:37:19 +01:00
Raphael Mitsch
41b3a0d932
Drop support for EntityLinker_v1. (#12377) 2023-03-07 13:10:45 +01:00
Adriane Boyd
8ca71f9591
Merge pull request #12371 from rmitsch/sync/master-into-v4
Sync `v4` with latest from `master`
2023-03-06 17:10:19 +01:00
Raphael Mitsch
749e446ee3 Merge branch 'master' into sync/master-into-v4
# Conflicts:
#	.github/azure-steps.yml
2023-03-06 16:27:56 +01:00
Sofie Van Landeghem
04f41854c1
Merge pull request #12356 from rmitsch/sync/master-into-v4
Sync `v4` with latest from `master`
2023-03-03 09:31:45 +01:00
Raphael Mitsch
1ea31552be Merge branch 'master' into sync/master-into-v4
# Conflicts:
#	requirements.txt
#	spacy/pipeline/entity_linker.py
#	spacy/util.py
#	website/docs/api/entitylinker.mdx
2023-03-02 16:24:15 +01:00
Adriane Boyd
da75896ef5
Return Tuple[Span] for all Doc/Span attrs that provide spans (#12288)
* Return Tuple[Span] for all Doc/Span attrs that provide spans

* Update Span types
2023-03-01 16:00:02 +01:00
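In practice this means span-providing attributes come back as plain tuples, e.g.:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Alice met Bob in Berlin")
assert isinstance(doc.ents, tuple)  # all span-providing attributes now return Tuple[Span]
```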
Adriane Boyd
df4c069a13
Remove backoff from .vector to .tensor (#12292) 2023-02-23 11:36:50 +01:00
Daniël de Kok
e27c60a702
Reimplement distillation with oracle cut size (#12214)
* Improve the correctness of _parse_patch

* If there are no more actions, do not attempt to make further
  transitions, even if not all states are final.
* Assert that the number of actions for a step is the same as
  the number of states.

* Reimplement distillation with oracle cut size

The code for distillation with an oracle cut size was not reimplemented
after the parser refactor. We did not notice, because we did not have
tests for this functionality. This change brings back the functionality
and adds this to the parser tests.

* Rename states2actions to _states_to_actions for consistency

* Test distillation max cuts in NER

* Mark parser/NER tests as slow

* Typo

* Fix invariant in _states_diff_to_actions

* Rename _init_batch -> _init_batch_from_teacher

* Ninja edit the ninja edit

* Check that we raise an exception when we pass the incorrect number or actions

* Remove unnecessary get

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Write out condition more explicitly

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
2023-02-21 15:47:18 +01:00
Paul O'Leary McCann
dd3f138830
Use tempfile.TemporaryDirectory (#12285) 2023-02-16 11:08:55 +01:00
Adriane Boyd
b95123060a
Make Span.char_span optional args keyword-only (#12257)
* Make Span.char_span optional args keyword-only

* Make kb_id and following kw-only

* Format
2023-02-15 12:34:33 +01:00
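A short sketch of the keyword-only arguments on `Span.char_span`: the character offsets stay positional, everything from `label` on has to be named.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Apple Inc is based in Cupertino")
span = doc[0:6]
org = span.char_span(0, 9, label="ORG")   # ok: label passed as a keyword
# span.char_span(0, 9, "ORG")             # no longer allowed: label is keyword-only
```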
Adriane Boyd
cbc2ae933e
Remove unused Span.char_span(id=) (#12250) 2023-02-08 14:46:07 +01:00
Adriane Boyd
cf85b81f34
Remove names for vectors (#12243)
* Remove names for vectors

Named vectors are basically a carry-over from v2 and aren't used for
anything.

* Format
2023-02-08 14:37:42 +01:00
Adriane Boyd
5089efa2d0
Use the same tuple in Span cmp and hash (#12251) 2023-02-08 14:28:34 +01:00
Daniël de Kok
eec5ccd72f
Language.update: ensure that tok2vec gets updated (#12136)
* `Language.update`: ensure that tok2vec gets updated

The components in a pipeline can be updated independently. However,
tok2vec implementations are an exception to this, since they depend on
listeners for their gradients. The update method of a tok2vec
implementation computes the tok2vec forward and passes this along with a
backprop function to the listeners. This backprop function accumulates
gradients for all the listeners. There are two ways in which the
accumulated gradients can be used to update the tok2vec weights:

1. Call the `finish_update` method of tok2vec *after* the `update`
   method is called on all of the pipes that use a tok2vec listener.
2. Pass an optimizer to the `update` method of tok2vec. In this
   case, tok2vec will give the last listener a special backprop
   function that calls `finish_update` on the tok2vec.

Unfortunately, `Language.update` did neither of these. Instead, it
immediately called `finish_update` on every pipe after `update`. As a
result, the tok2vec weights are updated when no gradients have been
accumulated from listeners yet. And the gradients of the listeners are
only used in the next call to `Language.update` (when `finish_update` is
called on tok2vec again).

This change fixes this issue by passing the optimizer to the `update`
method of trainable pipes, leading to use of the second strategy
outlined above.

The main updating loop in `Language.update` is also simplified by using
the `TrainableComponent` protocol consistently.

* Train loop: `sgd` is `Optional[Optimizer]`, do not pass false

* Language.update: call pipe finish_update after all pipe updates

This does correct and fast updates if multiple components update the
same parameters.

* Add comment why we moved `finish_update` to a separate loop
2023-02-03 15:22:25 +01:00
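A rough sketch of the first strategy, as adopted in the follow-up change above: update every trainable pipe first, then apply the accumulated listener gradients (assumes `nlp`, `examples`, and `optimizer` already exist):

```python
losses = {}
# 1) let every trainable pipe compute its update (listener gradients accumulate on tok2vec)
for name, pipe in nlp.pipeline:
    if hasattr(pipe, "update"):
        pipe.update(examples, losses=losses)
# 2) only now finish the updates, so tok2vec sees all accumulated gradients
for name, pipe in nlp.pipeline:
    if hasattr(pipe, "finish_update"):
        pipe.finish_update(optimizer)
```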
Sofie Van Landeghem
c47ec5b5c6
Merge pull request #12218 from adrianeboyd/chore/update-v4-from-master-7
Update v4 from master
2023-02-03 12:04:20 +01:00
Paul O'Leary McCann
89f974d4f5
Cleanup/remove backwards compat overwrite settings (#11888)
* Remove backwards-compatible overwrite from Entity Linker

This also adds a docstring about overwrite, since it wasn't present.

* Fix docstring

* Remove backward compat settings in Morphologizer

This also needed a docstring added.

For this component it's less clear what the right overwrite settings
are.

* Remove backward compat from sentencizer

This was simple

* Remove backward compat from senter

Another simple one

* Remove backward compat setting from tagger

* Add docstrings

* Update spacy/pipeline/morphologizer.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update docs

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2023-02-02 14:13:38 +01:00
Adriane Boyd
cd95b29053 Merge remote-tracking branch 'upstream/master' into chore/update-v4-from-master-7 2023-02-02 13:06:15 +01:00
Paul O'Leary McCann
6920fb7baf
Move Entity Linker v1 to spacy-legacy (#12006)
* Move Entity Linker v1 component to spacy-legacy

This is a follow up to #11889 that moves the component instead of
removing it.

In general, we never import from spacy-legacy in spaCy proper. However,
to use this component, that kind of import will be necessary. I was able
to test this without issues, but is this current import strategy
acceptable? Or should we put the component in a registry?

* Use spacy-legacy pr for CI

This will need to be reverted before merging.

* Add temporary step to log installed spacy-legacy version

* Modify requirements.txt to trigger tests

* Add comment to Python to trigger tests

* TODO REVERT This is a commit with logic changes to trigger tests

* Remove pipe from YAML

Works locally, but possibly this is causing a quoting error or
something.

* Revert "TODO REVERT This is a commit with logic changes to trigger tests"

This reverts commit 689fae71f3.

* Revert "Add comment to Python to trigger tests"

This reverts commit 11840fc598.

* Add more logging

* Try installing directly in workflow

* Try explicitly uninstalling spacy-legacy first

* Cat requirements.txt to confirm contents

In the branch, the thinc version spec is `thinc>=8.1.0,<8.2.0`. But in
the logs, it's clear that a development release of 9.0 is being
installed. It's not clear why that would happen.

* Log requirements at start of build

* TODO REVERT Change thinc spec

Want to see what happens to the installed thinc spec with this change.

* Update thinc requirements

This makes it the same as it was before the merge, >=8.1.0,<8.2.0.

* Use same thinc version as v4 branch

* TODO REVERT Mark dependency check as xfail

spacy-legacy is specified as a git checkout in requirements.txt while
this PR is in progress, which makes the consistency check here fail.

* Remove debugging output / install step

* Revert "Remove debugging output / install step"

This reverts commit 923ea7448b.

* Clean up debugging output

The manual install step with the URL fragment seems to have caused
issues on Windows due to the = in the URL being misinterpreted. On the
other hand, removing it seems to mean the git version of spacy-legacy
isn't actually installed.

This PR removes the URL fragment but keeps the direct command-line
install. Additionally, since it looks like this job is configured to use
the default shell (and not bash), it removes a comment that upsets the
Windows cmd shell.

* Revert "TODO REVERT Mark dependency check as xfail"

This reverts commit d4863ec156.

* Fix requirements.txt, increasing spacy-legacy version

* Raise spacy legacy version in setup.cfg

* Remove azure build workarounds

* make spacy-legacy version explicit in error message

* Remove debugging line

* Suggestions from code review
2023-02-01 09:47:56 +01:00
Edward
360ccf628a
Rename language codes (Icelandic, multi-language) (#12149)
* Init

* fix tests

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix test_blank_languages

* Rename xx to mul in docs

* Format _util with black

* prettier formatting

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2023-01-31 17:30:43 +01:00
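After the rename, the multi-language and Icelandic codes look like this (a small sketch; the old codes are what v3 used):

```python
import spacy

nlp_multi = spacy.blank("mul")   # was spacy.blank("xx") in v3
nlp_is = spacy.blank("isl")      # Icelandic; was spacy.blank("is") in v3
```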
Daniël de Kok
c6cca4c00a
Language.distill: copy both reference and predicted (#12209)
* Language.distill: copy both reference and predicted

In distillation we also modify the teacher docs (e.g. in tok2vec
components), so we need to copy both the reference and predicted doc.

Problem caught by @shadeMe

* Make new `_copy_examples` args kwonly
2023-01-31 13:19:42 +01:00
Daniël de Kok
fb7f018ded
Add the configuration schema for distillation (#12201)
* Add the configuration schema for distillation

This also adds the default configuration and some tests. The schema will
be used by the training loop and `distill` subcommand.

* Format

* Change distillation shortopt to -d

* Fix descripion of max_epochs

* Rename distillation flag to -dt

* Rename `pipe_map` to `student_to_teacher`
2023-01-31 13:06:02 +01:00
Paul O'Leary McCann
1b5aba9e22
Don't re-download installed models (#12188)
* Don't re-download installed models

When downloading a model, this checks if the same version of the same
model is already installed. If it is then the download is skipped.

This is necessary because pip uses the final download URL for its
caching feature, but because of the way models are hosted on Github,
their URLs change every few minutes.

* Use importlib instead of meta.json

* Use get_package_version

* Add untested, disabled test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2023-01-31 11:31:17 +01:00
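The same check applies whether the download is triggered from the CLI or from Python; a brief sketch:

```python
from spacy.cli import download

# if this exact version of the package is already installed, the download is now skipped
download("en_core_web_sm")
```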
Daniël de Kok
6b07be2110
Add Language.distill (#12116)
* Add `Language.distill`

This method is the distillation counterpart of `Language.update`.  It
takes a teacher `Language` instance and distills the student pipes on
the teacher pipes.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Clarify how Example is used in distillation

* Update transition parser distill docstring for examples argument

* Pass optimizer to `TrainablePipe.distill`

* Annotate pipe before update

As discussed internally, we want to let a pipe annotate before doing an
update with gold/silver data. Otherwise, the output may be (too)
informed by the gold/silver data.

* Rename `component_map` to `student_to_teacher`

* Better synopsis in `Language.distill` docstring

* `name` -> `student_name`

* Fix labels type in docstring

* Mark distill test as slow

* Fix `student_to_teacher` type in docs

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
2023-01-30 12:44:11 +01:00
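A minimal sketch of the distillation counterpart of `Language.update` described above, assuming `teacher` and `student` pipelines plus a batch of `Example` objects are already prepared (the keyword names follow the commit description; treat the exact signature as indicative):

```python
from thinc.api import Adam

optimizer = Adam(0.001)
losses = {}
student.distill(
    teacher,
    examples,
    sgd=optimizer,
    losses=losses,
    student_to_teacher={"tagger": "tagger"},  # map student pipe names to teacher pipe names
)
```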
Adriane Boyd
ec45f704b1
Drop python 3.6/3.7, remove unneeded compat (#12187)
* Drop python 3.6/3.7, remove unneeded compat

* Remove unused import

* Minimal python 3.8+ docs updates
2023-01-27 15:48:20 +01:00
Sofie Van Landeghem
1678a98449
Merge pull request #12192 from adrianeboyd/chore/update-v4-from-master-5
Update v4 from master, format, update CI
2023-01-27 14:59:26 +01:00
Adriane Boyd
16609517f1 CI: Skip tests that require published pipelines 2023-01-27 08:37:02 +01:00
Adriane Boyd
fd911fe2af Format 2023-01-27 08:29:46 +01:00
Adriane Boyd
8548d4d16e Merge remote-tracking branch 'upstream/master' into update-v4-from-master-1 2023-01-27 08:29:09 +01:00
Paul O'Leary McCann
de360bc981
Refactor lexeme mem passing (#12125)
* Don't pass mem pool to new lexeme function

* Remove unused mem from function args

Two methods calling _new_lexeme, get and get_by_orth, took mem arguments
just to call the internal method. That's no longer necessary, so this
cleans it up.

* prettier formatting

* Remove more unused mem args
2023-01-25 12:50:21 +09:00
Daniël de Kok
6348a7a4b4
Set version to v4.0.0.dev0 (#12126) 2023-01-19 09:25:34 +01:00
Daniël de Kok
b052b1b47f
Fix batching regression (#12094)
* Fix batching regression

Some time ago, the spaCy v4 branch switched to the new Thinc v9
schedule. However, this introduced an error in how batching is handled.

In the PR, the batchers were changed to keep track of their step,
so that the step can be passed to the schedule. However, the issue
is that the training loop repeatedly calls the batching functions
(rather than using an infinite generator/iterator). So, the step and
therefore the schedule would be reset each epoch. Before the schedule
switch we didn't have this issue, because the old schedules were
stateful.

This PR fixes this issue by reverting the batching functions to use
a (stateful) generator. Their registry functions do accept a `Schedule`
and we convert `Schedule`s to generators.

* Update batcher docs

* Docstring fixes

* Make minibatch take iterables again as well

* Bump thinc requirement to 9.0.0.dev2

* Use type declaration

* Convert another comment into a proper type declaration
2023-01-18 18:28:30 +01:00
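Relatedly, `minibatch` keeps accepting plain iterables, so existing training loops keep working; a small sketch (assumes `train_examples` is a list of `Example` objects):

```python
from spacy.util import minibatch

# size can be a fixed int or an iterable/generator of sizes, as before the Schedule switch
for batch in minibatch(train_examples, size=8):
    ...
```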
Daniël de Kok
a183db3cef
Merge the parser refactor into v4 (#10940)
* Try to fix doc.copy

* Set dev version

* Make vocab always own lexemes

* Change version

* Add SpanGroups.copy method

* Fix set_annotations during Parser.update

* Fix dict proxy copy

* Upd version

* Fix copying SpanGroups

* Fix set_annotations in parser.update

* Fix parser set_annotations during update

* Revert "Fix parser set_annotations during update"

This reverts commit eb138c89ed.

* Revert "Fix set_annotations in parser.update"

This reverts commit c6df0eafd0.

* Fix set_annotations during parser update

* Inc version

* Handle final states in get_oracle_sequence

* Inc version

* Try to fix parser training

* Inc version

* Fix

* Inc version

* Fix parser oracle

* Inc version

* Inc version

* Fix transition has_gold

* Inc version

* Try to use real histories, not oracle

* Inc version

* Upd parser

* Inc version

* WIP on rewrite parser

* WIP refactor parser

* New progress on parser model refactor

* Prepare to remove parser_model.pyx

* Convert parser from cdef class

* Delete spacy.ml.parser_model

* Delete _precomputable_affine module

* Wire up tb_framework to new parser model

* Wire up parser model

* Uncython ner.pyx and dep_parser.pyx

* Uncython

* Work on parser model

* Support unseen_classes in parser model

* Support unseen classes in parser

* Cleaner handling of unseen classes

* Work through tests

* Keep working through errors

* Keep working through errors

* Work on parser. 15 tests failing

* Xfail beam stuff. 9 failures

* More xfail. 7 failures

* Xfail. 6 failures

* cleanup

* formatting

* fixes

* pass nO through

* Fix empty doc in update

* Hackishly fix resizing. 3 failures

* Fix redundant test. 2 failures

* Add reference version

* black formatting

* Get tests passing with reference implementation

* Fix missing prints

* Add missing file

* Improve indexing on reference implementation

* Get non-reference forward func working

* Start rigging beam back up

* removing redundant tests, cf #8106

* black formatting

* temporarily xfailing issue 4314

* make flake8 happy again

* mypy fixes

* ensure labels are added upon predict

* cleanup remnants from merge conflicts

* Improve unseen label masking

Two changes to speed up masking by ~10%:

- Use a bool array rather than an array of float32.

- Let the mask indicate whether a label was seen, rather than
  unseen. The mask is most frequently used to index scores for
  seen labels. However, since the mask marked unseen labels,
this required computing an intermediate flipped mask.

* Write moves costs directly into numpy array (#10163)

This avoids elementwise indexing and the allocation of an additional
array.

Gives a ~15% speed improvement when using batch_by_sequence with size
32.

* Temporarily disable ner and rehearse tests

Until rehearse is implemented again in the refactored parser.

* Fix loss serialization issue (#10600)

* Fix loss serialization issue

Serialization of a model fails with:

TypeError: array(738.3855, dtype=float32) is not JSON serializable

Fix this using float conversion.

* Disable CI steps that require spacy.TransitionBasedParser.v2

After finishing the refactor, TransitionBasedParser.v2 should be
provided for backwards compat.

* Add back support for beam parsing to the refactored parser (#10633)

* Add back support for beam parsing

Beam parsing was already implemented as part of the `BeamBatch` class.
This change makes its counterpart `GreedyBatch`. Both classes are hooked
up in `TransitionModel`, selecting `GreedyBatch` when the beam size is
one, or `BeamBatch` otherwise.

* Use kwarg for beam width

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Avoid implicit default for beam_width and beam_density

* Parser.{beam,greedy}_parse: ensure labels are added

* Remove 'deprecated' comments

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Parser `StateC` optimizations (#10746)

* `StateC`: Optimizations

Avoid GIL acquisition in `__init__`
Increase default buffer capacities on init
Reduce C++ exception overhead

* Fix typo

* Replace `set::count` with `set::find`

* Add exception attribute to c'tor

* Remove unused import

* Use a power-of-two value for initial capacity
Use default-insert to init `_heads` and `_unshiftable`

* Merge `cdef` variable declarations and assignments

* Vectorize `example.get_aligned_parses` (#10789)

* `example`: Vectorize `get_aligned_parse`
Rename `numpy` import

* Convert aligned array to lists before returning

* Revert import renaming

* Elide slice arguments when selecting the entire range

* Tagger/morphologizer alignment performance optimizations (#10798)

* `example`: Unwrap `numpy` scalar arrays before passing them to `StringStore.__getitem__`

* `AlignmentArray`: Use native list as staging buffer for offset calculation

* `example`: Vectorize `get_aligned`

* Hoist inner functions out of `get_aligned`

* Replace inline `if..else` clause in assignment statement

* `AlignmentArray`: Use raw indexing into offset and data `numpy` arrays

* `example`: Replace array unique value check with `groupby`

* `example`: Correctly exclude tokens with no alignment in `_get_aligned_vectorized`
Simplify `_get_aligned_non_vectorized`

* `util`: Update `all_equal` docstring

* Explicitly use `int32_t*`

* Restore C CPU inference in the refactored parser (#10747)

* Bring back the C parsing model

The C parsing model is used for CPU inference and is still faster for
CPU inference than the forward pass of the Thinc model.

* Use C sgemm provided by the Ops implementation

* Make tb_framework module Cython, merge in C forward implementation

* TransitionModel: raise in backprop returned from forward_cpu

* Re-enable greedy parse test

* Return transition scores when forward_cpu is used

* Apply suggestions from code review

Import `Model` from `thinc.api`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use relative imports in tb_framework

* Don't assume a default for beam_width

* We don't have a direct dependency on BLIS anymore

* Rename forwards to _forward_{fallback,greedy_cpu}

* Require thinc >=8.1.0,<8.2.0

* tb_framework: clean up imports

* Fix return type of _get_seen_mask

* Move up _forward_greedy_cpu

* Style fixes.

* Lower thinc lowerbound to 8.1.0.dev0

* Formatting fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Reimplement parser rehearsal function (#10878)

* Reimplement parser rehearsal function

Before the parser refactor, rehearsal was driven by a loop in the
`rehearse` method itself. For each parsing step, the loops would:

1. Get the predictions of the teacher.
2. Get the predictions and backprop function of the student.
3. Compute the loss and backprop into the student.
4. Move the teacher and student forward with the predictions of
   the student.

In the refactored parser, we cannot perform search stepwise rehearsal
anymore, since the model now predicts all parsing steps at once.
Therefore, rehearsal is performed in the following steps:

1. Get the predictions of all parsing steps from the student, along
   with its backprop function.
2. Get the predictions from the teacher, but use the predictions of
   the student to advance the parser while doing so.
3. Compute the loss and backprop into the student.

To support the second step a new method, `advance_with_actions` is
added to `GreedyBatch`, which performs the provided parsing steps.

* tb_framework: wrap upper_W and upper_b in Linear

Thinc's Optimizer cannot handle resizing of existing parameters. Until
it does, we work around this by wrapping the weights/biases of the upper
layer of the parser model in Linear. When the upper layer is resized, we
copy over the existing parameters into a new Linear instance. This does
not trigger an error in Optimizer, because it sees the resized layer as
a new set of parameters.

* Add test for TransitionSystem.apply_actions

* Better FIXME marker

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Fixes from Madeesh

* Apply suggestions from Sofie

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove useless assignment

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename some identifiers in the parser refactor (#10935)

* Rename _parseC to _parse_batch

* tb_framework: prefix many auxiliary functions with underscore

To clearly state the intent that they are private.

* Rename `lower` to `hidden`, `upper` to `output`

* Parser slow test fixup

We don't have TransitionBasedParser.{v1,v2} until we bring it back as a
legacy option.

* Remove last vestiges of PrecomputableAffine

This does not exist anymore as a separate layer.

* ner: re-enable sentence boundary checks

* Re-enable test that works now.

* test_ner: make loss test more strict again

* Remove commented line

* Re-enable some more beam parser tests

* Remove unused _forward_reference function

* Update for CBlas changes in Thinc 8.1.0.dev2

Bump thinc dependency to 8.1.0.dev3.

* Remove references to spacy.TransitionBasedParser.{v1,v2}

Since they will not be offered starting with spaCy v4.

* `tb_framework`: Replace references to `thinc.backends.linalg` with `CBlas`

* dont use get_array_module (#11056) (#11293)

Co-authored-by: kadarakos <kadar.akos@gmail.com>

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals` (#11317)

* `search`: Move from `thinc.extra.search`
Fix NPE in `Beam.__dealloc__`

* `pytest`: Add support for executing Cython tests
Move `search` tests from thinc and patch them to run with `pytest`

* `mypy` fix

* Update comment

* `conftest`: Expose `register_cython_tests`

* Remove unused import

* Move `argmax` impls to new `_parser_utils` Cython module (#11410)

* Parser does not have to be a cdef class anymore

This also fixes validation of the initialization schema.

* Add back spacy.TransitionBasedParser.v2

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Remove module from setup.py that got added during the merge

* Bring back support for `update_with_oracle_cut_size` (#12086)

* Bring back support for `update_with_oracle_cut_size`

This option was available in the pre-refactor parser, but was never
implemented in the refactored parser. This option cuts transition
sequences that are longer than `update_with_oracle_cut_size` into
separate sequences that have at most `update_with_oracle_cut_size`
transitions. The oracle (gold standard) transition sequence is used to
determine the cuts and the initial states for the additional sequences.

Applying this cut makes the batches more homogeneous in the transition
sequence lengths, making forward passes (and as a consequence training)
much faster.

Training time 1000 steps on de_core_news_lg:

- Before this change: 149s
- After this change: 68s
- Pre-refactor parser: 81s

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Apply suggestions from @shadeMe

* Use chained conditional

* Test with update_with_oracle_cut_size={0, 1, 5, 100}

And fix a bug that occurs with a cut size of 1.

* Fix up some merge fall out

* Update parser distillation for the refactor

In the old parser, we'd iterate over the transitions in the distill
function and compute the loss/gradients on the go. In the refactored
parser, we first let the student model parse the inputs. Then we'll let
the teacher compute the transition probabilities of the states in the
student's transition sequence. We can then compute the gradients of the
student given the teacher.

* Add back spacy.TransitionBasedParser.v1 references

- Accordion in the architecture docs.
- Test in test_parse, but disabled until we have a spacy-legacy release.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: kadarakos <kadar.akos@gmail.com>
2023-01-18 11:27:45 +01:00
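The `update_with_oracle_cut_size` option that the refactor brings back is a factory setting on the transition-based components; a brief sketch of where it lives:

```python
import spacy

nlp = spacy.blank("en")
# cut overly long gold transition sequences into chunks of at most 100 transitions
parser = nlp.add_pipe("parser", config={"update_with_oracle_cut_size": 100})
```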
Daniël de Kok
5e297aa20e
Add TrainablePipe.{distill,get_teacher_student_loss} (#12016)
* Add `TrainablePipe.{distill,get_teacher_student_loss}`

This change adds two methods:

- `TrainablePipe::distill` which performs a training step of a
   student pipe on a teacher pipe, giving a batch of `Doc`s.
- `TrainablePipe::get_teacher_student_loss` computes the loss
  of a student relative to the teacher.

The `distill` or `get_teacher_student_loss` methods are also implemented
in the tagger, edit tree lemmatizer, and parser pipes, to enable
distillation in those pipes and as an example for other pipes.

* Fix stray `Beam` import

* Fix incorrect import

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TrainablePipe.distill: use `Iterable[Example]`

* Add Pipe.is_distillable method

* Add `validate_distillation_examples`

This first calls `validate_examples` and then checks that the
student/teacher tokens are the same.

* Update distill documentation

* Add distill documentation for all pipes that support distillation

* Fix incorrect identifier

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add comment to explain `is_distillable`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2023-01-16 10:25:53 +01:00
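A small sketch of distilling a single pipe with the new API, assuming `teacher_nlp` and `student_nlp` share a tagger and `examples` holds `Example` objects built on identical tokenization:

```python
from thinc.api import Adam

teacher_tagger = teacher_nlp.get_pipe("tagger")
student_tagger = student_nlp.get_pipe("tagger")
optimizer = Adam(0.001)
losses = {}
student_tagger.distill(teacher_tagger, examples, sgd=optimizer, losses=losses)
```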
Sofie Van Landeghem
c2f3e699ca
fix anchors (#12095) 2023-01-13 11:14:58 +01:00
Sofie Van Landeghem
2c2e66e145
Merge pull request #12096 from svlandeg/copy_v4
Sync with latest from master
2023-01-11 20:46:33 +01:00
svlandeg
fc2723925b update tests from master to follow v4 principles (2) 2023-01-11 19:04:06 +01:00
svlandeg
6ff5eb256c update tests from master to follow v4 principles 2023-01-11 18:57:50 +01:00
svlandeg
b2fd9490e3 Merge branch 'copy_master' into copy_v4 2023-01-11 18:40:55 +01:00
Madeesh Kannan
a231bf65af
Pass step=0 to Schedule class to yield initial learning rate (#12078) 2023-01-09 20:15:02 +01:00
Tetsuo Kiso
b510fbd0aa
Delete unused imports for StringStore (#12040) 2023-01-03 17:43:09 +01:00
Sofie Van Landeghem
326b541312
Merge pull request #12049 from svlandeg/copy_v4
Sync v4 with latest from master
2023-01-03 16:43:54 +01:00
svlandeg
6852adc8b7 Merge branch 'copy_master' into copy_v4 2023-01-03 13:34:05 +01:00
Daniël de Kok
20b63943f5
Adjust to new Schedule class and pass scores to Optimizer (#12008)
* Adjust to new `Schedule` class and pass scores to `Optimizer`

Requires https://github.com/explosion/thinc/pull/804

* Bump minimum Thinc requirement to 9.0.0.dev1
2022-12-29 08:03:24 +01:00
Daniël de Kok
d30ba9b7b8
Merge pull request #12015 from danieldk/chore/v4-merge-master-20221222
Merge master into v4
2022-12-22 11:22:33 +01:00
Daniël de Kok
2f08deea2a Fix fallout from a previous merge 2022-12-22 10:23:31 +01:00
Daniël de Kok
207565a788 Merge remote-tracking branch 'upstream/master' into chore/v4-merge-master-20221222 2022-12-22 10:08:54 +01:00
Daniël de Kok
f9308aae13
Fix v4 branch to build against Thinc v9 (#11921)
* Move `thinc.extra.search` to `spacy.pipeline._parser_internals`

Backport of:
https://github.com/explosion/spaCy/pull/11317

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Replace references to `thinc.backends.linalg` with `CBlas`

Backport of:
https://github.com/explosion/spaCy/pull/11292

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Use cross entropy from `thinc.legacy`

* Require thinc>=9.0.0.dev0,<9.1.0

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
2022-12-17 14:32:19 +01:00
Edward
ca75190a3d
Custom extensions for spans with equal boundaries (#11429)
* Init

* Fix return type for mypy

* adjust types and improve setting new attributes

* Add underscore changes to json conversion

* Add test and underscore changes to from_docs

* add underscore changes and test to span.to_doc

* update return values

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add types to function

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* adjust formatting

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* shorten return type

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* add helper function to improve readability

* Improve code and add comments

* rerun azure tests

* Fix tests for json conversion

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-12-12 08:55:53 +01:00
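A short sketch of what the change enables: two spans with the same boundaries but different labels keep separate underscore data.

```python
import spacy
from spacy.tokens import Span

Span.set_extension("note", default=None)
nlp = spacy.blank("en")
doc = nlp("Berlin is nice")
a = Span(doc, 0, 1, label="CITY")
b = Span(doc, 0, 1, label="PERSON")
a._.note = "capital"
b._.note = "surname"
assert a._.note != b._.note  # equal boundaries no longer share extension values
```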
Madeesh Kannan
f5aabaf7d6
Remove unused, experimental multi-task components (#11919)
* Remove experimental multi-task components

These are incomplete implementations and are not usable in their current state.

* Remove orphaned error message

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Revert "Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)"

This reverts commit 77c0fd7b17.

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2022-12-08 13:24:45 +01:00
Paul O'Leary McCann
d60997febb
Remove old model shortcuts (#11916)
* Remove old model shortcuts

* Remove error, docs warnings about shortcuts

* Fix import in util

Accidentally deleted the whole import and not just the old part...

* Change universe example to v3 style

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Update some model loading in Universe

* Add v2 tag to neuralcoref

* Use the spacy-version feature instead of a v2 tag

Co-authored-by: svlandeg <svlandeg@github.com>
2022-12-08 11:45:52 +01:00
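With the shortcuts gone, pipelines are always loaded under their full package names:

```python
import spacy

nlp = spacy.load("en_core_web_sm")   # shortcut names such as spacy.load("en") are no longer supported
```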
Paul O'Leary McCann
6b9af38eeb
Remove all references to "begin_training" (#11943)
When v3 was released, `begin_training` was renamed to `initialize`.
There were warnings in the code and docs about that. This PR removes
them.
2022-12-08 11:43:52 +01:00
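For reference, the replacement call that has been in place since v3 (a sketch; assumes `nlp` and a `get_examples` callable returning `Example` objects):

```python
optimizer = nlp.initialize(get_examples)   # formerly nlp.begin_training(get_examples)
```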
Sofie Van Landeghem
60379cec65
Merge pull request #11929 from svlandeg/copy_v4
sync v4 with latest master
2022-12-07 15:24:07 +01:00
Paul O'Leary McCann
8267aa1b65 Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)
* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6
2022-12-05 09:44:19 +01:00
svlandeg
799d226676 prettier formatting 2022-12-05 08:57:24 +01:00
svlandeg
04fea09ffd Merge branch 'copy_master' into copy_v4 2022-12-05 08:56:15 +01:00
Edward
e79910d57e
Remove sentiment extension (#11722)
* remove sentiment attribute

* remove sentiment from docs

* add test for backwards compatibility

* replace from_disk with from_bytes

* Fix docs and format file

* Fix formatting
2022-11-23 13:09:32 +01:00
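Projects that relied on the removed attribute can register a user extension instead; a minimal sketch, keeping the old name purely for illustration:

from spacy.tokens import Doc

# The built-in attribute is gone, but an extension with a default value can
# stand in for it in downstream code that expects a sentiment score on the Doc.
if not Doc.has_extension("sentiment"):
    Doc.set_extension("sentiment", default=0.0)
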
Adriane Boyd
d0fc871a1c
Merge pull request #11741 from adrianeboyd/chore/update-v4-from-master-4
Update v4 from master
2022-11-03 12:39:15 +01:00
Adriane Boyd
68b8fa2df2 Merge remote-tracking branch 'upstream/master' into chore/update-v4-from-master-4 2022-11-03 09:42:36 +01:00
Adriane Boyd
cae4589f5a
Replace EntityRuler with SpanRuler implementation (#11320)
* Replace EntityRuler with SpanRuler implementation

Remove `EntityRuler` and rename the `SpanRuler`-based
`future_entity_ruler` to `entity_ruler`.

Main changes:

* It is no longer possible to load patterns on init as with
`EntityRuler(patterns=)`.
* The older serialization formats (`patterns.jsonl`) are no longer
supported and the related tests are removed.
* The config settings are only stored in the config, not in the
serialized component (in particular the `phrase_matcher_attr` and
overwrite settings).

* Add migration guide to EntityRuler API docs

* docs update

* Minor edit

Co-authored-by: svlandeg <svlandeg@github.com>
2022-10-24 09:11:35 +02:00
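Since patterns can no longer be passed on init, pattern loading moves to add_patterns(); a minimal sketch, with an arbitrary example pattern:

import spacy

nlp = spacy.blank("en")
# "entity_ruler" is now backed by the SpanRuler implementation.
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Explosion"}])

doc = nlp("Explosion builds spaCy")
print([(ent.text, ent.label_) for ent in doc.ents])
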
Adriane Boyd
a4bd890f32
Merge pull request #11686 from adrianeboyd/chore/update-v4-from-master
Update v4 from master
2022-10-21 12:55:53 +02:00
Paul O'Leary McCann
0e2b7fb28b
Remove thinc util reimports (#11665)
* Remove imports marked as v2 leftovers

There are a few functions that were in `spacy.util` in v2, but were
moved to Thinc. In v3 these were imported in `spacy.util` so that code
could be used unchanged, but the comment over them indicates they should
always be imported from Thinc. This commit removes those imports.

It doesn't look like any DeprecationWarning was ever thrown for using
these, but it is probably fine to remove them anyway with a major
version. It is not clear that they were widely used.

* Import fix_random_seed correctly

This seems to be the only place in spaCy that was using the old import.
2022-10-21 11:01:18 +02:00
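A minimal sketch of the import change for one of the affected helpers:

# Previously also importable (but discouraged) as: from spacy.util import fix_random_seed
# After this cleanup, import it from Thinc directly:
from thinc.api import fix_random_seed

fix_random_seed(0)
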
Adriane Boyd
103b24fb25 Merge remote-tracking branch 'upstream/master' into chore/update-v4-from-master 2022-10-21 09:13:32 +02:00
Madeesh Kannan
446a3ecf34
StringStore refactoring (#11344)
* `strings`: Remove unused `hash32_utf8` function

* `strings`: Make `hash_utf8` and `decode_Utf8Str` private

* `strings`: Reorganize private functions

* `strings`: Raise error when non-string/-int types are passed to functions that don't accept them

* `strings`: Add `items()` method, add type hints, remove unused methods, restrict inputs to specific types, reorganize methods

* `Morphology`: Use `StringStore.items()` to enumerate features when pickling

* `test_stringstore`: Update pre-Python 3 tests

* Update `StringStore` docs

* Fix `get_string_id` imports

* Replace redundant test with tests for type checking

* Rename `_retrieve_interned_str`, remove `.get` default arg

* Add `get_string_id` to `strings.pyi`
Remove `mypy` ignore directives from imports of the above

* `strings.pyi`: Replace functions that consume `Union`-typed params with overloads

* `strings.pyi`: Revert some function signatures

* Update `SYMBOLS_BY_INT` lookups and error codes post-merge

* Revert clobbered change introduced in a previous merge

* Remove unnecessary type hint

* Invert tuple order in `StringStore.items()`

* Add test for `StringStore.items()`

* Revert "`Morphology`: Use `StringStore.items()` to enumerate features when pickling"

This reverts commit 1af9510ceb.

* Rename `keys` and `key_map`

* Add `keys()` and `values()`

* Add comment about the inverted key-value semantics in the API

* Fix type hints

* Implement `keys()`, `values()`, `items()` without generators

* Fix type hints, remove unnecessary boxing

* Update docs

* Simplify `keys/values/items()` impl

* `mypy` fix

* Fix error message, doc fixes
2022-10-06 10:51:06 +02:00
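A small sketch of the hash-to-string mapping the refactor tightens up; the example strings are arbitrary, and the dict-style views in the comments are the ones introduced by this PR:

from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
apple_hash = stringstore["apple"]   # str -> hash
print(stringstore[apple_hash])      # hash -> str, prints "apple"

# Per this PR, keys()/values()/items() dict-style views are available;
# note the commit's remark about their inverted key/value semantics.
print(list(stringstore.keys())[:2])
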
Sofie Van Landeghem
c6704f368c
Merge pull request #11572 from svlandeg/copy_v4
update v4 with latest from master and develop
2022-10-03 15:30:55 +02:00
svlandeg
d4922f25fc fix test for EL activations with refactored KB 2022-10-03 14:41:15 +02:00
svlandeg
e3027c65b8 Merge branch 'copy_develop' into copy_v4 2022-10-03 14:12:16 +02:00
Sofie Van Landeghem
5157e4e823
disable mypy run for Python 3.10 (#11508) (#11512) 2022-09-15 17:06:58 +02:00
Daniël de Kok
efdbb722c5
Store activations in Docs when save_activations is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add a `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958f70.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back `store_activations` setter.

* Remove `TrainablePipe.activations`

We do not need to enumerate the activations anymore since `store_activations` is
`bool`.

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was used when activations were still `Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730d16.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 09:51:12 +02:00
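A rough sketch of how the option might be used from user code; the model name is a placeholder, and the activation keys follow the renaming described above ("probabilities", "label_ids"):

import spacy

nlp = spacy.load("en_core_web_sm")   # placeholder pipeline name
tagger = nlp.get_pipe("tagger")
tagger.save_activations = True       # plain bool property per this PR

doc = nlp("This is a test.")
# Activations are stored on the Doc, keyed by component name.
print(doc.activations["tagger"].keys())  # e.g. "probabilities", "label_ids"
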
Madeesh Kannan
60c050e82b
Merge pull request #11448 from shadeMe/merge-develop-into-v4
Merge `develop` into `v4`
2022-09-07 13:26:11 +02:00
shademe
977b847cce
Merge branch 'develop' into merge-develop-into-v4 2022-09-07 11:35:47 +02:00
Adriane Boyd
4a615cacd2
Consolidate and freeze symbols (#11352)
* Consolidate and freeze symbols

Instead of having symbol values defined in three potentially conflicting
places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define
all symbols in `spacy.symbols` and reference those values in
`spacy.attrs` and `spacy.parts_of_speech`.

Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather
than lists in order to support future use of hash values in `attr_id_t`.

Minor changes:

* Use `uint64_t` for attrs in `Doc.to_array` to support future use of
hash values
* Remove unneeded attrs filter for error message in `Doc.to_array`
* Remove unused attr `SENT_END`

* Handle dynamic size of attr_id_t in Doc.to_array

* Undo added warnings

* Refactor to make Doc.to_array more similar to Doc.from_array

* Improve refactoring
2022-09-02 09:08:40 +02:00
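A small sketch of what the consolidation means in practice: the IDs exposed by spacy.attrs and spacy.parts_of_speech now reference the values defined in spacy.symbols.

from spacy import attrs, parts_of_speech, symbols

# After this change the three modules agree on the underlying values.
assert attrs.ORTH == symbols.ORTH
assert parts_of_speech.NOUN == symbols.NOUN
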
Paul O'Leary McCann
698b8b495f
Update/remove old Matcher syntax (#11370)
* Clean up old Matcher call style related stuff

In v2 Matcher.add was called with (key, on_match, *patterns). In v3 this
was changed to (key, patterns, *, on_match=None), but there were various
points where the old call syntax was documented or handled specially.
This removes all those.

The Matcher itself didn't need any code changes, as it just gives a
generic type error. However the PhraseMatcher required some changes
because it would automatically "fix" the old call style.

Surprisingly, the tokenizer was still using the old call style in one
place.

After these changes tests failed in two places:

1. one test for the "new" call style, including the "old" call style. I
   removed this test.
2. deserializing the PhraseMatcher fails because the input docs are a
   set.

I am not sure why 2 is happening - I guess it's a quirk of the
serialization format? - so for now I just convert the set to a list when
deserializing. The check that the input Docs are a List in the
PhraseMatcher is a new check, but makes it parallel with the other
Matchers, which seemed like the right thing to do.

* Add notes related to input docs / deserialization type

* Remove Typing import

* Remove old note about call style change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Use separate method for setting internal doc representations

In addition to the title change, this changes the internal dict to be a
defaultdict, instead of a dict with frequent use of setdefault.

* Add _add_from_arrays for unpickling

* Cleanup around adding from arrays

This moves adding to internal structures into the private batch method,
and removes the single-add method.

This has one behavioral change for `add`, in that if something is wrong
with the list of input Docs (such as one of the items not being a Doc),
valid items before the invalid one will not be added. Also the callback
will not be updated if anything is invalid. This change should not be
significant.

This also adds a test to check failure when given a non-Doc.

* Update spacy/matcher/phrasematcher.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2022-08-30 15:40:31 +02:00
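For reference, a minimal sketch of the surviving call style next to the removed one; the pattern is arbitrary:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]

# v2 style, no longer handled anywhere: matcher.add("HELLO", None, pattern)
# v3+ style: patterns as a list, callback keyword-only.
matcher.add("HELLO", [pattern], on_match=None)

doc = nlp("hello world")
print(matcher(doc))
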
Adriane Boyd
98a916e01a
Make stable private modules public and adjust names (#11353)
* Make stable private modules public and adjust names

* `spacy.ml._character_embed` -> `spacy.ml.character_embed`
* `spacy.ml._precomputable_affine` -> `spacy.ml.precomputable_affine`
* `spacy.tokens._serialize` -> `spacy.tokens.doc_bin`
* `spacy.tokens._retokenize` -> `spacy.tokens.retokenize`
* `spacy.tokens._dict_proxies` -> `spacy.tokens.span_groups`

* Skip _precomputable_affine

* retokenize -> retokenizer

* Fix imports
2022-08-30 13:56:35 +02:00
Adriane Boyd
4bce8fa755
Remove setup_requires from setup.cfg (#11384)
* Remove setup_requires from setup.cfg

* Update requirements test to ignore cython in setup.cfg
2022-08-29 13:23:24 +02:00
Adriane Boyd
2a558a7cdc
Switch to mecab-ko as default Korean tokenizer (#11294)
* Switch to mecab-ko as default Korean tokenizer

Switch to the (confusingly-named) mecab-ko python module for default Korean
tokenization.

Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.

* Temporarily run tests with mecab-ko tokenizer

* Fix types

* Fix duplicate test names

* Update requirements test

* Revert "Temporarily run tests with mecab-ko tokenizer"

This reverts commit d2083e7044.

* Add mecab_args setting, fix pickle for KoreanNattoTokenizer

* Fix length check

* Update docs

* Formatting

* Update natto-py error message

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2022-08-26 10:11:18 +02:00
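A sketch of keeping the previous tokenizer via the config override; the registered name comes from this commit, the config path is the standard tokenizer setting, and both variants need the respective Korean dependencies installed:

import spacy

# Default: the mecab-ko backed tokenizer.
nlp = spacy.blank("ko")

# Opt back into the natto-py based tokenizer.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
nlp_natto = spacy.blank("ko", config=config)
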
Adriane Boyd
1eb7ce5ef7
Merge pull request #11377 from adrianeboyd/chore/update-v4-from-develop-2
Update v4 from develop
2022-08-25 08:26:55 +02:00
Adriane Boyd
740c33fe58 Merge remote-tracking branch 'upstream/develop' into chore/update-v4-from-develop 2022-08-24 20:43:07 +02:00
Sofie Van Landeghem
8dd1fa9896
Merge pull request #11366 from adrianeboyd/chore/update-v4-from-master
Update v4 from master
2022-08-24 09:45:55 +02:00
Adriane Boyd
c44d243f25 Merge remote-tracking branch 'upstream/master' into chore/update-v4-from-master 2022-08-24 07:15:41 +02:00
Adriane Boyd
bb0e178878
Make Span/Doc.ents more consistent for ent_kb_id and ent_id (#11328)
* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view
of the root token's `ent_id` annotation
2022-08-22 20:28:57 +02:00
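A minimal sketch of the id round-trip this makes consistent; the id value is arbitrary:

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Alice visited Berlin")

ent = Span(doc, 2, 3, label="GPE", span_id="Q64")
doc.ents = [ent]

# Setting Doc.ents now maps Span.id to Token.ent_id in all cases,
# and Span.ent_id is an alias of Span.id rather than a read-only view.
print(doc[2].ent_id_)   # "Q64"
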
Sofie Van Landeghem
1a5be63715
Cleanup Cython structs (#11337)
* cleanup Tokenizer fields

* remove unused object from vocab

* remove IS_OOV_DEPRECATED

* add back in as FLAG13

* FLAG 18 instead

* import fix

* fix clumsy fingers

* revert symbol changes in favor of #11352

* bint instead of bool
2022-08-22 15:52:24 +02:00
Adriane Boyd
d757dec5c4
Remove intify_attrs(_do_deprecated) (#11319) 2022-08-17 12:13:54 +02:00
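The helper itself stays available without the deprecated escape hatch; a minimal sketch (the attribute values are arbitrary, and string values are only interned when a strings_map is passed):

from spacy.attrs import intify_attrs

token_attrs = {"ORTH": "spaCy", "LEMMA": "spacy"}
# Keys are converted to their integer attribute IDs.
print(intify_attrs(token_attrs))
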
antonpibm
551e73ccfc
Match private networks as URLs (#11121) 2022-08-11 11:26:26 +02:00
Sofie Van Landeghem
5d54c0e32a
Rename modules for consistency (#11286)
* rename Python module to entity_ruler

* rename Python module to attribute_ruler
2022-08-10 11:44:05 +02:00
Daniël de Kok
e581eeac34
precompute_hiddens/Parser: look up CPU ops once (v4) (#11068)
* precompute_hiddens/Parser: look up CPU ops once

* precompute_hiddens: make cpu_ops private
2022-07-29 15:12:19 +02:00
Daniël de Kok
b2d05f9f66
Merge pull request #11242 from danieldk/merge-master-v4-20220728
Merge `master` into `v4`
2022-07-29 09:17:02 +02:00
Daniël de Kok
1ff683a50b Merge remote-tracking branch 'upstream/master' into merge-master-v4-20220728 2022-07-28 13:53:59 +02:00
Madeesh Kannan
ba18d2913d
Morphology/Morphologizer optimizations and refactoring (#11024)
* `Morphology`: Refactor to use C types, reduce allocations, remove unused code

* `Morphologizer`: Avoid unnecessary sorting of morphological features

* `Morphologizer`: Remove excessive reallocations of labels, improve hash lookups of labels, coerce `numpy` numeric types to native ints
Update docs

* Remove unused method

* Replace `unique_ptr` usage with `shared_ptr`

* Add type annotations to internal Python methods, rename `hash` variable, fix typos

* Add comment to clarify implementation detail

* Fix return type

* `Morphology`: Stop early when splitting fields and values
2022-07-15 11:14:08 +02:00
Daniël de Kok
851a7ca4fa
Merge pull request #11035 from danieldk/merge-master-v4-20220627-2
Merge `master` into `v4`
2022-06-27 19:20:21 +02:00
Daniël de Kok
1605ef7319 Merge remote-tracking branch 'upstream/master' into merge-master-v4-20220627-2 2022-06-27 17:45:45 +02:00
Daniël de Kok
7f3842f54d
Merge pull request #10936 from danieldk/merge-master-v4-20220609
Merge `master` into `v4`
2022-06-09 12:47:13 +02:00
Daniël de Kok
2f05c6824c Merge remote-tracking branch 'upstream/master' into merge-master-v4-20220609 2022-06-09 10:18:25 +02:00
Adriane Boyd
10b7223021
Merge pull request #10816 from danieldk/v4-merge-master-20220518
Merge `master` into `v4`
2022-05-20 09:56:06 +02:00
Daniël de Kok
5586fd9311 Merge remote-tracking branch 'upstream/master' into v4-merge-master-20220518 2022-05-18 11:34:54 +02:00
Adriane Boyd
0e71bd973f
Return doc offsets in Matcher on spans (#10576)
The returned match offsets were only adjusted for `as_spans`, not
generally. Because the `on_match` callbacks are always applied to the
doc, the `Matcher` matches on spans should consistently use the doc
offsets.
2022-04-15 15:34:58 +02:00
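A minimal sketch of the now-consistent behavior; the sentence and pattern are arbitrary:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("CITY", [[{"LOWER": "berlin"}]])

doc = nlp("Alice visited Berlin last year")
span = doc[1:4]
for match_id, start, end in matcher(span):
    # start/end index into the doc, not the span, so they can be used on the doc directly.
    print(doc[start:end].text)
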
Daniël de Kok
75f7c15187
Span/SpanGroup: wrap SpanC in shared_ptr (#9869)
* Span/SpanGroup: wrap SpanC in shared_ptr

When a Span that was retrieved from a SpanGroup was modified, these
changes were not reflected in the SpanGroup because the underlying
SpanC struct was copied.

This change applies the solution proposed by @nrodnova, to wrap SpanC in
a shared_ptr. This makes a SpanGroup and Spans derived from it share the
same SpanC. So, changes made through a Span are visible in the SpanGroup
as well.

Fixes #9556

* Test that a SpanGroup is modified through its Spans

* SpanGroup.push_back: remove nogil

Modifying std::vector is not thread-safe.

* C++ >= 11 does not allow const T in vector<T>

* Add Span.span_c as a shorthand for Span.c.get

Since this method is cdef'ed, it is only visible from Cython, so we
avoid using raw pointers in Python

Replace existing uses of span.c.get() to use this new method.

* Fix formatting

* Style fix: pointer types

* SpanGroup.to_bytes: reduce number of shared_ptr::get calls

* Mark SpanGroup modification test with issue

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-01-12 13:38:52 +01:00
247 changed files with 7487 additions and 4544 deletions

View File

@@ -30,7 +30,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.9"
       - name: black
         run: |
@@ -59,11 +59,7 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python_version: ["3.12"]
         include:
-          - os: windows-latest
-            python_version: "3.7"
-          - os: macos-latest
-            python_version: "3.8"
-          - os: ubuntu-latest
+          - os: ubuntu-20.04
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"
@@ -93,7 +89,6 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
-        if: matrix.python_version != '3.7'
       - name: Delete source directory and .egg-info
         run: |
@@ -115,22 +110,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"
-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+      # - name: "Test download CLI"
+      #   run: |
+      #     python -m spacy download ca_core_news_sm
+      #     python -m spacy download ca_core_news_md
+      #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test download_url in info CLI"
+      #   run: |
+      #     python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test no warnings on load (#11713)"
+      #   run: |
+      #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #   if: matrix.python_version == '3.9'
       - name: "Test convert CLI"
         run: |
@@ -154,17 +149,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'
-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+      # - name: "Test assemble CLI"
+      #   run: |
+      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test assemble CLI vectors warning"
+      #   run: |
+      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #   if: matrix.python_version == '3.9'
       - name: "Install test requirements"
         run: |
@@ -173,10 +168,4 @@ jobs:
       - name: "Run CPU tests"
         run: |
           python -m pytest --pyargs spacy -W error
-        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
-
-      - name: "Run CPU tests with thinc-apple-ops"
-        run: |
-          python -m pip install 'spacy[apple]'
-          python -m pytest --pyargs spacy
-        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
+        if: matrix.python_version == '3.11'

View File

@@ -25,7 +25,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.9"
       - name: Validate website/meta/universe.json
         run: |

View File

@@ -3,8 +3,8 @@ repos:
     rev: 22.3.0
     hooks:
       - id: black
-        language_version: python3.7
-        additional_dependencies: ['click==8.0.4']
+        language_version: python3.9
+        additional_dependencies: ["click==8.0.4"]
   - repo: https://github.com/pycqa/flake8
     rev: 5.0.4
     hooks:

View File

@@ -276,7 +276,7 @@ except:  # noqa: E722

 ### Python conventions

-All Python code must be written **compatible with Python 3.6+**. More detailed
+All Python code must be written **compatible with Python 3.9+**. More detailed
 code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).

 #### I/O and handling paths

View File

@@ -5,7 +5,7 @@ override SPACY_EXTRAS = spacy-lookups-data==1.0.3
 endif

 ifndef PYVER
-override PYVER = 3.8
+override PYVER = 3.9
 endif

 VENV := ./env$(PYVER)

View File

@@ -33,7 +33,7 @@ open-source software, released under the

 ## 📖 Documentation

 | Documentation |  |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
 | 📚 **[Usage Guides]** | How to use spaCy and its features. |
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |

@@ -115,7 +115,7 @@ For detailed installation instructions, see the

 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 3.7+ (only 64 bit)
+- **Python version**: Python 3.9+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)

 [pip]: https://pypi.org/project/spacy/

View File

@@ -1,6 +1,2 @@
 # build version constraints for use with wheelwright
-numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'
-numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
-numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy>=1.25.0; python_version>='3.9'

View File

@@ -31,7 +31,6 @@ These are repos that can be used by spaCy but aren't part of a default installation
 - [spacy-stanza](https://github.com/explosion/spacy-stanza): This is a wrapper that allows the use of Stanford's Stanza library in spaCy.
 - [spacy-streamlit](https://github.com/explosion/spacy-streamlit): A wrapper for the Streamlit dashboard building library to help with integrating [displaCy](https://spacy.io/api/top-level/#displacy).
 - [spacymoji](https://github.com/explosion/spacymoji): A library to add extra support for emoji to spaCy, such as including character names.
-- [thinc-apple-ops](https://github.com/explosion/thinc-apple-ops): A special backend for OSX that uses Apple's native libraries for improved performance.
 - [os-signpost](https://github.com/explosion/os-signpost): A Python package that allows you to use the `OSSignposter` API in OSX for performance analysis.
 - [spacy-ray](https://github.com/explosion/spacy-ray): A wrapper to integrate spaCy with Ray, a distributed training framework. Currently a work in progress.
@@ -79,4 +78,3 @@ Repos that don't fit in any of the above categories.
 - [tokenizations](https://github.com/explosion/tokenizations): A library originally by Yohei Tamura to align strings with tolerance to some variations in features like case and diacritics, used for aligning tokens and wordpieces. Adopted and maintained by Explosion, but usually spacy-alignments is used instead.
 - [conll-2012](https://github.com/explosion/conll-2012): A repo to hold some slightly cleaned up versions of the official scripts for the CoNLL 2012 shared task involving coreference resolution. Used in the coref project.
 - [fastapi-explosion-extras](https://github.com/explosion/fastapi-explosion-extras): Some small tweaks to FastAPI used at Explosion.

View File

@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.2.2,<8.3.0",
+    "thinc>=9.0.0,<9.1.0",
     "numpy>=1.15.0; python_version < '3.9'",
     "numpy>=1.25.0; python_version >= '3.9'",
 ]

View File

@@ -1,9 +1,9 @@
 # Our libraries
-spacy-legacy>=3.0.11,<3.1.0
+spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.2.2,<8.3.0
+thinc>=9.0.0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
@@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
@@ -31,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests

View File

@@ -17,8 +17,6 @@ classifiers =
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.7
-    Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
@@ -31,26 +29,15 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.7
-# NOTE: This section is superseded by pyproject.toml and will be removed in
-# spaCy v4
-setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0; python_version < "3.9"
-    numpy>=1.19.0; python_version >= "3.9"
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.2.2,<8.3.0
+python_requires = >=3.9
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.11,<3.1.0
+    spacy-legacy>=4.0.0.dev1,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.2.2,<8.3.0
+    thinc>=9.0.0,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
@@ -66,7 +53,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0

 [options.entry_points]
@@ -116,14 +102,12 @@ cuda12x =
     cupy-cuda12x>=11.5.0,<13.0.0
 cuda-autodetect =
     cupy-wheel>=11.0.0,<13.0.0
-apple =
-    thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    mecab-ko>=1.0.0
 th =
     pythainlp>=2.0

View File

@@ -37,7 +37,6 @@ MOD_NAMES = [
     "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.multitask",
     "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
@@ -48,6 +47,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.arc_eager",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
+    "spacy.pipeline._parser_internals.search",
     "spacy.pipeline._parser_internals._state",
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
@@ -61,12 +61,13 @@ MOD_NAMES = [
     "spacy.tokens.span_group",
     "spacy.tokens.graph",
     "spacy.tokens.morphanalysis",
-    "spacy.tokens._retokenize",
+    "spacy.tokens.retokenizer",
     "spacy.matcher.matcher",
     "spacy.matcher.phrasematcher",
     "spacy.matcher.dependencymatcher",
     "spacy.symbols",
     "spacy.vectors",
+    "spacy.tests.parser._search",
 ]
 COMPILE_OPTIONS = {
     "msvc": ["/Ox", "/EHsc"],

View File

@@ -1,5 +1,9 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.4"
+__version__ = "4.0.0.dev3"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
+__projects__ = "https://github.com/explosion/projects"
+__projects_branch__ = "v3"
+__lookups_tag__ = "v1.0.3"
+__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"

View File

@ -1,99 +1,50 @@
# Reserve 64 values for flag features
from . cimport symbols from . cimport symbols
cdef enum attr_id_t: cdef enum attr_id_t:
NULL_ATTR NULL_ATTR = 0
IS_ALPHA IS_ALPHA = symbols.IS_ALPHA
IS_ASCII IS_ASCII = symbols.IS_ASCII
IS_DIGIT IS_DIGIT = symbols.IS_DIGIT
IS_LOWER IS_LOWER = symbols.IS_LOWER
IS_PUNCT IS_PUNCT = symbols.IS_PUNCT
IS_SPACE IS_SPACE = symbols.IS_SPACE
IS_TITLE IS_TITLE = symbols.IS_TITLE
IS_UPPER IS_UPPER = symbols.IS_UPPER
LIKE_URL LIKE_URL = symbols.LIKE_URL
LIKE_NUM LIKE_NUM = symbols.LIKE_NUM
LIKE_EMAIL LIKE_EMAIL = symbols.LIKE_EMAIL
IS_STOP IS_STOP = symbols.IS_STOP
IS_OOV_DEPRECATED IS_BRACKET = symbols.IS_BRACKET
IS_BRACKET IS_QUOTE = symbols.IS_QUOTE
IS_QUOTE IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT
IS_LEFT_PUNCT IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT
IS_RIGHT_PUNCT IS_CURRENCY = symbols.IS_CURRENCY
IS_CURRENCY
FLAG19 = 19 ID = symbols.ID
FLAG20 ORTH = symbols.ORTH
FLAG21 LOWER = symbols.LOWER
FLAG22 NORM = symbols.NORM
FLAG23 SHAPE = symbols.SHAPE
FLAG24 PREFIX = symbols.PREFIX
FLAG25 SUFFIX = symbols.SUFFIX
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
ID LENGTH = symbols.LENGTH
ORTH CLUSTER = symbols.CLUSTER
LOWER LEMMA = symbols.LEMMA
NORM POS = symbols.POS
SHAPE TAG = symbols.TAG
PREFIX DEP = symbols.DEP
SUFFIX ENT_IOB = symbols.ENT_IOB
ENT_TYPE = symbols.ENT_TYPE
HEAD = symbols.HEAD
SENT_START = symbols.SENT_START
SPACY = symbols.SPACY
PROB = symbols.PROB
LENGTH LANG = symbols.LANG
CLUSTER
LEMMA
POS
TAG
DEP
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB
LANG
ENT_KB_ID = symbols.ENT_KB_ID ENT_KB_ID = symbols.ENT_KB_ID
MORPH MORPH = symbols.MORPH
ENT_ID = symbols.ENT_ID ENT_ID = symbols.ENT_ID
IDX IDX = symbols.IDX
SENT_END

View File

@ -17,57 +17,11 @@ IDS = {
"LIKE_NUM": LIKE_NUM, "LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL, "LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP, "IS_STOP": IS_STOP,
"IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
"IS_BRACKET": IS_BRACKET, "IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE, "IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT, "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT, "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"IS_CURRENCY": IS_CURRENCY, "IS_CURRENCY": IS_CURRENCY,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,
"FLAG22": FLAG22,
"FLAG23": FLAG23,
"FLAG24": FLAG24,
"FLAG25": FLAG25,
"FLAG26": FLAG26,
"FLAG27": FLAG27,
"FLAG28": FLAG28,
"FLAG29": FLAG29,
"FLAG30": FLAG30,
"FLAG31": FLAG31,
"FLAG32": FLAG32,
"FLAG33": FLAG33,
"FLAG34": FLAG34,
"FLAG35": FLAG35,
"FLAG36": FLAG36,
"FLAG37": FLAG37,
"FLAG38": FLAG38,
"FLAG39": FLAG39,
"FLAG40": FLAG40,
"FLAG41": FLAG41,
"FLAG42": FLAG42,
"FLAG43": FLAG43,
"FLAG44": FLAG44,
"FLAG45": FLAG45,
"FLAG46": FLAG46,
"FLAG47": FLAG47,
"FLAG48": FLAG48,
"FLAG49": FLAG49,
"FLAG50": FLAG50,
"FLAG51": FLAG51,
"FLAG52": FLAG52,
"FLAG53": FLAG53,
"FLAG54": FLAG54,
"FLAG55": FLAG55,
"FLAG56": FLAG56,
"FLAG57": FLAG57,
"FLAG58": FLAG58,
"FLAG59": FLAG59,
"FLAG60": FLAG60,
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID, "ID": ID,
"ORTH": ORTH, "ORTH": ORTH,
"LOWER": LOWER, "LOWER": LOWER,
@ -93,12 +47,11 @@ IDS = {
} }
# ATTR IDs, in order of the symbol NAMES = {v: k for k, v in IDS.items()}
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
locals().update(IDS) locals().update(IDS)
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): def intify_attrs(stringy_attrs, strings_map=None):
""" """
Normalize a dictionary of attributes, converting them to ints. Normalize a dictionary of attributes, converting them to ints.
@ -110,75 +63,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
converted to ints. converted to ints.
""" """
inty_attrs = {} inty_attrs = {}
if _do_deprecated:
if "F" in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
if "L" in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if "morph" in stringy_attrs:
morphs = stringy_attrs.pop("morph") # no-cython-lint
if "number" in stringy_attrs:
stringy_attrs.pop("number")
if "tenspect" in stringy_attrs:
stringy_attrs.pop("tenspect")
morph_keys = [
"PunctType",
"PunctSide",
"Other",
"Degree",
"AdvType",
"Number",
"VerbForm",
"PronType",
"Aspect",
"Tense",
"PartType",
"Poss",
"Hyph",
"ConjType",
"NumType",
"Foreign",
"VerbType",
"NounType",
"Gender",
"Mood",
"Negative",
"Tense",
"Voice",
"Abbr",
"Derivation",
"Echo",
"Foreign",
"NameType",
"NounType",
"NumForm",
"NumValue",
"PartType",
"Polite",
"StyleVariant",
"PronType",
"AdjType",
"Person",
"Variant",
"AdpType",
"Reflex",
"Negative",
"Mood",
"Aspect",
"Case",
"Polarity",
"PrepCase",
"Animacy", # U20
]
for key in morph_keys:
if key in stringy_attrs:
stringy_attrs.pop(key)
elif key.lower() in stringy_attrs:
stringy_attrs.pop(key.lower())
elif key.upper() in stringy_attrs:
stringy_attrs.pop(key.upper())
for name, value in stringy_attrs.items(): for name, value in stringy_attrs.items():
int_key = intify_attr(name) int_key = intify_attr(name)
if int_key is not None: if int_key is not None:

View File

@@ -14,6 +14,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
+from .distill import distill  # noqa: F401
 from .download import download  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .find_function import find_function  # noqa: F401

View File

@@ -11,6 +11,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,
@@ -28,7 +29,7 @@ from wasabi import Printer, msg
 from weasel import app as project_cli

 from .. import about
-from ..compat import Literal
+from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import validate
 from ..util import (
     ENV_VARS,
@@ -148,6 +149,16 @@ def _parse_override(value: Any) -> Any:
         return str(value)

+def _handle_renamed_language_codes(lang: Optional[str]) -> None:
+    # Throw error for renamed language codes in v4
+    if lang in RENAMED_LANGUAGE_CODES:
+        msg.fail(
+            title="Renamed language code",
+            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
+            exits=1,
+        )
+
 @contextmanager
 def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,
@@ -192,6 +203,13 @@ def show_validation_error(
         msg.fail("Config validation error", e, exits=1)

+def import_code_paths(code_paths: str) -> None:
+    """Helper to import comma-separated list of code paths."""
+    code_paths = [Path(p.strip()) for p in string_to_list(code_paths)]
+    for code_path in code_paths:
+        import_code(code_path)
+
 def import_code(code_path: Optional[Union[Path, str]]) -> None:
     """Helper to import Python file provided in training commands / commands
     using the config. This makes custom registered functions available.

View File

@ -11,7 +11,7 @@ from ._util import (
Arg, Arg,
Opt, Opt,
app, app,
import_code, import_code_paths,
parse_config_overrides, parse_config_overrides,
show_validation_error, show_validation_error,
) )
@ -26,7 +26,7 @@ def assemble_cli(
ctx: typer.Context, # This is only used to read additional arguments ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"), output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
# fmt: on # fmt: on
): ):
@ -46,7 +46,7 @@ def assemble_cli(
if not config_path or (str(config_path) != "-" and not config_path.exists()): if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1) msg.fail("Config file not found", config_path, exits=1)
overrides = parse_config_overrides(ctx.args) overrides = parse_config_overrides(ctx.args)
import_code(code_path) import_code_paths(code_path)
with show_validation_error(config_path): with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides, interpolate=False) config = util.load_config(config_path, overrides=overrides, interpolate=False)
msg.divider("Initializing pipeline") msg.divider("Initializing pipeline")

View File

@ -16,7 +16,7 @@ from ..training.converters import (
iob_to_docs, iob_to_docs,
json_to_docs, json_to_docs,
) )
from ._util import Arg, Opt, app, walk_directory from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory
# Converters are matched by file extension except for ner/iob, which are # Converters are matched by file extension except for ner/iob, which are
# matched by file extension and content. To add a converter, add a new # matched by file extension and content. To add a converter, add a new
@ -116,6 +116,10 @@ def convert(
input_path = Path(input_path) input_path = Path(input_path)
if not msg: if not msg:
msg = Printer(no_print=silent) msg = Printer(no_print=silent)
# Throw error for renamed language codes in v4
_handle_renamed_language_codes(lang)
ner_map = srsly.read_json(ner_map) if ner_map is not None else None ner_map = srsly.read_json(ner_map) if ner_map is not None else None
doc_files = [] doc_files = []
for input_loc in walk_directory(input_path, converter): for input_loc in walk_directory(input_path, converter):

View File

@ -13,7 +13,7 @@ from ._util import (
Arg, Arg,
Opt, Opt,
debug_cli, debug_cli,
import_code, import_code_paths,
parse_config_overrides, parse_config_overrides,
show_validation_error, show_validation_error,
) )
@ -27,7 +27,7 @@ def debug_config_cli(
# fmt: off # fmt: off
ctx: typer.Context, # This is only used to read additional arguments ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"), show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.") show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
# fmt: on # fmt: on
@ -44,7 +44,7 @@ def debug_config_cli(
DOCS: https://spacy.io/api/cli#debug-config DOCS: https://spacy.io/api/cli#debug-config
""" """
overrides = parse_config_overrides(ctx.args) overrides = parse_config_overrides(ctx.args)
import_code(code_path) import_code_paths(code_path)
debug_config( debug_config(
config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
) )

View File

@ -7,6 +7,7 @@ from typing import (
Dict, Dict,
Iterable, Iterable,
List, List,
Literal,
Optional, Optional,
Sequence, Sequence,
Set, Set,
@ -22,7 +23,6 @@ import typer
from wasabi import MESSAGES, Printer, msg from wasabi import MESSAGES, Printer, msg
from .. import util from .. import util
from ..compat import Literal
from ..language import Language from ..language import Language
from ..morphology import Morphology from ..morphology import Morphology
from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
@ -40,7 +40,7 @@ from ._util import (
_format_number, _format_number,
app, app,
debug_cli, debug_cli,
import_code, import_code_paths,
parse_config_overrides, parse_config_overrides,
show_validation_error, show_validation_error,
) )
@ -72,7 +72,7 @@ def debug_data_cli(
# fmt: off # fmt: off
ctx: typer.Context, # This is only used to read additional arguments ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"), verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"), no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
@ -92,7 +92,7 @@ def debug_data_cli(
"--help for an overview of the other available debugging commands." "--help for an overview of the other available debugging commands."
) )
overrides = parse_config_overrides(ctx.args) overrides = parse_config_overrides(ctx.args)
import_code(code_path) import_code_paths(code_path)
debug_data( debug_data(
config_path, config_path,
config_overrides=overrides, config_overrides=overrides,
@ -1073,8 +1073,7 @@ def _get_distribution(docs, normalize: bool = True) -> Counter:
word_counts: Counter = Counter() word_counts: Counter = Counter()
for doc in docs: for doc in docs:
for token in doc: for token in doc:
# Normalize the text t = token.text.lower()
t = token.text.lower().replace("``", '"').replace("''", '"')
word_counts[t] += 1 word_counts[t] += 1
if normalize: if normalize:
total = sum(word_counts.values(), 0.0) total = sum(word_counts.values(), 0.0)

spacy/cli/distill.py (new file)

View File

@@ -0,0 +1,98 @@
import logging
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Union
import typer
from wasabi import msg
from .. import util
from ..pipeline.trainable_pipe import TrainablePipe
from ..schemas import ConfigSchemaDistill
from ..training.initialize import init_nlp_student
from ..training.loop import distill as distill_nlp
from ._util import (
Arg,
Opt,
app,
import_code_paths,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@app.command(
"distill",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def distill_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
teacher_model: str = Arg(..., help="Teacher model name or path"),
student_config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""
Distill a spaCy pipeline from a teacher model.
DOCS: https://spacy.io/api/cli#distill
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
import_code_paths(code_path)
distill(
teacher_model,
student_config_path,
output_path,
use_gpu=use_gpu,
overrides=overrides,
)
def distill(
teacher_model: Union[str, Path],
student_config_path: Union[str, Path],
output_path: Optional[Union[str, Path]] = None,
*,
use_gpu: int = -1,
overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
student_config_path = util.ensure_path(student_config_path)
output_path = util.ensure_path(output_path)
# Make sure all files and paths exist if they are needed
if not student_config_path or (
str(student_config_path) != "-" and not student_config_path.exists()
):
msg.fail("Student config file not found", student_config_path, exits=1)
if not output_path:
msg.info("No output directory provided")
else:
if not output_path.exists():
output_path.mkdir(parents=True)
msg.good(f"Created output directory: {output_path}")
msg.info(f"Saving to output directory: {output_path}")
setup_gpu(use_gpu)
teacher = util.load_model(teacher_model)
with show_validation_error(student_config_path):
config = util.load_config(
student_config_path, overrides=overrides, interpolate=False
)
msg.divider("Initializing student pipeline")
with show_validation_error(student_config_path, hint_fill=False):
student = init_nlp_student(config, teacher, use_gpu=use_gpu)
msg.good("Initialized student pipeline")
msg.divider("Distilling student pipeline from teacher")
distill_nlp(
teacher,
student,
output_path,
use_gpu=use_gpu,
stdout=sys.stdout,
stderr=sys.stderr,
)
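Based on the helper above, a rough sketch of driving distillation from Python rather than the CLI; the teacher model, config path, and output directory are placeholders:

from spacy.cli.distill import distill

distill(
    "en_core_web_lg",   # teacher model name or path (placeholder)
    "student.cfg",      # student config (placeholder)
    "output",           # output directory (placeholder)
    use_gpu=-1,
    overrides={},
)
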

View File

@ -7,9 +7,10 @@ import typer
from wasabi import msg from wasabi import msg
from .. import about from .. import about
from ..errors import OLD_MODEL_SHORTCUTS
from ..util import ( from ..util import (
get_installed_models,
get_minor_version, get_minor_version,
get_package_version,
is_in_interactive, is_in_interactive,
is_in_jupyter, is_in_jupyter,
is_package, is_package,
@ -76,15 +77,17 @@ def download(
version = components[-1] version = components[-1]
else: else:
model_name = model model_name = model
if model in OLD_MODEL_SHORTCUTS:
msg.warn(
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
)
model_name = OLD_MODEL_SHORTCUTS[model]
compatibility = get_compatibility() compatibility = get_compatibility()
version = get_version(model_name, compatibility) version = get_version(model_name, compatibility)
# If we already have this version installed, skip downloading
installed = get_installed_models()
if model_name in installed:
installed_version = get_package_version(model_name)
if installed_version == version:
msg.warn(f"{model_name} v{version} already installed, skipping")
return
filename = get_model_filename(model_name, version, sdist) filename = get_model_filename(model_name, version, sdist)
download_model(filename, pip_args) download_model(filename, pip_args)
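A rough sketch of the skip-if-installed check added above, using the same `spacy.util` helpers; the package name and version are assumed example values:

from spacy.util import get_installed_models, get_package_version

model_name, version = "en_core_web_sm", "3.7.1"   # assumed example values
if model_name in get_installed_models() and get_package_version(model_name) == version:
    print(f"{model_name} v{version} already installed, skipping")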


@ -10,7 +10,7 @@ from .. import displacy, util
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc from ..tokens import Doc
from ..training import Corpus from ..training import Corpus
from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu
@benchmark_cli.command( @benchmark_cli.command(
@ -22,7 +22,7 @@ def evaluate_cli(
model: str = Arg(..., help="Model name or path"), model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
@ -43,7 +43,7 @@ def evaluate_cli(
DOCS: https://spacy.io/api/cli#benchmark-accuracy DOCS: https://spacy.io/api/cli#benchmark-accuracy
""" """
import_code(code_path) import_code_paths(code_path)
evaluate( evaluate(
model, model,
data_path, data_path,


@ -1,3 +1,4 @@
import importlib.metadata
import json import json
import platform import platform
from pathlib import Path from pathlib import Path
@ -7,7 +8,6 @@ import srsly
from wasabi import MarkdownRenderer, Printer from wasabi import MarkdownRenderer, Printer
from .. import about, util from .. import about, util
from ..compat import importlib_metadata
from ._util import Arg, Opt, app, string_to_list from ._util import Arg, Opt, app, string_to_list
from .download import get_latest_version, get_model_filename from .download import get_latest_version, get_model_filename
@ -137,7 +137,7 @@ def info_installed_model_url(model: str) -> Optional[str]:
dist-info available. dist-info available.
""" """
try: try:
dist = importlib_metadata.distribution(model) dist = importlib.metadata.distribution(model)
text = dist.read_text("direct_url.json") text = dist.read_text("direct_url.json")
if isinstance(text, str): if isinstance(text, str):
data = json.loads(text) data = json.loads(text)


@ -9,13 +9,14 @@ from thinc.api import Config
from wasabi import Printer, diff_strings from wasabi import Printer, diff_strings
from .. import util from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList from ..util import SimpleFrozenList
from ._util import ( from ._util import (
COMMAND, COMMAND,
Arg, Arg,
Opt, Opt,
_handle_renamed_language_codes,
import_code, import_code,
init_cli, init_cli,
show_validation_error, show_validation_error,
@ -50,7 +51,7 @@ class InitValues:
def init_config_cli( def init_config_cli(
# fmt: off # fmt: off
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"), lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."), gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
@ -90,6 +91,7 @@ def init_fill_config_cli(
# fmt: off # fmt: off
base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False), base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True), output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"), diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
@ -105,13 +107,20 @@ def init_fill_config_cli(
DOCS: https://spacy.io/api/cli#init-fill-config DOCS: https://spacy.io/api/cli#init-fill-config
""" """
import_code(code_path) import_code(code_path)
fill_config(output_file, base_path, pretraining=pretraining, diff=diff) fill_config(
output_file,
base_path,
distillation=distillation,
pretraining=pretraining,
diff=diff,
)
def fill_config( def fill_config(
output_file: Path, output_file: Path,
base_path: Path, base_path: Path,
*, *,
distillation: bool = False,
pretraining: bool = False, pretraining: bool = False,
diff: bool = False, diff: bool = False,
silent: bool = False, silent: bool = False,
@ -130,6 +139,9 @@ def fill_config(
# replaced with their actual config after loading, so we have to re-add them # replaced with their actual config after loading, so we have to re-add them
sourced = util.get_sourced_components(config) sourced = util.get_sourced_components(config)
filled["components"].update(sourced) filled["components"].update(sourced)
if distillation:
distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
filled = distillation_config.merge(filled)
if pretraining: if pretraining:
validate_config_for_pretrain(filled, msg) validate_config_for_pretrain(filled, msg)
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
@ -165,6 +177,10 @@ def init_config(
msg = Printer(no_print=silent) msg = Printer(no_print=silent)
with TEMPLATE_PATH.open("r") as f: with TEMPLATE_PATH.open("r") as f:
template = Template(f.read()) template = Template(f.read())
# Throw error for renamed language codes in v4
_handle_renamed_language_codes(lang)
# Filter out duplicates since tok2vec and transformer are added by template # Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
defaults = RECOMMENDATIONS["__default__"] defaults = RECOMMENDATIONS["__default__"]
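A minimal sketch of filling a base config with the new [distillation] defaults via the updated `fill_config` helper shown above; the file paths are hypothetical:

from pathlib import Path
from spacy.cli.init_config import fill_config

fill_config(
    Path("configs/student_filled.cfg"),   # output file (hypothetical path)
    Path("configs/student_base.cfg"),     # base config to fill (hypothetical path)
    distillation=True,                    # merge the default [distillation] block
    pretraining=False,
    diff=False,
)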


@ -12,6 +12,7 @@ from ..training.initialize import convert_vectors, init_nlp
from ._util import ( from ._util import (
Arg, Arg,
Opt, Opt,
_handle_renamed_language_codes,
import_code, import_code,
init_cli, init_cli,
parse_config_overrides, parse_config_overrides,
@ -29,7 +30,6 @@ def init_vectors_cli(
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"), attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
@ -39,8 +39,11 @@ def init_vectors_cli(
you can use in the [initialize] block of your config to initialize you can use in the [initialize] block of your config to initialize
a model with vectors. a model with vectors.
""" """
if verbose: util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
util.logger.setLevel(logging.DEBUG)
# Throw error for renamed language codes in v4
_handle_renamed_language_codes(lang)
msg.info(f"Creating blank nlp object for language '{lang}'") msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)() nlp = util.get_lang_class(lang)()
if jsonl_loc is not None: if jsonl_loc is not None:
@ -50,7 +53,6 @@ def init_vectors_cli(
vectors_loc, vectors_loc,
truncate=truncate, truncate=truncate,
prune=prune, prune=prune,
name=name,
mode=mode, mode=mode,
attr=attr, attr=attr,
) )


@ -1,3 +1,4 @@
import importlib.metadata
import os import os
import re import re
import shutil import shutil
@ -13,7 +14,6 @@ from thinc.api import Config
from wasabi import MarkdownRenderer, Printer, get_raw_input from wasabi import MarkdownRenderer, Printer, get_raw_input
from .. import about, util from .. import about, util
from ..compat import importlib_metadata
from ..schemas import ModelMetaSchema, validate from ..schemas import ModelMetaSchema, validate
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@ -23,7 +23,7 @@ def package_cli(
# fmt: off # fmt: off
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False), input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"), code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package"),
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"), create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
@ -250,9 +250,9 @@ def has_build() -> bool:
# in an editable install), so an import check is not sufficient; instead # in an editable install), so an import check is not sufficient; instead
# check that there is a package version # check that there is a package version
try: try:
importlib_metadata.version("build") importlib.metadata.version("build")
return True return True
except importlib_metadata.PackageNotFoundError: # type: ignore[attr-defined] except importlib.metadata.PackageNotFoundError: # type: ignore[attr-defined]
return False return False
@ -352,7 +352,6 @@ def get_meta(
"width": nlp.vocab.vectors_length, "width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors), "vectors": len(nlp.vocab.vectors),
"keys": nlp.vocab.vectors.n_keys, "keys": nlp.vocab.vectors.n_keys,
"name": nlp.vocab.vectors.name,
} }
if about.__title__ != "spacy": if about.__title__ != "spacy":
meta["parent_package"] = about.__title__ meta["parent_package"] = about.__title__


@ -11,7 +11,7 @@ from ._util import (
Arg, Arg,
Opt, Opt,
app, app,
import_code, import_code_paths,
parse_config_overrides, parse_config_overrides,
setup_gpu, setup_gpu,
show_validation_error, show_validation_error,
@ -27,7 +27,7 @@ def pretrain_cli(
ctx: typer.Context, # This is only used to read additional arguments ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True), config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@ -56,7 +56,7 @@ def pretrain_cli(
DOCS: https://spacy.io/api/cli#pretrain DOCS: https://spacy.io/api/cli#pretrain
""" """
config_overrides = parse_config_overrides(ctx.args) config_overrides = parse_config_overrides(ctx.args)
import_code(code_path) import_code_paths(code_path)
verify_cli_args(config_path, output_dir, resume_path, epoch_resume) verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
setup_gpu(use_gpu) setup_gpu(use_gpu)
msg.info(f"Loading config from: {config_path}") msg.info(f"Loading config from: {config_path}")


@ -238,7 +238,7 @@ grad_factor = 1.0
{% if "entity_linker" in components -%} {% if "entity_linker" in components -%}
[components.entity_linker] [components.entity_linker]
factory = "entity_linker" factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} get_candidates = {"@misc":"spacy.CandidateGenerator.v2"}
incl_context = true incl_context = true
incl_prior = true incl_prior = true
@ -517,7 +517,7 @@ width = ${components.tok2vec.model.encode.width}
{% if "entity_linker" in components -%} {% if "entity_linker" in components -%}
[components.entity_linker] [components.entity_linker]
factory = "entity_linker" factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} get_candidates = {"@misc":"spacy.CandidateGenerator.v2"}
incl_context = true incl_context = true
incl_prior = true incl_prior = true


@ -13,7 +13,7 @@ from ._util import (
Arg, Arg,
Opt, Opt,
app, app,
import_code, import_code_paths,
parse_config_overrides, parse_config_overrides,
setup_gpu, setup_gpu,
show_validation_error, show_validation_error,
@ -28,7 +28,7 @@ def train_cli(
ctx: typer.Context, # This is only used to read additional arguments ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on # fmt: on
@ -50,7 +50,7 @@ def train_cli(
if verbose: if verbose:
util.logger.setLevel(logging.DEBUG) util.logger.setLevel(logging.DEBUG)
overrides = parse_config_overrides(ctx.args) overrides = parse_config_overrides(ctx.args)
import_code(code_path) import_code_paths(code_path)
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)


@ -23,19 +23,6 @@ try:
except ImportError: except ImportError:
cupy = None cupy = None
if sys.version_info[:2] >= (3, 8): # Python 3.8+
from typing import Literal, Protocol, runtime_checkable
else:
from typing_extensions import Literal, Protocol, runtime_checkable # noqa: F401
# Important note: The importlib_metadata "backport" includes functionality
# that's not part of the built-in importlib.metadata. We should treat this
# import like the built-in and only use what's available there.
try: # Python 3.8+
import importlib.metadata as importlib_metadata
except ImportError:
from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401
from thinc.api import Optimizer # noqa: F401 from thinc.api import Optimizer # noqa: F401
pickle = pickle pickle = pickle


@ -0,0 +1,34 @@
[paths]
raw_text = null
[distillation]
corpus = "corpora.distillation"
dropout = 0.1
max_epochs = 1
max_steps = 0
student_to_teacher = {}
[distillation.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 3000
discard_oversize = false
tolerance = 0.2
[distillation.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 1e-4
[corpora]
[corpora.distillation]
@readers = "spacy.PlainTextCorpus.v1"
path = ${paths.raw_text}
min_length = 0
max_length = 0
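The [corpora.distillation] block above uses the "spacy.PlainTextCorpus.v1" reader, which yields one example per non-empty line of a plain-text file. A minimal sketch, assuming the reader's class lives in spacy.training.corpus and the file path is hypothetical:

import spacy
from spacy.training.corpus import PlainTextCorpus

nlp = spacy.blank("en")
corpus = PlainTextCorpus("corpus/raw_text.txt", min_length=0, max_length=0)  # hypothetical path
for example in corpus(nlp):
    print(example.reference.text)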


@ -1,6 +1,7 @@
import warnings import warnings
from typing import Literal
from .compat import Literal from . import about
class ErrorsWithCodes(type): class ErrorsWithCodes(type):
@ -83,7 +84,7 @@ class Warnings(metaclass=ErrorsWithCodes):
"ignoring the duplicate entry.") "ignoring the duplicate entry.")
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
"incorrect. Modify PhraseMatcher._terminal_hash to fix.") "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
"the Knowledge Base.") "the Knowledge Base.")
W026 = ("Unable to set all sentence boundaries from dependency parses. If " W026 = ("Unable to set all sentence boundaries from dependency parses. If "
"you are constructing a parse tree incrementally by setting " "you are constructing a parse tree incrementally by setting "
@ -104,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
"table. This may degrade the performance of the model to some " "table. This may degrade the performance of the model to some "
"degree. If this is intentional or the language you're using " "degree. If this is intentional or the language you're using "
"doesn't have a normalization table, please ignore this warning. " "doesn't have a normalization table, please ignore this warning. "
"If this is surprising, make sure you have the spacy-lookups-data " "If this is surprising, make sure you are loading the table in "
"package installed and load the table in your config. The " "your config. The languages with lexeme normalization tables are "
"languages with lexeme normalization tables are currently: " "currently: {langs}\n\nAn example of how to load a table in "
"{langs}\n\nLoad the table in your config with:\n\n" "your config :\n\n"
"[initialize.lookups]\n" "[initialize.lookups]\n"
"@misc = \"spacy.LookupsDataLoader.v1\"\n" "@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
"lang = ${{nlp.lang}}\n" "lang = ${{nlp.lang}}\n"
f'url = "{about.__lookups_url__}"\n'
"tables = [\"lexeme_norm\"]\n") "tables = [\"lexeme_norm\"]\n")
W035 = ("Discarding subpattern '{pattern}' due to an unrecognized " W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
"attribute or operator.") "attribute or operator.")
@ -132,13 +134,6 @@ class Warnings(metaclass=ErrorsWithCodes):
"and make it independent. For example, `replace_listeners = " "and make it independent. For example, `replace_listeners = "
"[\"model.tok2vec\"]` See the documentation for details: " "[\"model.tok2vec\"]` See the documentation for details: "
"https://spacy.io/usage/training#config-components-listeners") "https://spacy.io/usage/training#config-components-listeners")
W088 = ("The pipeline component {name} implements a `begin_training` "
"method, which won't be called by spaCy. As of v3.0, `begin_training` "
"has been renamed to `initialize`, so you likely want to rename the "
"component method. See the documentation for details: "
"https://spacy.io/api/language#initialize")
W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
"to `nlp.initialize`.")
W090 = ("Could not locate any {format} files in path '{path}'.") W090 = ("Could not locate any {format} files in path '{path}'.")
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@ -222,6 +217,11 @@ class Warnings(metaclass=ErrorsWithCodes):
W126 = ("These keys are unsupported: {unsupported}") W126 = ("These keys are unsupported: {unsupported}")
W127 = ("Not all `Language.pipe` worker processes completed successfully") W127 = ("Not all `Language.pipe` worker processes completed successfully")
# v4 warning strings
W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
"lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
"to return `True` in `.supports_prior_probs`.")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}") E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@ -256,9 +256,7 @@ class Errors(metaclass=ErrorsWithCodes):
"https://spacy.io/usage/models") "https://spacy.io/usage/models")
E011 = ("Unknown operator: '{op}'. Options: {opts}") E011 = ("Unknown operator: '{op}'. Options: {opts}")
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}") E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
E016 = ("MultitaskObjective target should be function or one of: dep, " E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
"tag, ent, dep_tag_offset, ent_tag.")
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
E018 = ("Can't retrieve string for hash '{hash_value}'. This usually " E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
"refers to an issue with the `Vocab` or `StringStore`.") "refers to an issue with the `Vocab` or `StringStore`.")
E019 = ("Can't create transition with unknown action ID: {action}. Action " E019 = ("Can't create transition with unknown action ID: {action}. Action "
@ -470,13 +468,13 @@ class Errors(metaclass=ErrorsWithCodes):
"same, but found '{nlp}' and '{vocab}' respectively.") "same, but found '{nlp}' and '{vocab}' respectively.")
E152 = ("The attribute {attr} is not supported for token patterns. " E152 = ("The attribute {attr} is not supported for token patterns. "
"Please use the option `validate=True` with the Matcher, PhraseMatcher, " "Please use the option `validate=True` with the Matcher, PhraseMatcher, "
"EntityRuler or AttributeRuler for more details.") "SpanRuler or AttributeRuler for more details.")
E153 = ("The value type {vtype} is not supported for token patterns. " E153 = ("The value type {vtype} is not supported for token patterns. "
"Please use the option validate=True with Matcher, PhraseMatcher, " "Please use the option validate=True with Matcher, PhraseMatcher, "
"EntityRuler or AttributeRuler for more details.") "SpanRuler or AttributeRuler for more details.")
E154 = ("One of the attributes or values is not supported for token " E154 = ("One of the attributes or values is not supported for token "
"patterns. Please use the option `validate=True` with the Matcher, " "patterns. Please use the option `validate=True` with the Matcher, "
"PhraseMatcher, or EntityRuler for more details.") "PhraseMatcher, or SpanRuler for more details.")
E155 = ("The pipeline needs to include a {pipe} in order to use " E155 = ("The pipeline needs to include a {pipe} in order to use "
"Matcher or PhraseMatcher with the attribute {attr}. " "Matcher or PhraseMatcher with the attribute {attr}. "
"Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` " "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
@ -500,7 +498,7 @@ class Errors(metaclass=ErrorsWithCodes):
"Current DocBin: {current}\nOther DocBin: {other}") "Current DocBin: {current}\nOther DocBin: {other}")
E169 = ("Can't find module: {module}") E169 = ("Can't find module: {module}")
E170 = ("Cannot apply transition {name}: invalid for the current state.") E170 = ("Cannot apply transition {name}: invalid for the current state.")
E171 = ("Matcher.add received invalid 'on_match' callback argument: expected " E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
"callable or None, but got: {arg_type}") "callable or None, but got: {arg_type}")
E175 = ("Can't remove rule for unknown match pattern ID: {key}") E175 = ("Can't remove rule for unknown match pattern ID: {key}")
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
@ -739,13 +737,6 @@ class Errors(metaclass=ErrorsWithCodes):
"method in component '{name}'. If you want to use this " "method in component '{name}'. If you want to use this "
"method, make sure it's overwritten on the subclass.") "method, make sure it's overwritten on the subclass.")
E940 = ("Found NaN values in scores.") E940 = ("Found NaN values in scores.")
E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
"load the model, use its full name instead:\n\n"
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
"models, see the models directory: https://spacy.io/models and if "
"you want to create a blank model, use spacy.blank: "
"nlp = spacy.blank(\"{name}\")")
E942 = ("Executing `after_{name}` callback failed. Expected the function to " E942 = ("Executing `after_{name}` callback failed. Expected the function to "
"return an initialized nlp object but got: {value}. Maybe " "return an initialized nlp object but got: {value}. Maybe "
"you forgot to return the modified object in your function?") "you forgot to return the modified object in your function?")
@ -759,7 +750,7 @@ class Errors(metaclass=ErrorsWithCodes):
"loaded nlp object, but got: {source}") "loaded nlp object, but got: {source}")
E947 = ("`Matcher.add` received invalid `greedy` argument: expected " E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
"a string value from {expected} but got: '{arg}'") "a string value from {expected} but got: '{arg}'")
E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
"a list, but got: {arg_type}") "a list, but got: {arg_type}")
E949 = ("Unable to align tokens for the predicted and reference docs. It " E949 = ("Unable to align tokens for the predicted and reference docs. It "
"is only possible to align the docs when both texts are the same " "is only possible to align the docs when both texts are the same "
@ -933,8 +924,6 @@ class Errors(metaclass=ErrorsWithCodes):
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.") "Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
"exist.")
E1024 = ("A pattern with {attr_type} '{label}' is not present in " E1024 = ("A pattern with {attr_type} '{label}' is not present in "
"'{component}' patterns.") "'{component}' patterns.")
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only " E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
@ -945,7 +934,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1029 = ("Edit tree cannot be applied to form.") E1029 = ("Edit tree cannot be applied to form.")
E1030 = ("Edit tree identifier out of range.") E1030 = ("Edit tree identifier out of range.")
E1031 = ("Could not find gold transition - see logs above.") E1031 = ("Could not find gold transition - see logs above.")
E1032 = ("`{var}` should not be {forbidden}, but received {value}.") E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
E1033 = ("Dimension {name} invalid -- only nO, nF, nP") E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
E1034 = ("Node index {i} out of bounds ({length})") E1034 = ("Node index {i} out of bounds ({length})")
E1035 = ("Token index {i} out of bounds ({length})") E1035 = ("Token index {i} out of bounds ({length})")
@ -962,7 +951,6 @@ class Errors(metaclass=ErrorsWithCodes):
"case pass an empty list for the previously not specified argument to avoid this error.") "case pass an empty list for the previously not specified argument to avoid this error.")
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got " E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
"{value}.") "{value}.")
E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
E1045 = ("Encountered {parent} subclass without `{parent}.{method}` " E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
"method in '{name}'. If you want to use this method, make " "method in '{name}'. If you want to use this method, make "
"sure it's overwritten on the subclass.") "sure it's overwritten on the subclass.")
@ -989,15 +977,35 @@ class Errors(metaclass=ErrorsWithCodes):
"reduction. Please enable one of `use_reduce_first`, " "reduction. Please enable one of `use_reduce_first`, "
"`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.") "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
# v4 error strings
E4000 = ("Expected a Doc as input, but got: '{type}'")
E4001 = ("Expected input to be one of the following types: ({expected_types}), "
"but got '{received_type}'")
E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
E4003 = ("Training examples for distillation must have the exact same tokens in the "
"reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.")
E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
"{existing_value}.")
E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
"[initialize] or in registered lookups (spacy-lookups-data). An "
"example for how to load lemmatizer tables in [initialize]:\n\n"
"[initialize.components]\n\n"
"[initialize.components.{pipe_name}]\n\n"
"[initialize.components.{pipe_name}.lookups]\n"
'@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
"lang = ${{nlp.lang}}\n"
f'url = "{about.__lookups_url__}"\n'
"tables = {tables}\n"
"# or required tables only: tables = {required_tables}\n")
E4011 = ("Server error ({status_code}), couldn't fetch {url}")
# Deprecated model shortcuts, only used in errors and warnings
OLD_MODEL_SHORTCUTS = {
"en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
"pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
"nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
"lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
}
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
# fmt: on # fmt: on


@ -1,11 +1,10 @@
from .candidate import Candidate, get_candidates, get_candidates_batch from .candidate import Candidate, InMemoryCandidate
from .kb import KnowledgeBase from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB from .kb_in_memory import InMemoryLookupKB
__all__ = [ __all__ = [
"Candidate", "Candidate",
"KnowledgeBase", "KnowledgeBase",
"InMemoryCandidate",
"InMemoryLookupKB", "InMemoryLookupKB",
"get_candidates",
"get_candidates_batch",
] ]


@ -1,15 +1,17 @@
from libcpp.vector cimport vector from libcpp.vector cimport vector
from ..typedefs cimport hash_t from ..typedefs cimport hash_t
from .kb cimport KnowledgeBase from .kb_in_memory cimport InMemoryLookupKB
# Object used by the Entity Linker that summarizes one entity-alias candidate
# combination.
cdef class Candidate: cdef class Candidate:
cdef readonly KnowledgeBase kb pass
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector cdef class InMemoryCandidate(Candidate):
cdef hash_t alias_hash cdef readonly hash_t _entity_hash
cdef float prior_prob cdef readonly hash_t _alias_hash
cdef vector[float] _entity_vector
cdef float _prior_prob
cdef readonly InMemoryLookupKB _kb
cdef float _entity_freq


@ -1,90 +1,98 @@
# cython: infer_types=True # cython: infer_types=True
from typing import Iterable from .kb_in_memory cimport InMemoryLookupKB
from .kb cimport KnowledgeBase from ..errors import Errors
from ..tokens import Span
cdef class Candidate: cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or """A `Candidate` object refers to a textual mention that may or may not be resolved
may not be resolved to a specific `entity` from a Knowledge Base. This to a specific entity from a Knowledge Base. This will be used as input for the entity linking
will be used as input for the entity linking algorithm which will algorithm which will disambiguate the various candidates to the correct one.
disambiguate the various candidates to the correct one. Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
Each candidate (alias, entity) pair is assigned a certain prior probability. is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init DOCS: https://spacy.io/api/kb/#candidate-init
""" """
def __init__(self):
# Make sure abstract Candidate is not instantiated.
if self.__class__ == Candidate:
raise TypeError(
Errors.E1046.format(cls_name=self.__class__.__name__)
)
@property
def entity_id(self) -> int:
"""RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
otherwise the hash of the entity ID string)."""
raise NotImplementedError
@property
def entity_id_(self) -> str:
"""RETURNS (str): String representation of entity ID."""
raise NotImplementedError
@property
def entity_vector(self) -> vector[float]:
"""RETURNS (vector[float]): Entity vector."""
raise NotImplementedError
cdef class InMemoryCandidate(Candidate):
"""Candidate for InMemoryLookupKB."""
def __init__( def __init__(
self, self,
KnowledgeBase kb, kb: InMemoryLookupKB,
entity_hash, entity_hash: int,
entity_freq, alias_hash: int,
entity_vector, entity_vector: vector[float],
alias_hash, prior_prob: float,
prior_prob entity_freq: float
): ):
self.kb = kb """
self.entity_hash = entity_hash kb (InMemoryLookupKB): InMemoryLookupKB instance.
self.entity_freq = entity_freq entity_id (int): Entity ID as hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__().
self.entity_vector = entity_vector entity_freq (int): Entity frequency in KB corpus.
self.alias_hash = alias_hash entity_vector (List[float]): Entity embedding.
self.prior_prob = prior_prob alias_hash (int): Alias hash.
prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
the context, this alias - which matches one of this entity's aliases - resolves to one this entity.
"""
super().__init__()
self._entity_hash = entity_hash
self._entity_vector = entity_vector
self._prior_prob = prior_prob
self._kb = kb
self._alias_hash = alias_hash
self._entity_freq = entity_freq
@property @property
def entity(self) -> int: def entity_id(self) -> int:
"""RETURNS (uint64): hash of the entity's KB ID/name""" return self._entity_hash
return self.entity_hash
@property @property
def entity_(self) -> str: def entity_vector(self) -> vector[float]:
"""RETURNS (str): ID/name of this entity in the KB""" return self._entity_vector
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self) -> int:
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self) -> str:
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
def entity_freq(self) -> float:
return self.entity_freq
@property
def entity_vector(self) -> Iterable[float]:
return self.entity_vector
@property @property
def prior_prob(self) -> float: def prior_prob(self) -> float:
return self.prior_prob """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
this entity."""
return self._prior_prob
@property
def alias(self) -> str:
"""RETURNS (str): Alias."""
return self._kb.vocab.strings[self._alias_hash]
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: @property
""" def entity_id_(self) -> str:
Return candidate entities for a given mention and fetching appropriate return self._kb.vocab.strings[self._entity_hash]
entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
return kb.get_candidates(mention)
@property
def get_candidates_batch( def entity_freq(self) -> float:
kb: KnowledgeBase, mentions: Iterable[Span] """RETURNS (float): Entity frequency in KB corpus."""
) -> Iterable[Iterable[Candidate]]: return self._entity_freq
"""
Return candidate entities for the given mentions and fetching appropriate entries
from the index.
kb (KnowledgeBase): Knowledge base to query.
mention (Iterable[Span]): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return kb.get_candidates_batch(mentions)


@ -1,14 +1,14 @@
# cython: infer_types=True # cython: infer_types=True
from pathlib import Path from pathlib import Path
from typing import Iterable, Tuple, Union from typing import Iterable, Iterator, Tuple, Union
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from ..errors import Errors from ..errors import Errors
from ..tokens import Span from ..tokens import SpanGroup
from ..util import SimpleFrozenList from ..util import SimpleFrozenList
from .candidate import Candidate from .candidate cimport Candidate
cdef class KnowledgeBase: cdef class KnowledgeBase:
@ -19,6 +19,8 @@ cdef class KnowledgeBase:
DOCS: https://spacy.io/api/kb DOCS: https://spacy.io/api/kb
""" """
CandidatesForMentionT = Iterable[Candidate]
CandidatesForDocT = Iterable[CandidatesForMentionT]
def __init__(self, vocab: Vocab, entity_vector_length: int): def __init__(self, vocab: Vocab, entity_vector_length: int):
"""Create a KnowledgeBase.""" """Create a KnowledgeBase."""
@ -32,27 +34,15 @@ cdef class KnowledgeBase:
self.entity_vector_length = entity_vector_length self.entity_vector_length = entity_vector_length
self.mem = Pool() self.mem = Pool()
def get_candidates_batch( def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[CandidatesForDocT]:
self, mentions: Iterable[Span]
) -> Iterable[Iterable[Candidate]]:
""" """
Return candidate entities for specified texts. Each candidate defines Return candidate entities for the specified groups of mentions (as SpanGroup) per Doc.
the entity, the original alias, and the prior probability of that Each candidate for a mention defines at least the entity and the entity's embedding vector. Depending on the KB
alias resolving to that entity. implementation, further properties - such as the prior probability of the specified mention text resolving to
If no candidate is found for a given text, an empty list is returned. that entity - might be included.
mentions (Iterable[Span]): Mentions for which to get candidates. If no candidates are found for a given mention, an empty list is returned.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. mentions (Iterator[SpanGroup]): Mentions for which to get candidates.
""" RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per mention/doc/doc batch.
return [self.get_candidates(span) for span in mentions]
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for specified text. Each candidate defines
the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If the no candidate is found for a given text, an empty list is returned.
mention (Span): Mention for which to get candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
""" """
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format( Errors.E1045.format(
@ -128,3 +118,10 @@ cdef class KnowledgeBase:
parent="KnowledgeBase", method="from_disk", name=self.__name__ parent="KnowledgeBase", method="from_disk", name=self.__name__
) )
) )
@property
def supports_prior_probs(self) -> bool:
"""RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
)
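A rough sketch of what a custom KnowledgeBase subclass is expected to provide after this change - the doc-wise streaming `get_candidates` and the new `supports_prior_probs` - under the assumption that the subclass is written in plain Python; the `_lookup` helper is hypothetical and not part of the spaCy API:

from typing import Iterable, Iterator
from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import SpanGroup

class MyKB(KnowledgeBase):
    def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[Iterable[Iterable[Candidate]]]:
        # One SpanGroup of mention spans per Doc; yield one list of candidate lists per Doc.
        for mentions_for_doc in mentions:
            yield [self._lookup(span.text) for span in mentions_for_doc]

    @property
    def supports_prior_probs(self) -> bool:
        return False  # this hypothetical KB exposes no prior probabilities

    def _lookup(self, mention_text: str) -> Iterable[Candidate]:
        ...  # hypothetical lookup helper, to be implemented by the subclass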


@ -1,5 +1,5 @@
# cython: infer_types=True # cython: infer_types=True
from typing import Any, Callable, Dict, Iterable from typing import Any, Callable, Dict, Iterable, Iterator
import srsly import srsly
@ -12,7 +12,7 @@ from preshed.maps cimport PreshMap
import warnings import warnings
from pathlib import Path from pathlib import Path
from ..tokens import Span from ..tokens import SpanGroup
from ..typedefs cimport hash_t from ..typedefs cimport hash_t
@ -23,7 +23,7 @@ from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab from ..vocab cimport Vocab
from .kb cimport KnowledgeBase from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate from .candidate import InMemoryCandidate
cdef class InMemoryLookupKB(KnowledgeBase): cdef class InMemoryLookupKB(KnowledgeBase):
@ -255,10 +255,11 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_entry.probs = probs alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry self._aliases_table[alias_index] = alias_entry
def get_candidates(self, mention: Span) -> Iterable[Candidate]: def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[Iterable[Iterable[InMemoryCandidate]]]:
return self.get_alias_candidates(mention.text) # type: ignore for mentions_for_doc in mentions:
yield [self._get_alias_candidates(span.text) for span in mentions_for_doc]
def get_alias_candidates(self, str alias) -> Iterable[Candidate]: def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
""" """
Return candidate entities for an alias. Each candidate defines the Return candidate entities for an alias. Each candidate defines the
entity, the original alias, and the prior probability of that alias entity, the original alias, and the prior probability of that alias
@ -271,18 +272,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_index = <int64_t>self._alias_index.get(alias_hash) alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
return [Candidate(kb=self, return [
InMemoryCandidate(
kb=self,
entity_hash=self._entries[entry_index].entity_hash, entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[
self._entries[entry_index].vector_index
],
alias_hash=alias_hash, alias_hash=alias_hash,
prior_prob=prior_prob) entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
for (entry_index, prior_prob) in zip( prior_prob=prior_prob,
alias_entry.entry_indices, alias_entry.probs entity_freq=self._entries[entry_index].freq
) )
if entry_index != 0] for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0
]
def get_vector(self, str entity): def get_vector(self, str entity):
cdef hash_t entity_hash = self.vocab.strings[entity] cdef hash_t entity_hash = self.vocab.strings[entity]
@ -316,6 +317,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return 0.0 return 0.0
def supports_prior_probs(self) -> bool:
return True
def to_bytes(self, **kwargs): def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
""" """


@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):
class Icelandic(Language): class Icelandic(Language):
lang = "is" lang = "isl"
Defaults = IcelandicDefaults Defaults = IcelandicDefaults


@ -17,21 +17,100 @@ DEFAULT_CONFIG = """
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer" @tokenizers = "spacy.ko.KoreanTokenizer"
mecab_args = ""
""" """
@registry.tokenizers("spacy.ko.KoreanTokenizer") @registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer(): def create_tokenizer(mecab_args: str):
def korean_tokenizer_factory(nlp): def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp.vocab) return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)
return korean_tokenizer_factory return korean_tokenizer_factory
class KoreanTokenizer(DummyTokenizer): class KoreanTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
self.vocab = vocab
mecab = try_mecab_import()
self.mecab_tokenizer = mecab.Tagger(mecab_args)
def __reduce__(self):
return KoreanTokenizer, (self.vocab,)
def __call__(self, text: str) -> Doc:
dtokens = list(self.detailed_tokens(text))
surfaces = [dt["surface"] for dt in dtokens]
doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
for token, dtoken in zip(doc, dtokens):
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
if token.tag_ in TAG_MAP:
token.pos = TAG_MAP[token.tag_][POS]
else:
token.pos = X
token.lemma_ = dtoken["lemma"]
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
return doc
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
for line in self.mecab_tokenizer.parse(text).split("\n"):
if line == "EOS":
break
surface, _, expr = line.partition("\t")
features = expr.split("/")[0].split(",")
tag = features[0]
lemma = "*"
if len(features) >= 8:
lemma = features[7]
if lemma == "*":
lemma = surface
yield {"surface": surface, "lemma": lemma, "tag": tag}
def score(self, examples):
validate_examples(examples, "KoreanTokenizer.score")
return Scorer.score_tokenization(examples)
class KoreanDefaults(BaseDefaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
infixes = TOKENIZER_INFIXES
class Korean(Language):
lang = "ko"
Defaults = KoreanDefaults
def try_mecab_import():
try:
import mecab_ko as MeCab
return MeCab
except ImportError:
raise ImportError(
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
"the python package `mecab-ko`: pip install mecab-ko"
) from None
@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
def create_natto_tokenizer():
def korean_natto_tokenizer_factory(nlp):
return KoreanNattoTokenizer(nlp.vocab)
return korean_natto_tokenizer_factory
class KoreanNattoTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab): def __init__(self, vocab: Vocab):
self.vocab = vocab self.vocab = vocab
self._mecab = try_mecab_import() # type: ignore[func-returns-value] self._mecab = self._try_mecab_import() # type: ignore[func-returns-value]
self._mecab_tokenizer = None self._mecab_tokenizer = None
@property @property
@ -47,7 +126,7 @@ class KoreanTokenizer(DummyTokenizer):
return self._mecab_tokenizer return self._mecab_tokenizer
def __reduce__(self): def __reduce__(self):
return KoreanTokenizer, (self.vocab,) return KoreanNattoTokenizer, (self.vocab,)
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
dtokens = list(self.detailed_tokens(text)) dtokens = list(self.detailed_tokens(text))
@ -74,7 +153,7 @@ class KoreanTokenizer(DummyTokenizer):
feature = node.feature feature = node.feature
tag, _, expr = feature.partition(",") tag, _, expr = feature.partition(",")
lemma, _, remainder = expr.partition("/") lemma, _, remainder = expr.partition("/")
if lemma == "*": if lemma == "*" or lemma == "":
lemma = surface lemma = surface
yield {"surface": surface, "lemma": lemma, "tag": tag} yield {"surface": surface, "lemma": lemma, "tag": tag}
@ -82,28 +161,14 @@ class KoreanTokenizer(DummyTokenizer):
validate_examples(examples, "KoreanTokenizer.score") validate_examples(examples, "KoreanTokenizer.score")
return Scorer.score_tokenization(examples) return Scorer.score_tokenization(examples)
def _try_mecab_import(self):
class KoreanDefaults(BaseDefaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
infixes = TOKENIZER_INFIXES
class Korean(Language):
lang = "ko"
Defaults = KoreanDefaults
def try_mecab_import() -> None:
try: try:
from natto import MeCab from natto import MeCab
return MeCab return MeCab
except ImportError: except ImportError:
raise ImportError( raise ImportError(
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires ' 'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)" "and [natto-py](https://github.com/buruzaemon/natto-py)"


@ -3,10 +3,10 @@ from ...language import Language
class MultiLanguage(Language): class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages. """Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'. This module allows models to specify their language ID as 'mul'.
""" """
lang = "xx" lang = "mul"
__all__ = ["MultiLanguage"] __all__ = ["MultiLanguage"]


@ -16,10 +16,6 @@ URL_PATTERN = (
r"(?:\S+(?::\S*)?@)?" r"(?:\S+(?::\S*)?@)?"
r"(?:" r"(?:"
# IP address exclusion # IP address exclusion
# private & local networks
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
# IP address dotted notation octets # IP address dotted notation octets
# excludes loopback network 0.0.0.0 # excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0 # excludes reserved space >= 224.0.0.0


@ -31,7 +31,7 @@ segmenter = "char"
[initialize] [initialize]
[initialize.tokenizer] [initialize.tokenizer]
pkuseg_model = null pkuseg_model = "spacy_ontonotes"
pkuseg_user_dict = "default" pkuseg_user_dict = "default"
""" """


@ -18,6 +18,7 @@ from typing import (
Iterable, Iterable,
Iterator, Iterator,
List, List,
Literal,
NoReturn, NoReturn,
Optional, Optional,
Pattern, Pattern,
@ -34,7 +35,6 @@ import srsly
from thinc.api import Config, CupyOps, Optimizer, get_current_ops from thinc.api import Config, CupyOps, Optimizer, get_current_ops
from . import about, ty, util from . import about, ty, util
from .compat import Literal
from .errors import Errors, Warnings from .errors import Errors, Warnings
from .git_info import GIT_VERSION from .git_info import GIT_VERSION
from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@ -52,7 +52,7 @@ from .scorer import Scorer
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
from .tokens import Doc from .tokens import Doc
from .tokens.underscore import Underscore from .tokens.underscore import Underscore
from .training import Example, validate_examples from .training import Example, validate_distillation_examples, validate_examples
from .training.initialize import init_tok2vec, init_vocab from .training.initialize import init_tok2vec, init_vocab
from .util import ( from .util import (
_DEFAULT_EMPTY_PIPES, _DEFAULT_EMPTY_PIPES,
@ -74,6 +74,9 @@ PipeCallable = Callable[[Doc], Doc]
# This is the base config will all settings (training etc.) # This is the base config will all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg" DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH) DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
# This is the base config for the [distillation] block and currently not included
# in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg"
# This is the base config for the [pretraining] block and currently not included # This is the base config for the [pretraining] block and currently not included
# in the main config and only added via the 'init fill-config' command # in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg" DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
@ -127,13 +130,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory return tokenizer_factory
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
lookups = load_lookups(lang=lang, tables=tables)
return lookups
class Language: class Language:
"""A text-processing pipeline. Usually you'll load this once per process, """A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application. and pass the instance around your application.
@ -198,8 +194,7 @@ class Language:
if not isinstance(vocab, Vocab) and vocab is not True: if not isinstance(vocab, Vocab) and vocab is not True:
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
if vocab is True: if vocab is True:
vectors_name = meta.get("vectors", {}).get("name") vocab = create_vocab(self.lang, self.Defaults)
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
if not create_vectors: if not create_vectors:
vectors_cfg = {"vectors": self._config["nlp"]["vectors"]} vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
create_vectors = registry.resolve(vectors_cfg)["vectors"] create_vectors = registry.resolve(vectors_cfg)["vectors"]
@ -257,7 +252,6 @@ class Language:
"width": self.vocab.vectors_length, "width": self.vocab.vectors_length,
"vectors": len(self.vocab.vectors), "vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys, "keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name,
"mode": self.vocab.vectors.mode, "mode": self.vocab.vectors.mode,
} }
self._meta["labels"] = dict(self.pipe_labels) self._meta["labels"] = dict(self.pipe_labels)
@ -768,8 +762,8 @@ class Language:
*, *,
before: Optional[Union[str, int]] = None, before: Optional[Union[str, int]] = None,
after: Optional[Union[str, int]] = None, after: Optional[Union[str, int]] = None,
first: Optional[bool] = None, first: Optional[Literal[True]] = None,
last: Optional[bool] = None, last: Optional[Literal[True]] = None,
source: Optional["Language"] = None, source: Optional["Language"] = None,
config: Dict[str, Any] = SimpleFrozenDict(), config: Dict[str, Any] = SimpleFrozenDict(),
raw_config: Optional[Config] = None, raw_config: Optional[Config] = None,
@ -788,8 +782,8 @@ class Language:
component directly before. component directly before.
after (Union[str, int]): Name or index of the component to insert new after (Union[str, int]): Name or index of the component to insert new
component directly after. component directly after.
first (bool): If True, insert component first in the pipeline. first (Optional[Literal[True]]): If True, insert component first in the pipeline.
last (bool): If True, insert component last in the pipeline. last (Optional[Literal[True]]): If True, insert component last in the pipeline.
source (Language): Optional loaded nlp object to copy the pipeline source (Language): Optional loaded nlp object to copy the pipeline
component from. component from.
config (Dict[str, Any]): Config parameters to use for this component. config (Dict[str, Any]): Config parameters to use for this component.
@ -835,18 +829,22 @@ class Language:
self, self,
before: Optional[Union[str, int]] = None, before: Optional[Union[str, int]] = None,
after: Optional[Union[str, int]] = None, after: Optional[Union[str, int]] = None,
first: Optional[bool] = None, first: Optional[Literal[True]] = None,
last: Optional[bool] = None, last: Optional[Literal[True]] = None,
) -> int: ) -> int:
"""Determine where to insert a pipeline component based on the before/ """Determine where to insert a pipeline component based on the before/
after/first/last values. after/first/last values.
before (str): Name or index of the component to insert directly before. before (str): Name or index of the component to insert directly before.
after (str): Name or index of component to insert directly after. after (str): Name or index of component to insert directly after.
first (bool): If True, insert component first in the pipeline. first (Optional[Literal[True]]): If True, insert component first in the pipeline.
last (bool): If True, insert component last in the pipeline. last (Optional[Literal[True]]): If True, insert component last in the pipeline.
RETURNS (int): The index of the new pipeline component. RETURNS (int): The index of the new pipeline component.
""" """
if first is not None and first is not True:
raise ValueError(Errors.E4009.format(attr="first", value=first))
if last is not None and last is not True:
raise ValueError(Errors.E4009.format(attr="last", value=last))
all_args = {"before": before, "after": after, "first": first, "last": last} all_args = {"before": before, "after": after, "first": first, "last": last}
if sum(arg is not None for arg in [before, after, first, last]) >= 2: if sum(arg is not None for arg in [before, after, first, last]) >= 2:
raise ValueError( raise ValueError(
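With the stricter Literal[True] typing and the new E4009 check above, only True (or leaving the argument unset) is accepted for first/last; a short sketch, assuming a blank pipeline:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer", name="s1", first=True)   # OK: insert at index 0
nlp.add_pipe("sentencizer", name="s2", last=True)    # OK: append at the end
nlp.add_pipe("sentencizer", name="s3", first=False)  # raises ValueError (E4009)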
@ -1056,6 +1054,116 @@ class Language:
raise ValueError(Errors.E005.format(name=name, returned_type=type(doc))) raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
return doc return doc
def distill(
self,
teacher: "Language",
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd: Union[Optimizer, None, Literal[False]] = None,
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = SimpleFrozenList(),
annotates: Iterable[str] = SimpleFrozenList(),
student_to_teacher: Optional[Dict[str, str]] = None,
):
"""Distill the models in a student pipeline from a teacher pipeline.
teacher (Language): Teacher to distill from.
examples (Iterable[Example]): Distillation examples. The reference
(teacher) and predicted (student) docs must have the same number of
tokens and the same orthography.
drop (float): The dropout rate.
sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
be created via create_optimizer if 'None'. No optimizer will
be used when set to 'False'.
losses (Optional[Dict[str, float]]): Dictionary to update with the loss,
keyed by component.
component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
for specific pipeline components, keyed by component name.
exclude (Iterable[str]): Names of components that shouldn't be updated.
annotates (Iterable[str]): Names of components that should set
annotations on the predicted examples after updating.
student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to
teacher pipe name, only needed for pipes where the student pipe
name does not match the teacher pipe name.
RETURNS (Dict[str, float]): The updated losses dictionary
DOCS: https://spacy.io/api/language#distill
"""
if student_to_teacher is None:
student_to_teacher = {}
if losses is None:
losses = {}
if isinstance(examples, list) and len(examples) == 0:
return losses
validate_distillation_examples(examples, "Language.distill")
examples = _copy_examples(examples, copy_x=True, copy_y=True)
if sgd is None:
if self._optimizer is None:
self._optimizer = self.create_optimizer()
sgd = self._optimizer
if component_cfg is None:
component_cfg = {}
pipe_kwargs = {}
for student_name, student_proc in self.pipeline:
component_cfg.setdefault(student_name, {})
pipe_kwargs[student_name] = deepcopy(component_cfg[student_name])
component_cfg[student_name].setdefault("drop", drop)
pipe_kwargs[student_name].setdefault("batch_size", self.batch_size)
teacher_pipes = dict(teacher.pipeline)
for student_name, student_proc in self.pipeline:
if student_name in annotates:
for doc, eg in zip(
_pipe(
(eg.predicted for eg in examples),
proc=student_proc,
name=student_name,
default_error_handler=self.default_error_handler,
kwargs=pipe_kwargs[student_name],
),
examples,
):
eg.predicted = doc
if (
student_name not in exclude
and isinstance(student_proc, ty.DistillableComponent)
and student_proc.is_distillable
):
# A missing teacher pipe is not an error, some student pipes
# do not need a teacher, such as tok2vec layer losses.
teacher_name = (
student_to_teacher[student_name]
if student_name in student_to_teacher
else student_name
)
teacher_pipe = teacher_pipes.get(teacher_name, None)
student_proc.distill(
teacher_pipe,
examples,
sgd=None,
losses=losses,
**component_cfg[student_name],
)
# Only finish the update after all component updates are done. Some
# components may share weights (such as tok2vec) and we only want
# to apply weight updates after all gradients are accumulated.
for student_name, student_proc in self.pipeline:
if (
student_name not in exclude
and isinstance(student_proc, ty.DistillableComponent)
and student_proc.is_distillable
and sgd not in (None, False)
):
student_proc.finish_update(sgd)
return losses
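A hedged usage sketch for the new Language.distill method; teacher_nlp, student_nlp and raw_texts are assumed to exist, and both pipelines are assumed to share a tokenizer so the reference and predicted docs line up token-for-token:

from spacy.training import Example

examples = [
    Example(student_nlp.make_doc(text), teacher_nlp.make_doc(text))
    for text in raw_texts
]
losses = student_nlp.distill(
    teacher_nlp,
    examples,
    drop=0.1,
    student_to_teacher={"ner": "ner_large"},  # only needed when pipe names differ
)
print(losses)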
def disable_pipes(self, *names) -> "DisabledPipes": def disable_pipes(self, *names) -> "DisabledPipes":
"""Disable one or more pipeline components. If used as a context """Disable one or more pipeline components. If used as a context
manager, the pipeline will be restored to the initial state at the end manager, the pipeline will be restored to the initial state at the end
@ -1144,7 +1252,7 @@ class Language:
_: Optional[Any] = None, _: Optional[Any] = None,
*, *,
drop: float = 0.0, drop: float = 0.0,
sgd: Optional[Optimizer] = None, sgd: Union[Optimizer, None, Literal[False]] = None,
losses: Optional[Dict[str, float]] = None, losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(),
@ -1155,7 +1263,9 @@ class Language:
examples (Iterable[Example]): A batch of examples examples (Iterable[Example]): A batch of examples
_: Should not be set - serves to catch backwards-incompatible scripts. _: Should not be set - serves to catch backwards-incompatible scripts.
drop (float): The dropout rate. drop (float): The dropout rate.
sgd (Optimizer): An optimizer. sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
be created via create_optimizer if 'None'. No optimizer will
be used when set to 'False'.
losses (Dict[str, float]): Dictionary to update with the loss, keyed by losses (Dict[str, float]): Dictionary to update with the loss, keyed by
component. component.
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
@ -1188,17 +1298,12 @@ class Language:
component_cfg[name].setdefault("drop", drop) component_cfg[name].setdefault("drop", drop)
pipe_kwargs[name].setdefault("batch_size", self.batch_size) pipe_kwargs[name].setdefault("batch_size", self.batch_size)
for name, proc in self.pipeline: for name, proc in self.pipeline:
# ignore statements are used here because mypy ignores hasattr
if name not in exclude and hasattr(proc, "update"):
proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore
if sgd not in (None, False):
if ( if (
name not in exclude name not in exclude
and isinstance(proc, ty.TrainableComponent) and isinstance(proc, ty.TrainableComponent)
and proc.is_trainable and proc.is_trainable
and proc.model not in (True, False, None)
): ):
proc.finish_update(sgd) proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
if name in annotates: if name in annotates:
for doc, eg in zip( for doc, eg in zip(
_pipe( _pipe(
@ -1211,6 +1316,18 @@ class Language:
examples, examples,
): ):
eg.predicted = doc eg.predicted = doc
# Only finish the update after all component updates are done. Some
# components may share weights (such as tok2vec) and we only want
# to apply weight updates after all gradients are accumulated.
for name, proc in self.pipeline:
if (
name not in exclude
and isinstance(proc, ty.TrainableComponent)
and proc.is_trainable
and sgd not in (None, False)
):
proc.finish_update(sgd)
return losses return losses
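A sketch of the sgd=False path documented above: gradients from several batches are accumulated and the optimizer step is applied once per component at the end; nlp and batches are assumed to exist:

losses = {}
optimizer = nlp.create_optimizer()
for batch in batches:                      # iterable of Example batches
    nlp.update(batch, sgd=False, losses=losses)
for name, proc in nlp.pipeline:
    if hasattr(proc, "finish_update"):
        proc.finish_update(optimizer)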
def rehearse( def rehearse(
@ -1277,25 +1394,20 @@ class Language:
sgd(key, W, dW) # type: ignore[call-arg, misc] sgd(key, W, dW) # type: ignore[call-arg, misc]
return losses return losses
def begin_training(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
) -> Optimizer:
warnings.warn(Warnings.W089, DeprecationWarning)
return self.initialize(get_examples, sgd=sgd)
def initialize( def initialize(
self, self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None, get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*, *,
labels: Optional[Dict[str, Any]] = None,
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
) -> Optimizer: ) -> Optimizer:
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Optional function that
returns gold-standard Example objects. returns gold-standard Example objects.
labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization,
using the names of the pipes as keys. Overrides labels that are in
the model configuration.
sgd (Optional[Optimizer]): An optimizer to use for updates. If not sgd (Optional[Optimizer]): An optimizer to use for updates. If not
provided, will be created using the .create_optimizer() method. provided, will be created using the .create_optimizer() method.
RETURNS (thinc.api.Optimizer): The optimizer. RETURNS (thinc.api.Optimizer): The optimizer.
@ -1343,6 +1455,8 @@ class Language:
for name, proc in self.pipeline: for name, proc in self.pipeline:
if isinstance(proc, ty.InitializableComponent): if isinstance(proc, ty.InitializableComponent):
p_settings = I["components"].get(name, {}) p_settings = I["components"].get(name, {})
if labels is not None and name in labels:
p_settings["labels"] = labels[name]
p_settings = validate_init_settings( p_settings = validate_init_settings(
proc.initialize, p_settings, section="components", name=name proc.initialize, p_settings, section="components", name=name
) )
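A sketch of the new labels argument, which overrides the label sets from the model configuration when initializing individual pipes; the pipe name and label values here are placeholders:

nlp.initialize(
    get_examples=lambda: train_examples,           # list of Example objects
    labels={"textcat": ["POSITIVE", "NEGATIVE"]},  # keyed by pipe name
)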
@ -1816,6 +1930,7 @@ class Language:
# using the nlp.config with all defaults. # using the nlp.config with all defaults.
config = util.copy_config(config) config = util.copy_config(config)
orig_pipeline = config.pop("components", {}) orig_pipeline = config.pop("components", {})
orig_distill = config.pop("distillation", None)
orig_pretraining = config.pop("pretraining", None) orig_pretraining = config.pop("pretraining", None)
config["components"] = {} config["components"] = {}
if auto_fill: if auto_fill:
@ -1824,6 +1939,9 @@ class Language:
filled = config filled = config
filled["components"] = orig_pipeline filled["components"] = orig_pipeline
config["components"] = orig_pipeline config["components"] = orig_pipeline
if orig_distill is not None:
filled["distillation"] = orig_distill
config["distillation"] = orig_distill
if orig_pretraining is not None: if orig_pretraining is not None:
filled["pretraining"] = orig_pretraining filled["pretraining"] = orig_pretraining
config["pretraining"] = orig_pretraining config["pretraining"] = orig_pretraining
@ -2176,9 +2294,6 @@ class Language:
if path.exists(): if path.exists():
data = srsly.read_json(path) data = srsly.read_json(path)
self.meta.update(data) self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
def deserialize_vocab(path: Path) -> None: def deserialize_vocab(path: Path) -> None:
if path.exists(): if path.exists():
@ -2247,9 +2362,6 @@ class Language:
def deserialize_meta(b): def deserialize_meta(b):
data = srsly.json_loads(b) data = srsly.json_loads(b)
self.meta.update(data) self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
deserializers: Dict[str, Callable[[bytes], Any]] = {} deserializers: Dict[str, Callable[[bytes], Any]] = {}
deserializers["config.cfg"] = lambda b: self.config.from_bytes( deserializers["config.cfg"] = lambda b: self.config.from_bytes(
@ -2316,13 +2428,18 @@ class DisabledPipes(list):
self[:] = [] self[:] = []
def _copy_examples(examples: Iterable[Example]) -> List[Example]: def _copy_examples(
examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False
) -> List[Example]:
"""Make a copy of a batch of examples, copying the predicted Doc as well. """Make a copy of a batch of examples, copying the predicted Doc as well.
This is used in contexts where we need to take ownership of the examples This is used in contexts where we need to take ownership of the examples
so that they can be mutated, for instance during Language.evaluate and so that they can be mutated, for instance during Language.evaluate and
Language.update. Language.update.
""" """
return [Example(eg.x.copy(), eg.y) for eg in examples] return [
Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y)
for eg in examples
]
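A small sketch of the extended helper: with copy_y=True (as used by Language.distill above) the reference docs are copied as well, so teacher annotations can be mutated without touching the originals:

copied = _copy_examples(examples, copy_x=True, copy_y=True)
assert copied[0].x is not examples[0].x
assert copied[0].y is not examples[0].y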
def _apply_pipes( def _apply_pipes(


@ -12,7 +12,6 @@ from .attrs cimport (
SUFFIX, SUFFIX,
attr_id_t, attr_id_t,
) )
from .strings cimport StringStore
from .structs cimport LexemeC from .structs cimport LexemeC
from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
from .vocab cimport Vocab from .vocab cimport Vocab


@ -19,7 +19,6 @@ class Lexeme:
def vector_norm(self) -> float: ... def vector_norm(self) -> float: ...
vector: Floats1d vector: Floats1d
rank: int rank: int
sentiment: float
@property @property
def orth_(self) -> str: ... def orth_(self) -> str: ...
@property @property


@ -57,7 +57,7 @@ cdef class Lexeme:
""" """
self.vocab = vocab self.vocab = vocab
self.orth = orth self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth) self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
if self.c.orth != orth: if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth)) raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
@ -193,20 +193,6 @@ cdef class Lexeme:
def rank(self, value): def rank(self, value):
self.c.id = value self.c.id = value
@property
def sentiment(self):
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme."""
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
return sentiment_table.get(self.c.orth, 0.0)
@sentiment.setter
def sentiment(self, float x):
if "lexeme_sentiment" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_sentiment")
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
sentiment_table[self.c.orth] = x
@property @property
def orth_(self): def orth_(self):
"""RETURNS (str): The original verbatim text of the lexeme """RETURNS (str): The original verbatim text of the lexeme


@ -2,16 +2,40 @@ from collections import OrderedDict
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
import requests
import srsly import srsly
from preshed.bloom import BloomFilter from preshed.bloom import BloomFilter
from .errors import Errors from .errors import Errors
from .strings import get_string_id from .strings import get_string_id
from .util import SimpleFrozenDict, ensure_path, load_language_data, registry from .util import SimpleFrozenDict, ensure_path, load_language_data, logger, registry
UNSET = object() UNSET = object()
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
lookups = load_lookups(lang=lang, tables=tables)
return lookups
@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
def load_lookups_data_from_url(lang, tables, url):
logger.debug(f"Loading lookups from {url}: {tables}")
lookups = Lookups()
for table in tables:
table_url = url + lang + "_" + table + ".json"
r = requests.get(table_url)
if r.status_code != 200:
raise ValueError(
Errors.E4011.format(status_code=r.status_code, url=table_url)
)
table_data = r.json()
lookups.add_table(table, table_data)
return lookups
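A hedged sketch of calling the new URL-based loader directly; the URL is a placeholder and must serve files named <lang>_<table>.json, matching the construction above:

from spacy.lookups import load_lookups_data_from_url

lookups = load_lookups_data_from_url(
    lang="en",
    tables=["lexeme_norm"],
    url="https://example.com/lookups/",  # placeholder; fetches en_lexeme_norm.json
)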
def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups": def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
"""Load the data from the spacy-lookups-data package for a given language, """Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty `Lookups` container if there's no data or if the package if available. Returns an empty `Lookups` container if there's no data or if the package


@ -175,9 +175,9 @@ cdef class DependencyMatcher:
on_match (callable): Optional callback executed on match. on_match (callable): Optional callback executed on match.
""" """
if on_match is not None and not hasattr(on_match, "__call__"): if on_match is not None and not hasattr(on_match, "__call__"):
raise ValueError(Errors.E171.format(arg_type=type(on_match))) raise ValueError(Errors.E171.format(name="DependencyMatcher", arg_type=type(on_match)))
if patterns is None or not isinstance(patterns, List): # old API if patterns is None or not isinstance(patterns, List):
raise ValueError(Errors.E948.format(arg_type=type(patterns))) raise ValueError(Errors.E948.format(name="DependencyMatcher", arg_type=type(patterns)))
for pattern in patterns: for pattern in patterns:
if len(pattern) == 0: if len(pattern) == 0:
raise ValueError(Errors.E012.format(key=key)) raise ValueError(Errors.E012.format(key=key))


@ -5,13 +5,13 @@ from typing import (
Iterable, Iterable,
Iterator, Iterator,
List, List,
Literal,
Optional, Optional,
Tuple, Tuple,
Union, Union,
overload, overload,
) )
from ..compat import Literal
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..vocab import Vocab from ..vocab import Vocab


@ -20,6 +20,12 @@ from ..tokens.span cimport Span
from ..tokens.token cimport Token from ..tokens.token cimport Token
from ..typedefs cimport attr_t from ..typedefs cimport attr_t
from ..errors import Errors, MatchPatternError, Warnings
from ..schemas import validate_token_pattern
from .levenshtein import levenshtein_compare
from ..strings cimport get_string_id
from ..attrs import IDS from ..attrs import IDS
from ..errors import Errors, MatchPatternError, Warnings from ..errors import Errors, MatchPatternError, Warnings
from ..schemas import validate_token_pattern from ..schemas import validate_token_pattern
@ -113,9 +119,9 @@ cdef class Matcher:
""" """
errors = {} errors = {}
if on_match is not None and not hasattr(on_match, "__call__"): if on_match is not None and not hasattr(on_match, "__call__"):
raise ValueError(Errors.E171.format(arg_type=type(on_match))) raise ValueError(Errors.E171.format(name="Matcher", arg_type=type(on_match)))
if patterns is None or not isinstance(patterns, List): # old API if patterns is None or not isinstance(patterns, List):
raise ValueError(Errors.E948.format(arg_type=type(patterns))) raise ValueError(Errors.E948.format(name="Matcher", arg_type=type(patterns)))
if greedy is not None and greedy not in ["FIRST", "LONGEST"]: if greedy is not None and greedy not in ["FIRST", "LONGEST"]:
raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy)) raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy))
for i, pattern in enumerate(patterns): for i, pattern in enumerate(patterns):
@ -275,6 +281,10 @@ cdef class Matcher:
# non-overlapping ones this `match` can be either (start, end) or # non-overlapping ones this `match` can be either (start, end) or
# (start, end, alignments) depending on `with_alignments=` option. # (start, end, alignments) depending on `with_alignments=` option.
for key, *match in matches: for key, *match in matches:
# Adjust span matches to doc offsets
if isinstance(doclike, Span):
match[0] += doclike.start
match[1] += doclike.start
span_filter = self._filter.get(key) span_filter = self._filter.get(key)
if span_filter is not None: if span_filter is not None:
pairs = pairs_by_id.get(key, []) pairs = pairs_by_id.get(key, [])
@ -305,9 +315,6 @@ cdef class Matcher:
if as_spans: if as_spans:
final_results = [] final_results = []
for key, start, end, *_ in final_matches: for key, start, end, *_ in final_matches:
if isinstance(doclike, Span):
start += doclike.start
end += doclike.start
final_results.append(Span(doc, start, end, label=key)) final_results.append(Span(doc, start, end, label=key))
elif with_alignments: elif with_alignments:
# convert alignments List[Dict[str, int]] --> List[int] # convert alignments List[Dict[str, int]] --> List[int]
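With the offset adjustment above, matching over a Span now reports token indices relative to the parent Doc rather than to the span itself; an illustrative sketch:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
doc = nlp("skip skip hello world")
matcher = Matcher(nlp.vocab)
matcher.add("HW", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
for match_id, start, end in matcher(doc[2:]):   # run on a Span
    print(start, end, doc[start:end].text)       # 2 4 hello world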


@ -1,6 +1,5 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, overload
from ..compat import Literal
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..vocab import Vocab from ..vocab import Vocab
from .matcher import Matcher from .matcher import Matcher
@ -21,6 +20,15 @@ class PhraseMatcher:
Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
] = ..., ] = ...,
) -> None: ... ) -> None: ...
def _add_from_arrays(
self,
key: str,
specs: List[List[int]],
*,
on_match: Optional[
Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
] = ...,
) -> None: ...
def remove(self, key: str) -> None: ... def remove(self, key: str) -> None: ...
@overload @overload
def __call__( def __call__(


@ -1,4 +1,7 @@
# cython: infer_types=True # cython: infer_types=True
from collections import defaultdict
from typing import List
from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
import warnings import warnings
@ -39,7 +42,7 @@ cdef class PhraseMatcher:
""" """
self.vocab = vocab self.vocab = vocab
self._callbacks = {} self._callbacks = {}
self._docs = {} self._docs = defaultdict(set)
self._validate = validate self._validate = validate
self.mem = Pool() self.mem = Pool()
@ -155,41 +158,69 @@ cdef class PhraseMatcher:
del self._callbacks[key] del self._callbacks[key]
del self._docs[key] del self._docs[key]
def add(self, key, docs, *_docs, on_match=None): def _add_from_arrays(self, key, specs, *, on_match=None):
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID """Add a preprocessed list of specs, with an optional callback.
key, an on_match callback, and one or more patterns.
Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the
second argument, with the on_match callback as an optional keyword
argument.
key (str): The match ID. key (str): The match ID.
docs (list): List of `Doc` objects representing match patterns. specs (List[List[int]]): A list of lists of hashes to match.
on_match (callable): Callback executed on match. on_match (callable): Callback executed on match.
*_docs (Doc): For backwards compatibility: list of patterns to add
as variable arguments. Will be ignored if a list of patterns is
provided as the second argument.
DOCS: https://spacy.io/api/phrasematcher#add
""" """
if docs is None or hasattr(docs, "__call__"): # old API
on_match = docs
docs = _docs
_ = self.vocab[key]
self._callbacks[key] = on_match
self._docs.setdefault(key, set())
cdef MapStruct* current_node cdef MapStruct* current_node
cdef MapStruct* internal_node cdef MapStruct* internal_node
cdef void* result cdef void* result
self._callbacks[key] = on_match
for spec in specs:
self._docs[key].add(tuple(spec))
current_node = self.c_map
for token in spec:
if token == self._terminal_hash:
warnings.warn(Warnings.W021)
break
result = <MapStruct*>map_get(current_node, token)
if not result:
internal_node = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
map_init(self.mem, internal_node, 8)
map_set(self.mem, current_node, token, internal_node)
result = internal_node
current_node = <MapStruct*>result
result = <MapStruct*>map_get(current_node, self._terminal_hash)
if not result:
internal_node = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
map_init(self.mem, internal_node, 8)
map_set(self.mem, current_node, self._terminal_hash, internal_node)
result = internal_node
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
def add(self, key, docs, *, on_match=None):
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
key, a list of one or more patterns, and (optionally) an on_match callback.
key (str): The match ID.
docs (list): List of `Doc` objects representing match patterns.
on_match (callable): Callback executed on match.
If any of the input Docs are invalid, no internal state will be updated.
DOCS: https://spacy.io/api/phrasematcher#add
"""
if isinstance(docs, Doc): if isinstance(docs, Doc):
raise ValueError(Errors.E179.format(key=key)) raise ValueError(Errors.E179.format(key=key))
if docs is None or not isinstance(docs, List):
raise ValueError(Errors.E948.format(name="PhraseMatcher", arg_type=type(docs)))
if on_match is not None and not hasattr(on_match, "__call__"):
raise ValueError(Errors.E171.format(name="PhraseMatcher", arg_type=type(on_match)))
_ = self.vocab[key]
specs = []
for doc in docs: for doc in docs:
if len(doc) == 0: if len(doc) == 0:
continue continue
if isinstance(doc, Doc): if not isinstance(doc, Doc):
raise ValueError(Errors.E4000.format(type=type(doc)))
attrs = (TAG, POS, MORPH, LEMMA, DEP) attrs = (TAG, POS, MORPH, LEMMA, DEP)
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
for attr in attrs: for attr in attrs:
@ -208,30 +239,9 @@ cdef class PhraseMatcher:
and self.attr not in attrs: and self.attr not in attrs:
string_attr = self.vocab.strings[self.attr] string_attr = self.vocab.strings[self.attr]
warnings.warn(Warnings.W012.format(key=key, attr=string_attr)) warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
keyword = self._convert_to_array(doc) specs.append(self._convert_to_array(doc))
else:
keyword = doc
self._docs[key].add(tuple(keyword))
current_node = self.c_map self._add_from_arrays(key, specs, on_match=on_match)
for token in keyword:
if token == self._terminal_hash:
warnings.warn(Warnings.W021)
break
result = <MapStruct*>map_get(current_node, token)
if not result:
internal_node = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
map_init(self.mem, internal_node, 8)
map_set(self.mem, current_node, token, internal_node)
result = internal_node
current_node = <MapStruct*>result
result = <MapStruct*>map_get(current_node, self._terminal_hash)
if not result:
internal_node = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
map_init(self.mem, internal_node, 8)
map_set(self.mem, current_node, self._terminal_hash, internal_node)
result = internal_node
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
def __call__(self, object doclike, *, as_spans=False): def __call__(self, object doclike, *, as_spans=False):
"""Find all sequences matching the supplied patterns on the `Doc`. """Find all sequences matching the supplied patterns on the `Doc`.
@ -345,7 +355,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
matcher = PhraseMatcher(vocab, attr=attr) matcher = PhraseMatcher(vocab, attr=attr)
for key, specs in docs.items(): for key, specs in docs.items():
callback = callbacks.get(key, None) callback = callbacks.get(key, None)
matcher.add(key, specs, on_match=callback) matcher._add_from_arrays(key, specs, on_match=callback)
return matcher return matcher
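With the stricter signature above, patterns must be passed as a list of Doc objects and the callback is keyword-only; a short sketch:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(name) for name in ["Barack Obama", "Angela Merkel"]]
matcher.add("PERSON", patterns, on_match=None)   # on_match must be a keyword argument
matches = matcher(nlp("Barack Obama visited Berlin"))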


@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
"update", "update",
"rehearse", "rehearse",
"get_loss", "get_loss",
"get_teacher_student_loss",
"initialize", "initialize",
"begin_update", "begin_update",
"finish_update", "finish_update",


@ -1,5 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import Callable, Iterable, List, Optional, Tuple from typing import Callable, Iterable, Iterator, List, Optional, Tuple
from thinc.api import ( from thinc.api import (
Linear, Linear,
@ -15,18 +15,15 @@ from thinc.api import (
from thinc.types import Floats2d from thinc.types import Floats2d
from ...errors import Errors from ...errors import Errors
from ...kb import ( from ...kb import Candidate, InMemoryLookupKB, KnowledgeBase
Candidate, from ...tokens import Doc, Span, SpanGroup
InMemoryLookupKB,
KnowledgeBase,
get_candidates,
get_candidates_batch,
)
from ...tokens import Doc, Span
from ...util import registry from ...util import registry
from ...vocab import Vocab from ...vocab import Vocab
from ..extract_spans import extract_spans from ..extract_spans import extract_spans
CandidatesForMentionT = Iterable[Candidate]
CandidatesForDocT = Iterable[CandidatesForMentionT]
@registry.architectures("spacy.EntityLinker.v2") @registry.architectures("spacy.EntityLinker.v2")
def build_nel_encoder( def build_nel_encoder(
@ -123,12 +120,38 @@ def empty_kb(
@registry.misc("spacy.CandidateGenerator.v1") @registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: def create_get_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates return get_candidates
@registry.misc("spacy.CandidateBatchGenerator.v1") @registry.misc("spacy.CandidateGenerator.v2")
def create_candidates_batch() -> Callable[ def create_get_candidates_v2() -> Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] [KnowledgeBase, Iterator[SpanGroup]], Iterator[CandidatesForDocT]
]: ]:
return get_candidates_batch return get_candidates_v2
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for the given mention from the KB.
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention.
RETURNS (Iterable[Candidate]): Identified candidates for specified mention.
"""
cands_per_doc = next(
get_candidates_v2(kb, iter([SpanGroup(mention.doc, spans=[mention])]))
)
assert isinstance(cands_per_doc, list)
return next(cands_per_doc[0])
def get_candidates_v2(
kb: KnowledgeBase, mentions: Iterator[SpanGroup]
) -> Iterator[Iterable[Iterable[Candidate]]]:
"""
Return candidate entities for the given mentions from the KB.
kb (KnowledgeBase): Knowledge base to query.
mentions (Iterator[SpanGroup]): Mentions per doc.
RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per mentions in document/SpanGroup.
"""
return kb.get_candidates(mentions)
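A hedged sketch of consuming the doc-wise streaming interface above; kb is assumed to be a populated InMemoryLookupKB and nlp a pipeline whose NER sets doc.ents:

from spacy.tokens import SpanGroup

docs = list(nlp.pipe(texts))
mentions = (SpanGroup(doc, spans=doc.ents) for doc in docs)
for cands_per_doc in get_candidates_v2(kb, mentions):
    for cands_per_mention in cands_per_doc:
        print(len(list(cands_per_mention)))      # number of KB candidates per mention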


@ -1,9 +1,8 @@
from typing import List, Optional, cast from typing import List, Literal, Optional
from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
from thinc.types import Floats2d from thinc.types import Floats2d
from ...compat import Literal
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc from ...tokens import Doc
from ...util import registry from ...util import registry


@ -19,6 +19,7 @@ from thinc.api import (
clone, clone,
concatenate, concatenate,
list2ragged, list2ragged,
noop,
reduce_first, reduce_first,
reduce_last, reduce_last,
reduce_max, reduce_max,
@ -148,55 +149,26 @@ def build_text_classifier_v2(
linear_model: Model[List[Doc], Floats2d], linear_model: Model[List[Doc], Floats2d],
nO: Optional[int] = None, nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]: ) -> Model[List[Doc], Floats2d]:
# TODO: build the model with _build_parametric_attention_with_residual_nonlinear
# in spaCy v4. We don't do this in spaCy v3 to preserve model
# compatibility.
exclusive_classes = not linear_model.attrs["multi_label"]
with Model.define_operators({">>": chain, "|": concatenate}):
width = tok2vec.maybe_get_dim("nO") width = tok2vec.maybe_get_dim("nO")
attention_layer = ParametricAttention(width) exclusive_classes = not linear_model.attrs["multi_label"]
maxout_layer = Maxout(nO=width, nI=width) parametric_attention = _build_parametric_attention_with_residual_nonlinear(
norm_layer = LayerNorm(nI=width) tok2vec=tok2vec,
cnn_model = ( nonlinear_layer=Maxout(nI=width, nO=width),
tok2vec key_transform=noop(),
>> list2ragged()
>> attention_layer
>> reduce_sum()
>> residual(maxout_layer >> norm_layer >> Dropout(0.0))
) )
with Model.define_operators({">>": chain, "|": concatenate}):
nO_double = nO * 2 if nO else None nO_double = nO * 2 if nO else None
if exclusive_classes: if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nO_double) output_layer = Softmax(nO=nO, nI=nO_double)
else: else:
output_layer = Linear(nO=nO, nI=nO_double) >> Logistic() output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
model = (linear_model | cnn_model) >> output_layer model = (linear_model | parametric_attention) >> output_layer
model.set_ref("tok2vec", tok2vec) model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False and nO is not None: if model.has_dim("nO") is not False and nO is not None:
model.set_dim("nO", cast(int, nO)) model.set_dim("nO", cast(int, nO))
model.set_ref("output_layer", linear_model.get_ref("output_layer")) model.set_ref("output_layer", linear_model.get_ref("output_layer"))
model.set_ref("attention_layer", attention_layer)
model.set_ref("maxout_layer", maxout_layer)
model.set_ref("norm_layer", norm_layer)
model.attrs["multi_label"] = not exclusive_classes model.attrs["multi_label"] = not exclusive_classes
model.init = init_ensemble_textcat # type: ignore[assignment]
return model
def init_ensemble_textcat(model, X, Y) -> Model:
# When tok2vec is lazily initialized, we need to initialize it before
# the rest of the chain to ensure that we can get its width.
tok2vec = model.get_ref("tok2vec")
tok2vec.initialize(X)
tok2vec_width = get_tok2vec_width(model)
model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
model.get_ref("maxout_layer").set_dim("nO", tok2vec_width)
model.get_ref("maxout_layer").set_dim("nI", tok2vec_width)
model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
init_chain(model, X, Y)
return model return model
@ -284,7 +256,9 @@ def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
tok2vec_width = get_tok2vec_width(model) tok2vec_width = get_tok2vec_width(model)
model.get_ref("attention_layer").set_dim("nO", tok2vec_width) model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
if model.get_ref("key_transform").has_dim("nI") is None:
model.get_ref("key_transform").set_dim("nI", tok2vec_width) model.get_ref("key_transform").set_dim("nI", tok2vec_width)
if model.get_ref("key_transform").has_dim("nO") is None:
model.get_ref("key_transform").set_dim("nO", tok2vec_width) model.get_ref("key_transform").set_dim("nO", tok2vec_width)
model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width) model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width) model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)


@ -21,7 +21,7 @@ from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
from ...attrs import intify_attr from ...attrs import intify_attr
from ...errors import Errors from ...errors import Errors
from ...ml import _character_embed from ...ml import character_embed
from ...pipeline.tok2vec import Tok2VecListener from ...pipeline.tok2vec import Tok2VecListener
from ...tokens import Doc from ...tokens import Doc
from ...util import registry from ...util import registry
@ -241,7 +241,7 @@ def CharacterEmbed(
if feature is None: if feature is None:
raise ValueError(Errors.E911.format(feat=feature)) raise ValueError(Errors.E911.format(feat=feature))
char_embed = chain( char_embed = chain(
_character_embed.CharacterEmbed(nM=nM, nC=nC), character_embed.CharacterEmbed(nM=nM, nC=nC),
cast(Model[List[Floats2d], Ragged], list2ragged()), cast(Model[List[Floats2d], Ragged], list2ragged()),
) )
feature_extractor: Model[List[Doc], Ragged] = chain( feature_extractor: Model[List[Doc], Ragged] = chain(


@ -40,16 +40,10 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
cdef void free_activations(const ActivationsC* A) nogil cdef void free_activations(const ActivationsC* A) nogil
cdef void predict_states( cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n const WeightsC* W, SizesC n) nogil
) nogil
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
cdef void cpu_log_loss( cdef void cpu_log_loss(float* d_scores, const float* costs,
float* d_scores, const int* is_valid, const float* scores, int O) nogil
const float* costs,
const int* is_valid,
const float* scores,
int O
) nogil


@ -5,11 +5,10 @@ from libc.math cimport exp
from libc.stdlib cimport calloc, free, realloc from libc.stdlib cimport calloc, free, realloc
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from thinc.backends.cblas cimport saxpy, sgemm from thinc.backends.cblas cimport saxpy, sgemm
from thinc.backends.linalg cimport Vec, VecVec
import numpy import numpy
import numpy.random import numpy.random
from thinc.api import CupyOps, Model, NumpyOps from thinc.api import CupyOps, Model, NumpyOps, get_ops
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors
@ -79,66 +78,48 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0])) A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
A._max_size = n.states A._max_size = n.states
else: else:
A.token_ids = <int*>realloc( A.token_ids = <int*>realloc(A.token_ids,
A.token_ids, n.states * n.feats * sizeof(A.token_ids[0]) n.states * n.feats * sizeof(A.token_ids[0]))
) A.scores = <float*>realloc(A.scores,
A.scores = <float*>realloc( n.states * n.classes * sizeof(A.scores[0]))
A.scores, n.states * n.classes * sizeof(A.scores[0]) A.unmaxed = <float*>realloc(A.unmaxed,
) n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
A.unmaxed = <float*>realloc( A.hiddens = <float*>realloc(A.hiddens,
A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]) n.states * n.hiddens * sizeof(A.hiddens[0]))
) A.is_valid = <int*>realloc(A.is_valid,
A.hiddens = <float*>realloc( n.states * n.classes * sizeof(A.is_valid[0]))
A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
)
A.is_valid = <int*>realloc(
A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
)
A._max_size = n.states A._max_size = n.states
A._curr_size = n.states A._curr_size = n.states
cdef void predict_states( cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n const WeightsC* W, SizesC n) nogil:
) nogil:
resize_activations(A, n) resize_activations(A, n)
for i in range(n.states): for i in range(n.states):
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats) states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float)) memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float)) memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
sum_state_features( sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states,
cblas, n.feats, n.hiddens * n.pieces)
A.unmaxed,
W.feat_weights,
A.token_ids,
n.states,
n.feats,
n.hiddens * n.pieces
)
for i in range(n.states): for i in range(n.states):
VecVec.add_i( saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1,
&A.unmaxed[i*n.hiddens*n.pieces], &A.unmaxed[i*n.hiddens*n.pieces], 1)
W.feat_bias, 1.,
n.hiddens * n.pieces
)
for j in range(n.hiddens): for j in range(n.hiddens):
index = i * n.hiddens * n.pieces + j * n.pieces index = i * n.hiddens * n.pieces + j * n.pieces
which = Vec.arg_max(&A.unmaxed[index], n.pieces) which = _arg_max(&A.unmaxed[index], n.pieces)
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which] A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
memset(A.scores, 0, n.states * n.classes * sizeof(float)) memset(A.scores, 0, n.states * n.classes * sizeof(float))
if W.hidden_weights == NULL: if W.hidden_weights == NULL:
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float)) memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
else: else:
# Compute hidden-to-output # Compute hidden-to-output
sgemm(cblas)( sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, 1.0,
False, True, n.states, n.classes, n.hiddens, <const float *>A.hiddens, n.hiddens,
1.0, <const float *>A.hiddens, n.hiddens, <const float *>W.hidden_weights, n.hiddens, 0.0,
<const float *>W.hidden_weights, n.hiddens, A.scores, n.classes)
0.0, A.scores, n.classes
)
# Add bias # Add bias
for i in range(n.states): for i in range(n.states):
VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes) saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
# Set unseen classes to minimum value # Set unseen classes to minimum value
i = 0 i = 0
min_ = A.scores[0] min_ = A.scores[0]
@ -151,15 +132,8 @@ cdef void predict_states(
A.scores[i*n.classes+j] = min_ A.scores[i*n.classes+j] = min_
cdef void sum_state_features( cdef void sum_state_features(CBlas cblas, float* output, const float* cached,
CBlas cblas, const int* token_ids, int B, int F, int O) nogil:
float* output,
const float* cached,
const int* token_ids,
int B,
int F,
int O
) nogil:
cdef int idx, b, f cdef int idx, b, f
cdef const float* feature cdef const float* feature
padding = cached padding = cached
@ -177,17 +151,13 @@ cdef void sum_state_features(
token_ids += F token_ids += F
cdef void cpu_log_loss( cdef void cpu_log_loss(float* d_scores, const float* costs, const int* is_valid,
float* d_scores, const float* scores, int O) nogil:
const float* costs,
const int* is_valid,
const float* scores,
int O
) nogil:
"""Do multi-label log loss""" """Do multi-label log loss"""
cdef double max_, gmax, Z, gZ cdef double max_, gmax, Z, gZ
best = arg_max_if_gold(scores, costs, is_valid, O) best = arg_max_if_gold(scores, costs, is_valid, O)
guess = Vec.arg_max(scores, O) guess = _arg_max(scores, O)
if best == -1 or guess == -1: if best == -1 or guess == -1:
# These shouldn't happen, but if they do, we want to make sure we don't # These shouldn't happen, but if they do, we want to make sure we don't
# cause an OOB access. # cause an OOB access.
@ -207,9 +177,8 @@ cdef void cpu_log_loss(
d_scores[i] = exp(scores[i]-max_) / Z d_scores[i] = exp(scores[i]-max_) / Z
cdef int arg_max_if_gold( cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
const weight_t* scores, const weight_t* costs, const int* is_valid, int n const int* is_valid, int n) nogil:
) nogil:
# Find minimum cost # Find minimum cost
cdef float cost = 1 cdef float cost = 1
for i in range(n): for i in range(n):
@ -234,16 +203,8 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
class ParserStepModel(Model): class ParserStepModel(Model):
def __init__( def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
self, dropout=0.1):
docs,
layers,
*,
has_upper,
unseen_classes=None,
train=True,
dropout=0.1
):
Model.__init__(self, name="parser_step_model", forward=step_forward) Model.__init__(self, name="parser_step_model", forward=step_forward)
self.attrs["has_upper"] = has_upper self.attrs["has_upper"] = has_upper
self.attrs["dropout_rate"] = dropout self.attrs["dropout_rate"] = dropout
@ -304,10 +265,8 @@ class ParserStepModel(Model):
return ids return ids
def backprop_step(self, token_ids, d_vector, get_d_tokvecs): def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
if ( if isinstance(self.state2vec.ops, CupyOps) \
isinstance(self.state2vec.ops, CupyOps) and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
):
# Move token_ids and d_vector to GPU, asynchronously # Move token_ids and d_vector to GPU, asynchronously
self.backprops.append(( self.backprops.append((
util.get_async(self.cuda_stream, token_ids), util.get_async(self.cuda_stream, token_ids),
@ -350,7 +309,7 @@ def step_forward(model: ParserStepModel, states, is_train):
scores, get_d_vector = model.vec2scores(vector, is_train) scores, get_d_vector = model.vec2scores(vector, is_train)
else: else:
scores = NumpyOps().asarray(vector) scores = NumpyOps().asarray(vector)
get_d_vector = lambda d_scores: d_scores # no-cython-lint: E731 def get_d_vector(d_scores): return d_scores
# If the class is unseen, make sure its score is minimum # If the class is unseen, make sure its score is minimum
scores[:, model._class_mask == 0] = numpy.nanmin(scores) scores[:, model._class_mask == 0] = numpy.nanmin(scores)
@ -386,6 +345,7 @@ cdef class precompute_hiddens:
cdef bint _is_synchronized cdef bint _is_synchronized
cdef public object ops cdef public object ops
cdef public object numpy_ops cdef public object numpy_ops
cdef public object _cpu_ops
cdef np.ndarray _features cdef np.ndarray _features
cdef np.ndarray _cached cdef np.ndarray _cached
cdef np.ndarray bias cdef np.ndarray bias
@ -416,6 +376,7 @@ cdef class precompute_hiddens:
self.nO = cached.shape[2] self.nO = cached.shape[2]
self.ops = lower_model.ops self.ops = lower_model.ops
self.numpy_ops = NumpyOps() self.numpy_ops = NumpyOps()
self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
assert activation in (None, "relu", "maxout") assert activation in (None, "relu", "maxout")
self.activation = activation self.activation = activation
self._is_synchronized = False self._is_synchronized = False
@ -478,19 +439,13 @@ cdef class precompute_hiddens:
# - Output from backward on GPU # - Output from backward on GPU
bp_hiddens = self._bp_hiddens bp_hiddens = self._bp_hiddens
cdef CBlas cblas cdef CBlas cblas = self._cpu_ops.cblas()
if isinstance(self.ops, CupyOps):
cblas = NUMPY_OPS.cblas()
else:
cblas = self.ops.cblas()
feat_weights = self.get_feat_weights() feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids cdef int[:, ::1] ids = token_ids
sum_state_features( sum_state_features(cblas, <float*>state_vector.data,
cblas, <float*>state_vector.data, feat_weights, &ids[0, 0], token_ids.shape[0],
feat_weights, &ids[0, 0], self.nF, self.nO*self.nP)
token_ids.shape[0], self.nF, self.nO*self.nP
)
state_vector += self.bias state_vector += self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector) state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
@ -531,3 +486,15 @@ cdef class precompute_hiddens:
return d_best.reshape((d_best.shape + (1,))) return d_best.reshape((d_best.shape + (1,)))
return state_vector, backprop_relu return state_vector, backprop_relu
cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
if n_classes == 2:
return 0 if scores[0] > scores[1] else 1
cdef int i
cdef int best = 0
cdef float mode = scores[0]
for i in range(1, n_classes):
if scores[i] > mode:
mode = scores[i]
best = i
return best


@ -1,27 +1,41 @@
cimport numpy as np cimport numpy as np
from cymem.cymem cimport Pool from libc.stdint cimport uint32_t, uint64_t
from libc.stdint cimport uint64_t from libcpp.memory cimport shared_ptr
from preshed.maps cimport PreshMap from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from .strings cimport StringStore from .strings cimport StringStore
from .structs cimport MorphAnalysisC
from .typedefs cimport attr_t, hash_t from .typedefs cimport attr_t, hash_t
cdef cppclass Feature:
hash_t field
hash_t value
__init__():
this.field = 0
this.value = 0
cdef cppclass MorphAnalysisC:
hash_t key
vector[Feature] features
__init__():
this.key = 0
cdef class Morphology: cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings cdef readonly StringStore strings
cdef PreshMap tags # Keyed by hash, value is pointer to tag cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except * cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
cdef int insert(self, MorphAnalysisC tag) except -1 cdef void _intern_morph_tag(self, hash_t tag_key, feats)
cdef hash_t _add(self, features)
cdef str _normalize_features(self, features)
cdef str get_morph_str(self, hash_t morph_key)
cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
cdef list list_features(const MorphAnalysisC* morph) cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field) cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
cdef int get_n_by_field(
attr_t* results,
const MorphAnalysisC* morph,
attr_t field,
) nogil


@ -1,14 +1,15 @@
# cython: infer_types # cython: infer_types
# cython: profile=False # cython: profile=False
import warnings import warnings
from typing import Dict, List, Optional, Tuple, Union
import numpy import numpy
from .attrs cimport POS from cython.operator cimport dereference as deref
from libcpp.memory cimport shared_ptr
from . import symbols from . import symbols
from .errors import Warnings from .errors import Warnings
from .parts_of_speech import IDS as POS_IDS
cdef class Morphology: cdef class Morphology:
@ -26,135 +27,185 @@ cdef class Morphology:
EMPTY_MORPH = symbols.NAMES[symbols._] EMPTY_MORPH = symbols.NAMES[symbols._]
def __init__(self, StringStore strings): def __init__(self, StringStore strings):
self.mem = Pool()
self.strings = strings self.strings = strings
self.tags = PreshMap()
def __reduce__(self): def __reduce__(self):
tags = set([self.get(self.strings[s]) for s in self.strings]) tags = set([self.get(self.strings[s]) for s in self.strings])
tags -= set([""]) tags -= set([""])
return (unpickle_morphology, (self.strings, sorted(tags)), None, None) return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
def add(self, features): cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
match = self.tags.find(tag_hash)
if match != self.tags.const_end():
return deref(match).second
else:
return shared_ptr[MorphAnalysisC]()
def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
attr_key = self.strings.as_string(attr_key)
attr_value = self.strings.as_string(attr_value)
# Preserve multiple values as a list
if self.VALUE_SEP in attr_value:
values = attr_value.split(self.VALUE_SEP)
values.sort()
attr_value = values
else:
warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
return None
return attr_key, attr_value
def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
if not feats or feats == self.EMPTY_MORPH:
return {}
out = []
for feat in feats.split(self.FEATURE_SEP):
field, values = feat.split(self.FIELD_SEP, 1)
normalized_attr = self._normalize_attr(field, values)
if normalized_attr is None:
continue
out.append((normalized_attr[0], normalized_attr[1]))
out.sort(key=lambda x: x[0])
return dict(out)
def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
out = []
for field, values in feats.items():
normalized_attr = self._normalize_attr(field, values)
if normalized_attr is None:
continue
out.append((normalized_attr[0], normalized_attr[1]))
out.sort(key=lambda x: x[0])
return dict(out)
def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
norm_feats_string = self.FEATURE_SEP.join([
self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
for field, values in feats.items()
])
return norm_feats_string or self.EMPTY_MORPH
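The three helpers above normalize a FEATS value whether it arrives as a UD string or as a dict: fields are sorted, multi-value features are split and sorted, and everything is joined back with the |, = and , separators. A standalone sketch of that round trip in plain Python, using the same separator conventions:

    FEATURE_SEP, FIELD_SEP, VALUE_SEP = "|", "=", ","

    def normalize_feats(feats: str) -> str:
        # "Number=Sing|Case=Nom,Acc" -> sorted fields and values -> "Case=Acc,Nom|Number=Sing"
        out = {}
        for feat in feats.split(FEATURE_SEP):
            fld, values = feat.split(FIELD_SEP, 1)
            if VALUE_SEP in values:
                values = VALUE_SEP.join(sorted(values.split(VALUE_SEP)))
            out[fld] = values
        return FEATURE_SEP.join(f"{fld}{FIELD_SEP}{val}" for fld, val in sorted(out.items()))

    assert normalize_feats("Number=Sing|Case=Nom,Acc") == "Case=Acc,Nom|Number=Sing"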
cdef hash_t _add(self, features):
"""Insert a morphological analysis in the morphology table, if not """Insert a morphological analysis in the morphology table, if not
already present. The morphological analysis may be provided in the UD already present. The morphological analysis may be provided in the UD
FEATS format as a string or in the tag map dict format. FEATS format as a string or in the tag map dict format.
Returns the hash of the new analysis. Returns the hash of the new analysis.
""" """
cdef MorphAnalysisC* tag_ptr cdef hash_t tag_hash = 0
cdef shared_ptr[MorphAnalysisC] tag
if isinstance(features, str): if isinstance(features, str):
if features == "": if features == "":
features = self.EMPTY_MORPH features = self.EMPTY_MORPH
tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
if tag_ptr != NULL: tag_hash = self.strings[features]
return tag_ptr.key tag = self._lookup_tag(tag_hash)
features = self.feats_to_dict(features) if tag:
if not isinstance(features, dict): return deref(tag).key
features = self._str_to_normalized_feat_dict(features)
elif isinstance(features, dict):
features = self._dict_to_normalized_feat_dict(features)
else:
warnings.warn(Warnings.W100.format(feature=features)) warnings.warn(Warnings.W100.format(feature=features))
features = {} features = {}
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# intified ("Field", "Field=Value") pairs
field_feature_pairs = []
for field in sorted(string_features):
values = string_features[field]
for value in values.split(self.VALUE_SEP):
field_feature_pairs.append((
self.strings.add(field),
self.strings.add(field + self.FIELD_SEP + value),
))
cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
# the hash key for the tag is either the hash of the normalized UFEATS # the hash key for the tag is either the hash of the normalized UFEATS
# string or the hash of an empty placeholder # string or the hash of an empty placeholder
norm_feats_string = self.normalize_features(features) norm_feats_string = self._normalized_feat_dict_to_str(features)
tag.key = self.strings.add(norm_feats_string) tag_hash = self.strings.add(norm_feats_string)
self.insert(tag) tag = self._lookup_tag(tag_hash)
return tag.key if tag:
return deref(tag).key
def normalize_features(self, features): self._intern_morph_tag(tag_hash, features)
return tag_hash
cdef void _intern_morph_tag(self, hash_t tag_key, feats):
# intified ("Field", "Field=Value") pairs where fields with multiple values have
# been split into individual tuples, e.g.:
# [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
# ("Field2", "Field2=Value3")]
field_feature_pairs = []
# Feat dict is normalized at this point.
for field, values in feats.items():
field_key = self.strings.add(field)
if isinstance(values, list):
for value in values:
value_key = self.strings.add(field + self.FIELD_SEP + value)
field_feature_pairs.append((field_key, value_key))
else:
# We could box scalar values into a list and use a common
# code path to generate features but that incurs a small
# but measurable allocation/iteration overhead (as this
# branch is taken often enough).
value_key = self.strings.add(field + self.FIELD_SEP + values)
field_feature_pairs.append((field_key, value_key))
num_features = len(field_feature_pairs)
cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
deref(tag).key = tag_key
deref(tag).features.resize(num_features)
for i in range(num_features):
deref(tag).features[i].field = field_feature_pairs[i][0]
deref(tag).features[i].value = field_feature_pairs[i][1]
self.tags[tag_key] = tag
cdef str get_morph_str(self, hash_t morph_key):
cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
if not tag:
return ""
else:
return self.strings[deref(tag).key]
cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
return self._lookup_tag(morph_key)
cdef str _normalize_features(self, features):
"""Create a normalized FEATS string from a features string or dict. """Create a normalized FEATS string from a features string or dict.
features (Union[dict, str]): Features as dict or UFEATS string. features (Union[dict, str]): Features as dict or UFEATS string.
RETURNS (str): Features as normalized UFEATS string. RETURNS (str): Features as normalized UFEATS string.
""" """
if isinstance(features, str): if isinstance(features, str):
features = self.feats_to_dict(features) features = self._str_to_normalized_feat_dict(features)
if not isinstance(features, dict): elif isinstance(features, dict):
features = self._dict_to_normalized_feat_dict(features)
else:
warnings.warn(Warnings.W100.format(feature=features)) warnings.warn(Warnings.W100.format(feature=features))
features = {} features = {}
features = self.normalize_attrs(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# normalized UFEATS string with sorted fields and values
norm_feats_string = self.FEATURE_SEP.join(
sorted(
[self.FIELD_SEP.join([field, values]) for field, values in string_features.items()]
)
)
return norm_feats_string or self.EMPTY_MORPH
def normalize_attrs(self, attrs): return self._normalized_feat_dict_to_str(features)
"""Convert attrs dict so that POS is always by ID, other features are
by string. Values separated by VALUE_SEP are sorted.
"""
out = {}
attrs = dict(attrs)
for key, value in attrs.items():
# convert POS value to ID
if key == POS or (isinstance(key, str) and key.upper() == "POS"):
if isinstance(value, str) and value.upper() in POS_IDS:
value = POS_IDS[value.upper()]
elif isinstance(value, int) and value not in POS_IDS.values():
warnings.warn(Warnings.W100.format(feature={key: value}))
continue
out[POS] = value
# accept any string or ID fields and values and convert to strings
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
key = self.strings.as_string(key)
value = self.strings.as_string(value)
# sort values
if self.VALUE_SEP in value:
value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
out[key] = value
else:
warnings.warn(Warnings.W100.format(feature={key: value}))
return out
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *: def add(self, features):
"""Creates a MorphAnalysisC from a list of intified return self._add(features)
("Field", "Field=Value") tuples where fields with multiple values have
been split into individual tuples, e.g.:
[("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
("Field2", "Field2=Value3")]
"""
cdef MorphAnalysisC tag
tag.length = len(field_feature_pairs)
if tag.length > 0:
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
for i, (field, feature) in enumerate(field_feature_pairs):
tag.fields[i] = field
tag.features[i] = feature
return tag
cdef int insert(self, MorphAnalysisC tag) except -1: def get(self, morph_key):
cdef hash_t key = tag.key return self.get_morph_str(morph_key)
if self.tags.get(key) == NULL:
tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
tag_ptr[0] = tag
self.tags.set(key, <void*>tag_ptr)
def get(self, hash_t morph): def normalize_features(self, features):
tag = <MorphAnalysisC*>self.tags.get(morph) return self._normalize_features(features)
if tag == NULL:
return ""
else:
return self.strings[tag.key]
@staticmethod @staticmethod
def feats_to_dict(feats): def feats_to_dict(feats, *, sort_values=True):
if not feats or feats == Morphology.EMPTY_MORPH: if not feats or feats == Morphology.EMPTY_MORPH:
return {} return {}
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} out = {}
for feat in feats.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP, 1)
if sort_values:
values = values.split(Morphology.VALUE_SEP)
values.sort()
values = Morphology.VALUE_SEP.join(values)
out[field] = values
return out
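Since feats_to_dict stays a public static helper, its behaviour is easy to check directly; a small usage example, assuming spacy is installed and the import path matches this file:

    from spacy.morphology import Morphology

    # Multi-value features are sorted by default; the new sort_values=False
    # keyword keeps the original order instead.
    assert Morphology.feats_to_dict("Degree=Pos|Polarity=Neg,Pos") == {
        "Degree": "Pos",
        "Polarity": "Neg,Pos",
    }
    assert Morphology.feats_to_dict("") == {}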
@staticmethod @staticmethod
def dict_to_feats(feats_dict): def dict_to_feats(feats_dict):
@ -163,34 +214,34 @@ cdef class Morphology:
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()])) return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil: cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
cdef int i cdef int i
for i in range(morph.length): for i in range(deref(morph).features.size()):
if morph.features[i] == feature: if deref(morph).features[i].value == feature:
return True return True
return False return False
cdef list list_features(const MorphAnalysisC* morph): cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
cdef int i cdef int i
features = [] features = []
for i in range(morph.length): for i in range(deref(morph).features.size()):
features.append(morph.features[i]) features.append(deref(morph).features[i].value)
return features return features
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field): cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64") cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
n = get_n_by_field(<uint64_t*>results.data, morph, field) n = get_n_by_field(<uint64_t*>results.data, morph, field)
return results[:n] return results[:n]
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil: cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
cdef int n_results = 0 cdef int n_results = 0
cdef int i cdef int i
for i in range(morph.length): for i in range(deref(morph).features.size()):
if morph.fields[i] == field: if deref(morph).features[i].field == field:
results[n_results] = morph.features[i] results[n_results] = deref(morph).features[i].value
n_results += 1 n_results += 1
return n_results return n_results

View File

@ -4,22 +4,22 @@ from . cimport symbols
cpdef enum univ_pos_t: cpdef enum univ_pos_t:
NO_TAG = 0 NO_TAG = 0
ADJ = symbols.ADJ ADJ = symbols.ADJ
ADP ADP = symbols.ADP
ADV ADV = symbols.ADV
AUX AUX = symbols.AUX
CONJ CONJ = symbols.CONJ
CCONJ # U20 CCONJ = symbols.CCONJ # U20
DET DET = symbols.DET
INTJ INTJ = symbols.INTJ
NOUN NOUN = symbols.NOUN
NUM NUM = symbols.NUM
PART PART = symbols.PART
PRON PRON = symbols.PRON
PROPN PROPN = symbols.PROPN
PUNCT PUNCT = symbols.PUNCT
SCONJ SCONJ = symbols.SCONJ
SYM SYM = symbols.SYM
VERB VERB = symbols.VERB
X X = symbols.X
EOL EOL = symbols.EOL
SPACE SPACE = symbols.SPACE

View File

@@ -1,8 +1,7 @@
-from .attributeruler import AttributeRuler
+from .attribute_ruler import AttributeRuler
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
-from .entityruler import EntityRuler
 from .functions import merge_entities, merge_noun_chunks, merge_subtokens
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
@@ -25,7 +24,6 @@ __all__ = [
     "EditTreeLemmatizer",
     "EntityLinker",
     "EntityRecognizer",
-    "EntityRuler",
     "Morphologizer",
     "Lemmatizer",
     "MultiLabel_TextCategorizer",

View File

@ -1,7 +1,7 @@
from ...typedefs cimport class_t, hash_t from ...typedefs cimport class_t, hash_t
# These are passed as callbacks to thinc.search.Beam # These are passed as callbacks to .search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
cdef int check_final_state(void* _state, void* extra_args) except -1 cdef int check_final_state(void* _state, void* extra_args) except -1

View File

@ -1,21 +1,18 @@
# cython: infer_types=True # cython: infer_types=True
import numpy import numpy
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.extra.search cimport MaxViolation
from ...typedefs cimport class_t from ...typedefs cimport class_t
from .transition_system cimport Transition, TransitionSystem from .transition_system cimport Transition, TransitionSystem
from ...errors import Errors from ...errors import Errors
from .search cimport Beam, MaxViolation
from .search import MaxViolation
from .stateclass cimport StateC, StateClass from .stateclass cimport StateC, StateClass
# These are passed as callbacks to thinc.search.Beam # These are passed as callbacks to .search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateC*>_dest dest = <StateC*>_dest
src = <StateC*>_src src = <StateC*>_src

View File

@ -19,7 +19,7 @@ from .stateclass cimport StateClass
from ...errors import Errors from ...errors import Errors
from thinc.extra.search cimport Beam from .search cimport Beam
cdef weight_t MIN_SCORE = -90000 cdef weight_t MIN_SCORE = -90000

View File

@ -1,11 +1,10 @@
# cython: profile=False # cython: profile=False
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from libc.stdint cimport int32_t from libcpp.memory cimport shared_ptr
from libcpp.vector cimport vector
from collections import Counter from collections import Counter
from thinc.extra.search cimport Beam
from ...tokens.doc cimport Doc from ...tokens.doc cimport Doc
from ...tokens.span import Span from ...tokens.span import Span
@ -20,6 +19,7 @@ from ...training import split_bilu_label
from ...training.example cimport Example from ...training.example cimport Example
from ._state cimport StateC from ._state cimport StateC
from .search cimport Beam
from .stateclass cimport StateClass from .stateclass cimport StateClass
from .transition_system cimport Transition, do_func_t from .transition_system cimport Transition, do_func_t
@ -47,9 +47,7 @@ MOVE_NAMES[OUT] = 'O'
cdef struct GoldNERStateC: cdef struct GoldNERStateC:
Transition* ner Transition* ner
SpanC* negs vector[shared_ptr[SpanC]] negs
int32_t length
int32_t nr_neg
cdef class BiluoGold: cdef class BiluoGold:
@ -82,8 +80,6 @@ cdef GoldNERStateC create_gold_state(
negs = [] negs = []
assert example.x.length > 0 assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition)) gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
gs.nr_neg = len(negs)
ner_ents, ner_tags = example.get_aligned_ents_and_ner() ner_ents, ner_tags = example.get_aligned_ents_and_ner()
for i, ner_tag in enumerate(ner_tags): for i, ner_tag in enumerate(ner_tags):
gs.ner[i] = moves.lookup_transition(ner_tag) gs.ner[i] = moves.lookup_transition(ner_tag)
@ -97,8 +93,8 @@ cdef GoldNERStateC create_gold_state(
# In order to handle negative samples, we need to maintain the full # In order to handle negative samples, we need to maintain the full
# (start, end, label) triple. If we break it down to the 'isnt B-LOC' # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
# thing, we'll get blocked if there's an incorrect prefix. # thing, we'll get blocked if there's an incorrect prefix.
for i, neg in enumerate(negs): for neg in negs:
gs.negs[i] = neg.c gs.negs.push_back(neg.c)
return gs return gs
@ -413,6 +409,8 @@ cdef class Begin:
cdef int g_act = gold.ner[b0].move cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label cdef attr_t g_tag = gold.ner[b0].label
cdef shared_ptr[SpanC] span
if g_act == MISSING: if g_act == MISSING:
pass pass
elif g_act == BEGIN: elif g_act == BEGIN:
@ -430,8 +428,8 @@ cdef class Begin:
# be correct or not. However, we can at least tell whether we're # be correct or not. However, we can at least tell whether we're
# going to be opening an entity where there's only one possible # going to be opening an entity where there's only one possible
# L. # L.
for span in gold.negs[:gold.nr_neg]: for span in gold.negs:
if span.label == label and span.start == b0: if span.get().label == label and span.get().start == b0:
cost += 1 cost += 1
break break
return cost return cost
@ -572,8 +570,9 @@ cdef class Last:
# If we have negative-example entities, integrate them into the objective, # If we have negative-example entities, integrate them into the objective,
# by marking actions that close an entity that we know is incorrect # by marking actions that close an entity that we know is incorrect
# as costly. # as costly.
for span in gold.negs[:gold.nr_neg]: cdef shared_ptr[SpanC] span
if span.label == label and (span.end-1) == b0 and span.start == ent_start: for span in gold.negs:
if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
cost += 1 cost += 1
break break
return cost return cost
@ -637,8 +636,9 @@ cdef class Unit:
# This is fairly straight-forward for U- entities, as we have a single # This is fairly straight-forward for U- entities, as we have a single
# action # action
cdef int b0 = s.B(0) cdef int b0 = s.B(0)
for span in gold.negs[:gold.nr_neg]: cdef shared_ptr[SpanC] span
if span.label == label and span.start == b0 and span.end == (b0+1): for span in gold.negs:
if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
cost += 1 cost += 1
break break
return cost return cost
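Across the Begin, Last and Unit cases the pattern with negative examples is the same: if the action being scored would produce exactly a span that annotators marked as wrong, one unit of cost is added. A toy version of the Unit case, using hypothetical (start, end, label) tuples instead of SpanC:

    from typing import Iterable, Tuple

    def unit_cost(b0: int, label: int, neg_spans: Iterable[Tuple[int, int, int]]) -> int:
        # neg_spans holds (start, end, label) triples of known-incorrect entities.
        cost = 0
        for start, end, neg_label in neg_spans:
            if neg_label == label and start == b0 and end == b0 + 1:
                cost += 1
                break
        return cost

    assert unit_cost(3, 7, [(3, 4, 7)]) == 1   # proposing exactly the bad single-token span
    assert unit_cost(3, 7, [(3, 5, 7)]) == 0   # different span, no extra cost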

View File

@ -0,0 +1,86 @@
from cymem.cymem cimport Pool
from libc.stdint cimport uint32_t, uint64_t
from libcpp.pair cimport pair
from libcpp.queue cimport priority_queue
from libcpp.vector cimport vector
from ...typedefs cimport class_t, hash_t, weight_t
ctypedef pair[weight_t, size_t] Entry
ctypedef priority_queue[Entry] Queue
ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1
ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL
ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1
ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1
ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0
cdef struct _State:
void* content
class_t* hist
weight_t score
weight_t loss
int i
int t
bint is_done
cdef class Beam:
cdef Pool mem
cdef class_t nr_class
cdef class_t width
cdef class_t size
cdef public weight_t min_density
cdef int t
cdef readonly bint is_done
cdef list histories
cdef list _parent_histories
cdef weight_t** scores
cdef int** is_valid
cdef weight_t** costs
cdef _State* _parents
cdef _State* _states
cdef del_func_t del_func
cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1
cdef inline void* at(self, int i) nogil:
return self._states[i].content
cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
void* extra_args) except -1
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
self.scores[i][j] = score
self.is_valid[i][j] = is_valid
self.costs[i][j] = cost
cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
const weight_t* costs) except -1
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1
cdef class MaxViolation:
cdef Pool mem
cdef weight_t cost
cdef weight_t delta
cdef readonly weight_t p_score
cdef readonly weight_t g_score
cdef readonly double Z
cdef readonly double gZ
cdef class_t n
cdef readonly list p_hist
cdef readonly list g_hist
cdef readonly list p_probs
cdef readonly list g_probs
cpdef int check(self, Beam pred, Beam gold) except -1
cpdef int check_crf(self, Beam pred, Beam gold) except -1
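These declarations are the C-level surface of the beam moved in from thinc.extra.search; the control flow it implements is ordinary beam search over transition histories. A simplified pure-Python illustration of that control flow (a sketch of the idea, not the C API):

    import heapq
    from typing import Callable, List, Tuple

    def beam_search(initial, expand: Callable, width: int, n_steps: int):
        # Keep the `width` best (score, history, state) tuples at every step,
        # which parallels the parent/state double buffer in the Beam class above.
        beam: List[Tuple[float, list, object]] = [(0.0, [], initial)]
        for _ in range(n_steps):
            candidates = []
            for score, history, state in beam:
                for delta, action, next_state in expand(state):
                    candidates.append((score + delta, history + [action], next_state))
            beam = heapq.nlargest(width, candidates, key=lambda c: c[0])
        return beam

    # Toy expansion: from any state, action 0 scores 0.1 and action 1 scores 0.3.
    print(beam_search(0, lambda s: [(0.1, 0, s), (0.3, 1, s)], width=2, n_steps=3))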

View File

@ -0,0 +1,303 @@
# cython: experimental_cpp_class_def=True, cdivision=True, infer_types=True
cimport cython
from cymem.cymem cimport Pool
from libc.math cimport exp
from libc.string cimport memcpy, memset
from preshed.maps cimport PreshMap
cdef class Beam:
def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0):
assert nr_class != 0
assert width != 0
self.nr_class = nr_class
self.width = width
self.min_density = min_density
self.size = 1
self.t = 0
self.mem = Pool()
self.del_func = NULL
self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State))
self._states = <_State*>self.mem.alloc(self.width, sizeof(_State))
cdef int i
self.histories = [[] for i in range(self.width)]
self._parent_histories = [[] for i in range(self.width)]
self.scores = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
self.is_valid = <int**>self.mem.alloc(self.width, sizeof(weight_t*))
self.costs = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
for i in range(self.width):
self.scores[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
self.is_valid[i] = <int*>self.mem.alloc(self.nr_class, sizeof(int))
self.costs[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
def __len__(self):
return self.size
property score:
def __get__(self):
return self._states[0].score
property min_score:
def __get__(self):
return self._states[self.size-1].score
property loss:
def __get__(self):
return self._states[0].loss
property probs:
def __get__(self):
return _softmax([self._states[i].score for i in range(self.size)])
property scores:
def __get__(self):
return [self._states[i].score for i in range(self.size)]
property histories:
def __get__(self):
return self.histories
cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
const weight_t* costs) except -1:
cdef int j
for j in range(self.nr_class):
self.scores[i][j] = scores[j]
self.is_valid[i][j] = is_valid[j]
self.costs[i][j] = costs[j]
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
cdef int i
for i in range(self.width):
memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class)
cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1:
for i in range(self.width):
self._states[i].content = init_func(self.mem, n, extra_args)
self._parents[i].content = init_func(self.mem, n, extra_args)
self.del_func = del_func
def __dealloc__(self):
if self.del_func == NULL:
return
for i in range(self.width):
self.del_func(self.mem, self._states[i].content, NULL)
self.del_func(self.mem, self._parents[i].content, NULL)
@cython.cdivision(True)
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
void* extra_args) except -1:
cdef weight_t** scores = self.scores
cdef int** is_valid = self.is_valid
cdef weight_t** costs = self.costs
cdef Queue* q = new Queue()
self._fill(q, scores, is_valid)
# For a beam of width k, we only ever need 2k state objects. How?
# Each transition takes a parent and a class and produces a new state.
# So, we don't need the whole history --- just the parent. So at
# each step, we take a parent, and apply one or more extensions to
# it.
self._parents, self._states = self._states, self._parents
self._parent_histories, self.histories = self.histories, self._parent_histories
cdef weight_t score
cdef int p_i
cdef int i = 0
cdef class_t clas
cdef _State* parent
cdef _State* state
cdef hash_t key
cdef PreshMap seen_states = PreshMap(self.width)
cdef uint64_t is_seen
cdef uint64_t one = 1
while i < self.width and not q.empty():
data = q.top()
p_i = data.second / self.nr_class
clas = data.second % self.nr_class
score = data.first
q.pop()
parent = &self._parents[p_i]
# Indicates terminal state reached; i.e. state is done
if parent.is_done:
# Now parent will not be changed, so we don't have to copy.
# Once finished, should also be unbranching.
self._states[i], parent[0] = parent[0], self._states[i]
parent.i = self._states[i].i
parent.t = self._states[i].t
parent.is_done = self._states[i].t
self._states[i].score = score
self.histories[i] = list(self._parent_histories[p_i])
i += 1
else:
state = &self._states[i]
# The supplied transition function should adjust the destination
# state to be the result of applying the class to the source state
transition_func(state.content, parent.content, clas, extra_args)
key = hash_func(state.content, extra_args) if hash_func is not NULL else 0
is_seen = <uint64_t>seen_states.get(key)
if key == 0 or key == 1 or not is_seen:
if key != 0 and key != 1:
seen_states.set(key, <void*>one)
state.score = score
state.loss = parent.loss + costs[p_i][clas]
self.histories[i] = list(self._parent_histories[p_i])
self.histories[i].append(clas)
i += 1
del q
self.size = i
assert self.size >= 1
for i in range(self.width):
memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class)
memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class)
memset(self.is_valid[i], 0, sizeof(int) * self.nr_class)
self.t += 1
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1:
cdef int i
for i in range(self.size):
if not self._states[i].is_done:
self._states[i].is_done = finish_func(self._states[i].content, extra_args)
for i in range(self.size):
if not self._states[i].is_done:
self.is_done = False
break
else:
self.is_done = True
@cython.cdivision(True)
cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1:
"""Populate the queue from a k * n matrix of scores, where k is the
beam-width, and n is the number of classes.
"""
cdef Entry entry
cdef _State* s
cdef int i, j, move_id
assert self.size >= 1
cdef vector[Entry] entries
for i in range(self.size):
s = &self._states[i]
move_id = i * self.nr_class
if s.is_done:
# Update score by path average, following TACL '13 paper.
if self.histories[i]:
entry.first = s.score + (s.score / self.t)
else:
entry.first = s.score
entry.second = move_id
entries.push_back(entry)
else:
for j in range(self.nr_class):
if is_valid[i][j]:
entry.first = s.score + scores[i][j]
entry.second = move_id + j
entries.push_back(entry)
cdef double max_, Z, cutoff
if self.min_density == 0.0:
for i in range(entries.size()):
q.push(entries[i])
elif not entries.empty():
max_ = entries[0].first
Z = 0.
cutoff = 0.
# Softmax into probabilities, so we can prune
for i in range(entries.size()):
if entries[i].first > max_:
max_ = entries[i].first
for i in range(entries.size()):
Z += exp(entries[i].first-max_)
cutoff = (1. / Z) * self.min_density
for i in range(entries.size()):
prob = exp(entries[i].first-max_) / Z
if prob >= cutoff:
q.push(entries[i])
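The min_density branch prunes low-probability continuations before they enter the queue: scores are softmaxed and an entry is kept only if its probability clears min_density / Z. A small numeric check of that rule, with made-up values:

    from math import exp

    def prune_by_density(scores, min_density):
        # Mirrors the pruning in Beam._fill above: softmax the candidate scores
        # and drop entries whose probability falls below min_density / Z.
        m = max(scores)
        Z = sum(exp(s - m) for s in scores)
        cutoff = (1.0 / Z) * min_density
        return [s for s in scores if exp(s - m) / Z >= cutoff]

    # With scores [2.0, 1.0, -3.0] and min_density 0.1, the last entry is pruned.
    print(prune_by_density([2.0, 1.0, -3.0], 0.1))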
cdef class MaxViolation:
def __init__(self):
self.p_score = 0.0
self.g_score = 0.0
self.Z = 0.0
self.gZ = 0.0
self.delta = -1
self.cost = 0
self.p_hist = []
self.g_hist = []
self.p_probs = []
self.g_probs = []
cpdef int check(self, Beam pred, Beam gold) except -1:
cdef _State* p = &pred._states[0]
cdef _State* g = &gold._states[0]
cdef weight_t d = p.score - g.score
if p.loss >= 1 and (self.cost == 0 or d > self.delta):
self.cost = p.loss
self.delta = d
self.p_hist = list(pred.histories[0])
self.g_hist = list(gold.histories[0])
self.p_score = p.score
self.g_score = g.score
self.Z = 1e-10
self.gZ = 1e-10
for i in range(pred.size):
if pred._states[i].loss > 0:
self.Z += exp(pred._states[i].score)
for i in range(gold.size):
if gold._states[i].loss == 0:
prob = exp(gold._states[i].score)
self.Z += prob
self.gZ += prob
cpdef int check_crf(self, Beam pred, Beam gold) except -1:
d = pred.score - gold.score
seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)])
if pred.loss > 0 and (self.cost == 0 or d > self.delta):
p_hist = []
p_scores = []
g_hist = []
g_scores = []
for i in range(pred.size):
if pred._states[i].loss > 0:
p_scores.append(pred._states[i].score)
p_hist.append(list(pred.histories[i]))
# This can happen from non-monotonic actions
# If we find a better gold analysis this way, be sure to keep it.
elif pred._states[i].loss <= 0 \
and tuple(pred.histories[i]) not in seen_golds:
g_scores.append(pred._states[i].score)
g_hist.append(list(pred.histories[i]))
for i in range(gold.size):
if gold._states[i].loss == 0:
g_scores.append(gold._states[i].score)
g_hist.append(list(gold.histories[i]))
all_probs = _softmax(p_scores + g_scores)
p_probs = all_probs[:len(p_scores)]
g_probs_all = all_probs[len(p_scores):]
g_probs = _softmax(g_scores)
self.cost = pred.loss
self.delta = d
self.p_hist = p_hist
self.g_hist = g_hist
# TODO: These variables are misnamed! These are the gradients of the loss.
self.p_probs = p_probs
# Intuition here:
# The gradient of the loss is:
# P(model) - P(truth)
# Normally, P(truth) is 1 for the gold
# But, if we want to do the "partial credit" scheme, we want
# to create a distribution over the gold, proportional to the scores
# awarded.
self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)]
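In other words, each violating history on the prediction side keeps its model probability as its gradient, while each gold-consistent history gets P(model) - P(truth), with the "truth" distribution spread over the gold beam in proportion to its scores. A toy recomputation of those quantities with made-up scores:

    from math import exp

    def softmax(xs):
        m = max(xs)
        zs = [exp(x - m) for x in xs]
        Z = sum(zs)
        return [z / Z for z in zs]

    p_scores = [2.0, 1.5]          # scores of violating (loss > 0) beam states
    g_scores = [1.8, 0.5]          # scores of gold-consistent states
    all_probs = softmax(p_scores + g_scores)
    p_probs = all_probs[: len(p_scores)]                     # gradient pushes these down
    g_grad = [x - y for x, y in zip(all_probs[len(p_scores):], softmax(g_scores))]
    print(p_probs, g_grad)          # gold entries get P(model) - P(truth)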
def _softmax(nums):
if not nums:
return []
max_ = max(nums)
nums = [(exp(n-max_) if n is not None else None) for n in nums]
Z = sum(n for n in nums if n is not None)
return [(n/Z if n is not None else None) for n in nums]

View File

@ -9,7 +9,6 @@ from collections import Counter
import srsly import srsly
from ...structs cimport TokenC from ...structs cimport TokenC
from ...typedefs cimport attr_t, weight_t
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ... import util from ... import util

View File

@ -10,7 +10,7 @@ from ..matcher import Matcher
from ..scorer import Scorer from ..scorer import Scorer
from ..symbols import IDS from ..symbols import IDS
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
from ..training import Example from ..training import Example
from ..util import SimpleFrozenList, registry from ..util import SimpleFrozenList, registry
from ..vocab import Vocab from ..vocab import Vocab

View File

@ -14,8 +14,11 @@ from ..scorer import Scorer
from ..training import remove_bilu_prefix from ..training import remove_bilu_prefix
from ..util import registry from ..util import registry
from ._parser_internals import nonproj from ._parser_internals import nonproj
from ._parser_internals.arc_eager import ArcEager
from ._parser_internals.nonproj import DELIMITER from ._parser_internals.nonproj import DELIMITER
from ._parser_internals.transition_system import TransitionSystem
from .functions import merge_subtokens from .functions import merge_subtokens
from .transition_parser import Parser
default_model_config = """ default_model_config = """
[model] [model]

View File

@ -1,11 +1,11 @@
from collections import Counter from collections import Counter
from itertools import islice from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
import numpy as np import numpy as np
import srsly import srsly
from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints2d from thinc.types import ArrayXd, Floats2d, Ints1d
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors
@ -18,6 +18,7 @@ from ._edit_tree_internals.schemas import validate_edit_tree
from .lemmatizer import lemmatizer_score from .lemmatizer import lemmatizer_score
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
# The cutoff value of *top_k* above which an alternative method is used to process guesses. # The cutoff value of *top_k* above which an alternative method is used to process guesses.
TOP_K_GUARDRAIL = 20 TOP_K_GUARDRAIL = 20
@ -50,6 +51,7 @@ DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["mo
"overwrite": False, "overwrite": False,
"top_k": 1, "top_k": 1,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
"save_activations": False,
}, },
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
@ -62,6 +64,7 @@ def make_edit_tree_lemmatizer(
overwrite: bool, overwrite: bool,
top_k: int, top_k: int,
scorer: Optional[Callable], scorer: Optional[Callable],
save_activations: bool,
): ):
"""Construct an EditTreeLemmatizer component.""" """Construct an EditTreeLemmatizer component."""
return EditTreeLemmatizer( return EditTreeLemmatizer(
@ -73,6 +76,7 @@ def make_edit_tree_lemmatizer(
overwrite=overwrite, overwrite=overwrite,
top_k=top_k, top_k=top_k,
scorer=scorer, scorer=scorer,
save_activations=save_activations,
) )
@ -92,6 +96,7 @@ class EditTreeLemmatizer(TrainablePipe):
overwrite: bool = False, overwrite: bool = False,
top_k: int = 1, top_k: int = 1,
scorer: Optional[Callable] = lemmatizer_score, scorer: Optional[Callable] = lemmatizer_score,
save_activations: bool = False,
): ):
""" """
Construct an edit tree lemmatizer. Construct an edit tree lemmatizer.
@ -103,6 +108,7 @@ class EditTreeLemmatizer(TrainablePipe):
frequency in the training data. frequency in the training data.
overwrite (bool): overwrite existing lemma annotations. overwrite (bool): overwrite existing lemma annotations.
top_k (int): try to apply at most the k most probable edit trees. top_k (int): try to apply at most the k most probable edit trees.
save_activations (bool): save model activations in Doc when annotating.
""" """
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
@ -117,6 +123,7 @@ class EditTreeLemmatizer(TrainablePipe):
self.cfg: Dict[str, Any] = {"labels": []} self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer self.scorer = scorer
self.save_activations = save_activations
self.numpy_ops = NumpyOps() self.numpy_ops = NumpyOps()
def get_loss( def get_loss(
@ -146,7 +153,26 @@ class EditTreeLemmatizer(TrainablePipe):
return float(loss), d_scores return float(loss), d_scores
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]: def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
"""
loss_func = SequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
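get_teacher_student_loss drives the new distillation path: the student's output distributions are scored directly against the teacher's. A minimal sketch of that call outside the pipeline (array shapes and values are made up; it assumes SequenceCategoricalCrossentropy is importable from thinc.api, as in the imports above):

    import numpy as np
    from thinc.api import SequenceCategoricalCrossentropy

    loss_func = SequenceCategoricalCrossentropy(normalize=False)
    teacher_scores = [np.asarray([[0.7, 0.2, 0.1]], dtype="float32")]   # one doc, one token
    student_scores = [np.asarray([[0.5, 0.3, 0.2]], dtype="float32")]
    d_scores, loss = loss_func(student_scores, teacher_scores)
    print(float(loss), d_scores[0].shape)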
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
if self.top_k == 1: if self.top_k == 1:
scores2guesses = self._scores2guesses_top_k_equals_1 scores2guesses = self._scores2guesses_top_k_equals_1
elif self.top_k <= TOP_K_GUARDRAIL: elif self.top_k <= TOP_K_GUARDRAIL:
@ -163,14 +189,19 @@ class EditTreeLemmatizer(TrainablePipe):
if not any(len(doc) for doc in docs): if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs. # Handle cases where there are no tokens in any docs.
n_labels = len(self.cfg["labels"]) n_labels = len(self.cfg["labels"])
guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs] guesses: List[Ints1d] = [
self.model.ops.alloc((0,), dtype="i") for doc in docs
]
scores: List[Floats2d] = [
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
]
assert len(guesses) == n_docs assert len(guesses) == n_docs
return guesses return {"probabilities": scores, "tree_ids": guesses}
scores = self.model.predict(docs) scores = self.model.predict(docs)
assert len(scores) == n_docs assert len(scores) == n_docs
guesses = scores2guesses(docs, scores) guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs assert len(guesses) == n_docs
return guesses return {"probabilities": scores, "tree_ids": guesses}
def _scores2guesses_top_k_equals_1(self, docs, scores): def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = [] guesses = []
@ -230,8 +261,13 @@ class EditTreeLemmatizer(TrainablePipe):
return guesses return guesses
def set_annotations(self, docs: Iterable[Doc], batch_tree_ids): def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
batch_tree_ids = activations["tree_ids"]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
doc.activations[self.name][act_name] = acts[i]
doc_tree_ids = batch_tree_ids[i] doc_tree_ids = batch_tree_ids[i]
if hasattr(doc_tree_ids, "get"): if hasattr(doc_tree_ids, "get"):
doc_tree_ids = doc_tree_ids.get() doc_tree_ids = doc_tree_ids.get()
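With save_activations enabled, set_annotations copies the per-doc activations into Doc.activations under the component's name, so they can be inspected after the pipeline has run. A hedged usage sketch (pipeline name is illustrative; the activation keys come from the predict return value above):

    import spacy

    nlp = spacy.load("my_trained_model")                 # hypothetical pipeline
    lemmatizer = nlp.get_pipe("trainable_lemmatizer")
    lemmatizer.save_activations = True
    doc = nlp("She was reading")
    acts = doc.activations["trainable_lemmatizer"]
    print(acts["tree_ids"], acts["probabilities"].shape)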

View File

@ -1,27 +1,40 @@
import random import random
from itertools import islice import warnings
from itertools import islice, tee
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Union from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Sequence,
Union,
cast,
)
import srsly import srsly
from numpy import dtype
from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
from thinc.types import Floats2d from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors, Warnings
from ..kb import Candidate, KnowledgeBase from ..kb import Candidate, KnowledgeBase
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc, Span from ..tokens import Doc, Span, SpanGroup
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_examples, validate_get_examples
from ..util import SimpleFrozenList, registry from ..util import SimpleFrozenList, registry
from ..vocab import Vocab from ..vocab import Vocab
from .legacy.entity_linker import EntityLinker_v1
from .pipe import deserialize_config from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
# See #9050 ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
BACKWARD_OVERWRITE = True
KNOWLEDGE_BASE_IDS = "kb_ids"
default_model_config = """ default_model_config = """
[model] [model]
@ -51,14 +64,13 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_prior": True, "incl_prior": True,
"incl_context": True, "incl_context": True,
"entity_vector_length": 64, "entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates": {"@misc": "spacy.CandidateGenerator.v2"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, "overwrite": False,
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True, "use_gold_ents": True,
"candidates_batch_size": 1,
"threshold": None, "threshold": None,
"save_activations": False,
}, },
default_score_weights={ default_score_weights={
"nel_micro_f": 1.0, "nel_micro_f": 1.0,
@ -76,16 +88,15 @@ def make_entity_linker(
incl_prior: bool, incl_prior: bool,
incl_context: bool, incl_context: bool,
entity_vector_length: int, entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates: Callable[
get_candidates_batch: Callable[ [KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
], ],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool, overwrite: bool,
scorer: Optional[Callable], scorer: Optional[Callable],
use_gold_ents: bool, use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None, threshold: Optional[float] = None,
save_activations: bool,
): ):
"""Construct an EntityLinker component. """Construct an EntityLinker component.
@ -97,35 +108,20 @@ def make_entity_linker(
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model. incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB. entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]):
produces a list of candidates, given a certain knowledge base and a textual mention. Function producing a list of candidates per document, given a certain knowledge base and several textual
get_candidates_batch ( documents with textual mentions.
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method. scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
component must provide entity annotations. component must provide entity annotations.
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold. prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating.
""" """
if not model.attrs.get("include_span_maker", False): if not model.attrs.get("include_span_maker", False):
# The only difference in arguments here is that use_gold_ents and threshold aren't available. raise ValueError(Errors.E4005)
return EntityLinker_v1(
nlp.vocab,
model,
name,
labels_discard=labels_discard,
n_sents=n_sents,
incl_prior=incl_prior,
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
overwrite=overwrite,
scorer=scorer,
)
return EntityLinker( return EntityLinker(
nlp.vocab, nlp.vocab,
model, model,
@ -136,13 +132,12 @@ def make_entity_linker(
incl_context=incl_context, incl_context=incl_context,
entity_vector_length=entity_vector_length, entity_vector_length=entity_vector_length,
get_candidates=get_candidates, get_candidates=get_candidates,
get_candidates_batch=get_candidates_batch,
generate_empty_kb=generate_empty_kb, generate_empty_kb=generate_empty_kb,
overwrite=overwrite, overwrite=overwrite,
scorer=scorer, scorer=scorer,
use_gold_ents=use_gold_ents, use_gold_ents=use_gold_ents,
candidates_batch_size=candidates_batch_size,
threshold=threshold, threshold=threshold,
save_activations=save_activations,
) )
@ -174,16 +169,16 @@ class EntityLinker(TrainablePipe):
incl_prior: bool, incl_prior: bool,
incl_context: bool, incl_context: bool,
entity_vector_length: int, entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates: Callable[
get_candidates_batch: Callable[ [KnowledgeBase, Iterator[SpanGroup]],
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] Iterator[Iterable[Iterable[Candidate]]],
], ],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool = BACKWARD_OVERWRITE, overwrite: bool = False,
scorer: Optional[Callable] = entity_linker_score, scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool, use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None, threshold: Optional[float] = None,
save_activations: bool = False,
) -> None: ) -> None:
"""Initialize an entity linker. """Initialize an entity linker.
@ -196,19 +191,17 @@ class EntityLinker(TrainablePipe):
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model. incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB. entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]):
produces a list of candidates, given a certain knowledge base and a textual mention. Function producing a list of candidates per document, given a certain knowledge base and several textual
get_candidates_batch ( documents with textual mentions.
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
overwrite (bool): Whether to overwrite existing non-empty annotations.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations. component must provide entity annotations.
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold. threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/entitylinker#init DOCS: https://spacy.io/api/entitylinker#init
""" """
@ -230,16 +223,15 @@ class EntityLinker(TrainablePipe):
self.incl_prior = incl_prior self.incl_prior = incl_prior
self.incl_context = incl_context self.incl_context = incl_context
self.get_candidates = get_candidates self.get_candidates = get_candidates
self.get_candidates_batch = get_candidates_batch
self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False) self.distance = CosineDistance(normalize=False)
self.kb = generate_empty_kb(self.vocab, entity_vector_length) self.kb = generate_empty_kb(self.vocab, entity_vector_length)
self.use_gold_ents = use_gold_ents self.use_gold_ents = use_gold_ents
self.candidates_batch_size = candidates_batch_size
self.threshold = threshold self.threshold = threshold
self.save_activations = save_activations
if candidates_batch_size < 1: if self.incl_prior and not self.kb.supports_prior_probs:
raise ValueError(Errors.E1044) warnings.warn(Warnings.W401)
def _score_with_ents_set(examples: Iterable[Example], **kwargs): def _score_with_ents_set(examples: Iterable[Example], **kwargs):
# Because of how spaCy works, we can't just score immediately, because Language.evaluate # Because of how spaCy works, we can't just score immediately, because Language.evaluate
@ -346,11 +338,12 @@ class EntityLinker(TrainablePipe):
If one isn't present, then the update step needs to be skipped. If one isn't present, then the update step needs to be skipped.
""" """
for candidates_for_doc in self.get_candidates(
for eg in examples: self.kb,
for ent in eg.predicted.ents: (SpanGroup(doc=eg.predicted, spans=eg.predicted.ents) for eg in examples),
candidates = list(self.get_candidates(self.kb, ent)) ):
if candidates: for candidates_for_mention in candidates_for_doc:
if list(candidates_for_mention):
return True return True
return False return False
@ -442,7 +435,7 @@ class EntityLinker(TrainablePipe):
loss = loss / len(entity_encodings) loss = loss / len(entity_encodings)
return float(loss), out return float(loss), out
def predict(self, docs: Iterable[Doc]) -> List[str]: def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them. """Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is Returns the KB IDs for each entity in each doc, including NIL if there is
no prediction. no prediction.
@ -455,40 +448,47 @@ class EntityLinker(TrainablePipe):
self.validate_kb() self.validate_kb()
entity_count = 0 entity_count = 0
final_kb_ids: List[str] = [] final_kb_ids: List[str] = []
xp = self.model.ops.xp ops = self.model.ops
xp = ops.xp
docs_ents: List[Ragged] = []
docs_scores: List[Ragged] = []
if not docs: if not docs:
return final_kb_ids return {
KNOWLEDGE_BASE_IDS: final_kb_ids,
"ents": docs_ents,
"scores": docs_scores,
}
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
for i, doc in enumerate(docs):
if len(doc) == 0: docs_iters = tee(docs, 2)
# Call candidate generator.
all_ent_cands = self.get_candidates(
self.kb,
(
SpanGroup(
doc,
spans=[
ent for ent in doc.ents if ent.label_ not in self.labels_discard
],
)
for doc in docs_iters[0]
),
)
for doc in docs_iters[1]:
doc_ents: List[Ints1d] = []
doc_scores: List[Floats1d] = []
if len(doc) == 0 or len(doc.ents) == 0:
docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
continue continue
sentences = [s for s in doc.sents] sentences = [s for s in doc.sents]
doc_ent_cands = list(next(all_ent_cands))
# Loop over entities in batches. # Looping over candidate entities for this doc. (TODO: rewrite)
for ent_idx in range(0, len(doc.ents), self.candidates_batch_size): for ent_cand_idx, ent in enumerate(doc.ents):
ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
# Look up candidate entities.
valid_ent_idx = [
idx
for idx in range(len(ent_batch))
if ent_batch[idx].label_ not in self.labels_discard
]
batch_candidates = list(
self.get_candidates_batch(
self.kb, [ent_batch[idx] for idx in valid_ent_idx]
)
if self.candidates_batch_size > 1
else [
self.get_candidates(self.kb, ent_batch[idx])
for idx in valid_ent_idx
]
)
# Looping through each entity in batch (TODO: rewrite)
for j, ent in enumerate(ent_batch):
assert hasattr(ent, "sents") assert hasattr(ent, "sents")
sents = list(ent.sents) sents = list(ent.sents)
sent_indices = ( sent_indices = (
@ -506,7 +506,6 @@ class EntityLinker(TrainablePipe):
start_token = sentences[start_sentence].start start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc() sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined) # currently, the context is the same for each entity in a sentence (should be refined)
sentence_encoding = self.model.predict([sent_doc])[0] sentence_encoding = self.model.predict([sent_doc])[0]
sentence_encoding_t = sentence_encoding.T sentence_encoding_t = sentence_encoding.T
@ -515,21 +514,41 @@ class EntityLinker(TrainablePipe):
if ent.label_ in self.labels_discard: if ent.label_ in self.labels_discard:
# ignoring this entity - setting to NIL # ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL) final_kb_ids.append(self.NIL)
self._add_activations(
doc_scores=doc_scores,
doc_ents=doc_ents,
scores=[0.0],
ents=[0],
)
else: else:
candidates = list(batch_candidates[j]) candidates = list(doc_ent_cands[ent_cand_idx])
if not candidates: if not candidates:
# no prediction possible for this entity - setting to NIL # no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL) final_kb_ids.append(self.NIL)
self._add_activations(
doc_scores=doc_scores,
doc_ents=doc_ents,
scores=[0.0],
ents=[0],
)
elif len(candidates) == 1 and self.threshold is None: elif len(candidates) == 1 and self.threshold is None:
# shortcut for efficiency reasons: take the 1 candidate # shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_) final_kb_ids.append(candidates[0].entity_id_)
self._add_activations(
doc_scores=doc_scores,
doc_ents=doc_ents,
scores=[1.0],
ents=[candidates[0].entity_id],
)
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
scores = prior_probs = xp.asarray(
[
c.prior_prob if self.incl_prior else 0.0
for c in candidates
]
)
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
@ -551,33 +570,58 @@ class EntityLinker(TrainablePipe):
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
final_kb_ids.append(
candidates[scores.argmax().item()].entity_
if self.threshold is None
or scores.max() >= self.threshold
candidates[scores.argmax().item()].entity_id_
if self.threshold is None or scores.max() >= self.threshold
else EntityLinker.NIL
)
self._add_activations(
doc_scores=doc_scores,
doc_ents=doc_ents,
scores=scores,
ents=[c.entity_id for c in candidates],
)
self._add_doc_activations(
docs_scores=docs_scores,
docs_ents=docs_ents,
doc_scores=doc_scores,
doc_ents=doc_ents,
)
if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format(
method="predict", msg="result variables not of equal length"
)
raise RuntimeError(err)
return final_kb_ids
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
return {
KNOWLEDGE_BASE_IDS: final_kb_ids,
"ents": docs_ents,
"scores": docs_scores,
}
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
activations (ActivationsT): The activations used for setting annotations, produced
by EntityLinker.predict.
DOCS: https://spacy.io/api/entitylinker#set_annotations
"""
kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
count_ents = len([ent for doc in docs for ent in doc.ents])
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0
overwrite = self.cfg["overwrite"]
for doc in docs:
for j, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
if act_name != KNOWLEDGE_BASE_IDS:
# We only copy activations that are Ragged.
doc.activations[self.name][act_name] = cast(Ragged, acts[j])
for ent in doc.ents:
kb_id = kb_ids[i]
i += 1
@ -676,3 +720,32 @@ class EntityLinker(TrainablePipe):
def add_label(self, label):
raise NotImplementedError
def _add_doc_activations(
self,
*,
docs_scores: List[Ragged],
docs_ents: List[Ragged],
doc_scores: List[Floats1d],
doc_ents: List[Ints1d],
):
if not self.save_activations:
return
ops = self.model.ops
lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
def _add_activations(
self,
*,
doc_scores: List[Floats1d],
doc_ents: List[Ints1d],
scores: Sequence[float],
ents: Sequence[int],
):
if not self.save_activations:
return
ops = self.model.ops
doc_scores.append(ops.asarray1f(scores))
doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
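For orientation, a minimal sketch of reading these stored activations back after processing, assuming an initialized pipeline whose entity_linker was constructed with save_activations=True (the nlp variable and text are illustrative):

# Hypothetical usage; assumes save_activations=True on the component.
doc = nlp("Douglas Adams wrote it.")
acts = doc.activations["entity_linker"]
# One Ragged row per entity: candidate KB hashes and their scores.
print(acts["ents"].lengths, acts["scores"].dataXd)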

View File

@ -1,541 +0,0 @@
import warnings
from collections import defaultdict
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
import srsly
from ..errors import Errors, Warnings
from ..language import Language
from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from ..scorer import get_ner_prf
from ..tokens import Doc, Span
from ..training import Example
from ..util import SimpleFrozenList, ensure_path, from_disk, registry, to_disk
from .pipe import Pipe
DEFAULT_ENT_ID_SEP = "||"
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
@Language.factory(
"entity_ruler",
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
default_config={
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)
def make_entity_ruler(
nlp: Language,
name: str,
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite_ents: bool,
ent_id_sep: str,
scorer: Optional[Callable],
):
return EntityRuler(
nlp,
name,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep,
scorer=scorer,
)
def entity_ruler_score(examples, **kwargs):
return get_ner_prf(examples)
@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
return entity_ruler_score
class EntityRuler(Pipe):
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
rules or exact phrase matches. It can be combined with the statistical
`EntityRecognizer` to boost accuracy, or used on its own to implement a
purely rule-based entity recognition system. After initialization, the
component is typically added to the pipeline using `nlp.add_pipe`.
DOCS: https://spacy.io/api/entityruler
USAGE: https://spacy.io/usage/rule-based-matching#entityruler
"""
def __init__(
self,
nlp: Language,
name: str = "entity_ruler",
*,
phrase_matcher_attr: Optional[Union[int, str]] = None,
matcher_fuzzy_compare: Callable = levenshtein_compare,
validate: bool = False,
overwrite_ents: bool = False,
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
patterns: Optional[List[PatternType]] = None,
scorer: Optional[Callable] = entity_ruler_score,
) -> None:
"""Initialize the entity ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"`
key. A pattern can either be a token pattern (list) or a phrase pattern
(string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
nlp (Language): The shared nlp object to pass the vocab to the matchers
and process phrase patterns.
name (str): Instance name of the current pipeline component. Typically
passed in automatically from the factory when the component is
added. Used to disable the current entity ruler while creating
phrase patterns with the nlp object.
phrase_matcher_attr (int / str): Token attribute to match on, passed
to the internal PhraseMatcher as `attr`.
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
internal Matcher. Defaults to
spacy.matcher.levenshtein.levenshtein_compare.
validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate`
patterns (iterable): Optional patterns to load in.
overwrite_ents (bool): If existing entities are present, e.g. entities
added by the model, overwrite them by matches if necessary.
ent_id_sep (str): Separator used internally for entity IDs.
scorer (Optional[Callable]): The scoring method. Defaults to
spacy.scorer.get_ner_prf.
DOCS: https://spacy.io/api/entityruler#init
"""
self.nlp = nlp
self.name = name
self.overwrite = overwrite_ents
self.token_patterns = defaultdict(list) # type: ignore
self.phrase_patterns = defaultdict(list) # type: ignore
self._validate = validate
self.matcher_fuzzy_compare = matcher_fuzzy_compare
self.matcher = Matcher(
nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
)
self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
)
self.ent_id_sep = ent_id_sep
self._ent_ids = defaultdict(tuple) # type: ignore
if patterns is not None:
self.add_patterns(patterns)
self.scorer = scorer
def __len__(self) -> int:
"""The number of all patterns added to the entity ruler."""
n_token_patterns = sum(len(p) for p in self.token_patterns.values())
n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
return n_token_patterns + n_phrase_patterns
def __contains__(self, label: str) -> bool:
"""Whether a label is present in the patterns."""
return label in self.token_patterns or label in self.phrase_patterns
def __call__(self, doc: Doc) -> Doc:
"""Find matches in document and add them as entities.
doc (Doc): The Doc object in the pipeline.
RETURNS (Doc): The Doc with added entities, if available.
DOCS: https://spacy.io/api/entityruler#call
"""
error_handler = self.get_error_handler()
try:
matches = self.match(doc)
self.set_annotations(doc, matches)
return doc
except Exception as e:
return error_handler(self.name, self, [doc], e)
def match(self, doc: Doc):
self._require_patterns()
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W036")
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
final_matches = set(
[(m_id, start, end) for m_id, start, end in matches if start != end]
)
get_sort_key = lambda m: (m[2] - m[1], -m[1])
final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
return final_matches
def set_annotations(self, doc, matches):
"""Modify the document in place"""
entities = list(doc.ents)
new_entities = []
seen_tokens = set()
for match_id, start, end in matches:
if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
continue
# check for end - 1 here because boundaries are inclusive
if start not in seen_tokens and end - 1 not in seen_tokens:
if match_id in self._ent_ids:
label, ent_id = self._ent_ids[match_id]
span = Span(doc, start, end, label=label, span_id=ent_id)
else:
span = Span(doc, start, end, label=match_id)
new_entities.append(span)
entities = [
e for e in entities if not (e.start < end and e.end > start)
]
seen_tokens.update(range(start, end))
doc.ents = entities + new_entities
@property
def labels(self) -> Tuple[str, ...]:
"""All labels present in the match patterns.
RETURNS (set): The string labels.
DOCS: https://spacy.io/api/entityruler#labels
"""
keys = set(self.token_patterns.keys())
keys.update(self.phrase_patterns.keys())
all_labels = set()
for l in keys:
if self.ent_id_sep in l:
label, _ = self._split_label(l)
all_labels.add(label)
else:
all_labels.add(l)
return tuple(sorted(all_labels))
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
patterns: Optional[Sequence[PatternType]] = None,
):
"""Initialize the pipe for training.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
patterns Optional[Iterable[PatternType]]: The list of patterns.
DOCS: https://spacy.io/api/entityruler#initialize
"""
self.clear()
if patterns:
self.add_patterns(patterns) # type: ignore[arg-type]
@property
def ent_ids(self) -> Tuple[Optional[str], ...]:
"""All entity ids present in the match patterns `id` properties
RETURNS (set): The string entity ids.
DOCS: https://spacy.io/api/entityruler#ent_ids
"""
keys = set(self.token_patterns.keys())
keys.update(self.phrase_patterns.keys())
all_ent_ids = set()
for l in keys:
if self.ent_id_sep in l:
_, ent_id = self._split_label(l)
all_ent_ids.add(ent_id)
return tuple(all_ent_ids)
@property
def patterns(self) -> List[PatternType]:
"""Get all patterns that were added to the entity ruler.
RETURNS (list): The original patterns, one dictionary per pattern.
DOCS: https://spacy.io/api/entityruler#patterns
"""
all_patterns = []
for label, patterns in self.token_patterns.items():
for pattern in patterns:
ent_label, ent_id = self._split_label(label)
p = {"label": ent_label, "pattern": pattern}
if ent_id:
p["id"] = ent_id
all_patterns.append(p)
for label, patterns in self.phrase_patterns.items():
for pattern in patterns:
ent_label, ent_id = self._split_label(label)
p = {"label": ent_label, "pattern": pattern.text}
if ent_id:
p["id"] = ent_id
all_patterns.append(p)
return all_patterns
def add_patterns(self, patterns: List[PatternType]) -> None:
"""Add patterns to the entity ruler. A pattern can either be a token
pattern (list of dicts) or a phrase pattern (string). For example:
{'label': 'ORG', 'pattern': 'Apple'}
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
patterns (list): The patterns to add.
DOCS: https://spacy.io/api/entityruler#add_patterns
"""
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
try:
current_index = -1
for i, (name, pipe) in enumerate(self.nlp.pipeline):
if self == pipe:
current_index = i
break
subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
except ValueError:
subsequent_pipes = []
with self.nlp.select_pipes(disable=subsequent_pipes):
token_patterns = []
phrase_pattern_labels = []
phrase_pattern_texts = []
phrase_pattern_ids = []
for entry in patterns:
if isinstance(entry["pattern"], str):
phrase_pattern_labels.append(entry["label"])
phrase_pattern_texts.append(entry["pattern"])
phrase_pattern_ids.append(entry.get("id"))
elif isinstance(entry["pattern"], list):
token_patterns.append(entry)
phrase_patterns = []
for label, pattern, ent_id in zip(
phrase_pattern_labels,
self.nlp.pipe(phrase_pattern_texts),
phrase_pattern_ids,
):
phrase_pattern = {"label": label, "pattern": pattern}
if ent_id:
phrase_pattern["id"] = ent_id
phrase_patterns.append(phrase_pattern)
for entry in token_patterns + phrase_patterns: # type: ignore[operator]
label = entry["label"] # type: ignore
if "id" in entry:
ent_label = label
label = self._create_label(label, entry["id"])
key = self.matcher._normalize_key(label)
self._ent_ids[key] = (ent_label, entry["id"])
pattern = entry["pattern"] # type: ignore
if isinstance(pattern, Doc):
self.phrase_patterns[label].append(pattern)
self.phrase_matcher.add(label, [pattern]) # type: ignore
elif isinstance(pattern, list):
self.token_patterns[label].append(pattern)
self.matcher.add(label, [pattern])
else:
raise ValueError(Errors.E097.format(pattern=pattern))
def clear(self) -> None:
"""Reset all patterns."""
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
self._ent_ids = defaultdict(tuple)
self.matcher = Matcher(
self.nlp.vocab,
validate=self._validate,
fuzzy_compare=self.matcher_fuzzy_compare,
)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
)
def remove(self, ent_id: str) -> None:
"""Remove a pattern by its ent_id if a pattern with this ent_id was added before
ent_id (str): id of the pattern to be removed
RETURNS: None
DOCS: https://spacy.io/api/entityruler#remove
"""
label_id_pairs = [
(label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
]
if not label_id_pairs:
raise ValueError(
Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
)
created_labels = [
self._create_label(label, eid) for (label, eid) in label_id_pairs
]
# remove the patterns from self.phrase_patterns
self.phrase_patterns = defaultdict(
list,
{
label: val
for (label, val) in self.phrase_patterns.items()
if label not in created_labels
},
)
# remove the patterns from self.token_pattern
self.token_patterns = defaultdict(
list,
{
label: val
for (label, val) in self.token_patterns.items()
if label not in created_labels
},
)
# remove the patterns from the matchers
for label in created_labels:
if label in self.phrase_matcher:
self.phrase_matcher.remove(label)
else:
self.matcher.remove(label)
def _require_patterns(self) -> None:
"""Raise a warning if this component has no patterns defined."""
if len(self) == 0:
warnings.warn(Warnings.W036.format(name=self.name))
def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
label (str): The value of label in a pattern entry
RETURNS (tuple): ent_label, ent_id
"""
if self.ent_id_sep in label:
ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
else:
ent_label = label
ent_id = None # type: ignore
return ent_label, ent_id
def _create_label(self, label: Any, ent_id: Any) -> str:
"""Join Entity label with ent_id if the pattern has an `id` attribute
If ent_id is not a string, the label is returned as is.
label (str): The label to set for ent.label_
ent_id (str): The label
RETURNS (str): The ent_label joined with configured `ent_id_sep`
"""
if isinstance(ent_id, str):
label = f"{label}{self.ent_id_sep}{ent_id}"
return label
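A small self-contained illustration of the join/split round trip these two helpers implement (values illustrative):

DEFAULT_ENT_ID_SEP = "||"

label = f"ORG{DEFAULT_ENT_ID_SEP}apple-inc"  # what _create_label("ORG", "apple-inc") builds
ent_label, ent_id = label.rsplit(DEFAULT_ENT_ID_SEP, 1)  # what _split_label undoes
assert (ent_label, ent_id) == ("ORG", "apple-inc")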
def from_bytes(
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler":
"""Load the entity ruler from a bytestring.
patterns_bytes (bytes): The bytestring to load.
RETURNS (EntityRuler): The loaded entity ruler.
DOCS: https://spacy.io/api/entityruler#from_bytes
"""
cfg = srsly.msgpack_loads(patterns_bytes)
self.clear()
if isinstance(cfg, dict):
self.add_patterns(cfg.get("patterns", cfg))
self.overwrite = cfg.get("overwrite", False)
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab,
attr=self.phrase_matcher_attr,
)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
else:
self.add_patterns(cfg)
return self
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the entity ruler patterns to a bytestring.
RETURNS (bytes): The serialized patterns.
DOCS: https://spacy.io/api/entityruler#to_bytes
"""
serial = {
"overwrite": self.overwrite,
"ent_id_sep": self.ent_id_sep,
"phrase_matcher_attr": self.phrase_matcher_attr,
"patterns": self.patterns,
}
return srsly.msgpack_dumps(serial)
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler":
"""Load the entity ruler from a file. Expects a file containing
newline-delimited JSON (JSONL) with one entry per line.
path (str / Path): The JSONL file to load.
RETURNS (EntityRuler): The loaded entity ruler.
DOCS: https://spacy.io/api/entityruler#from_disk
"""
path = ensure_path(path)
self.clear()
depr_patterns_path = path.with_suffix(".jsonl")
if path.suffix == ".jsonl": # user provides a jsonl
if path.is_file():
patterns = srsly.read_jsonl(path)
self.add_patterns(patterns)
else:
raise ValueError(Errors.E1023.format(path=path))
elif depr_patterns_path.is_file():
patterns = srsly.read_jsonl(depr_patterns_path)
self.add_patterns(patterns)
elif path.is_dir(): # path is a valid directory
cfg = {}
deserializers_patterns = {
"patterns": lambda p: self.add_patterns(
srsly.read_jsonl(p.with_suffix(".jsonl"))
)
}
deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
from_disk(path, deserializers_cfg, {})
self.overwrite = cfg.get("overwrite", False)
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
from_disk(path, deserializers_patterns, {})
else: # path is not a valid directory or file
raise ValueError(Errors.E146.format(path=path))
return self
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Save the entity ruler patterns to a directory. The patterns will be
saved as newline-delimited JSON (JSONL).
path (str / Path): The JSONL file to save.
DOCS: https://spacy.io/api/entityruler#to_disk
"""
path = ensure_path(path)
cfg = {
"overwrite": self.overwrite,
"phrase_matcher_attr": self.phrase_matcher_attr,
"ent_id_sep": self.ent_id_sep,
}
serializers = {
"patterns": lambda p: srsly.write_jsonl(
p.with_suffix(".jsonl"), self.patterns
),
"cfg": lambda p: srsly.write_json(p, cfg),
}
if path.suffix == ".jsonl": # user wants to save only JSONL
srsly.write_jsonl(path, self.patterns)
else:
to_disk(path, serializers, {})
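For reference, the patterns file serialized here is newline-delimited JSON with one pattern per line; an illustrative patterns.jsonl:

{"label": "ORG", "pattern": "Apple"}
{"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}], "id": "san-francisco"}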

View File

@ -1,3 +0,0 @@
from .entity_linker import EntityLinker_v1
__all__ = ["EntityLinker_v1"]

View File

@ -1,422 +0,0 @@
# This file is present to provide a prior version of the EntityLinker component
# for backwards compatibility. For details see #9669.
import random
import warnings
from itertools import islice
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
import srsly
from thinc.api import CosineDistance, Model, Optimizer, set_dropout_rate
from thinc.types import Floats2d
from ... import util
from ...errors import Errors, Warnings
from ...kb import Candidate, KnowledgeBase
from ...language import Language
from ...ml import empty_kb
from ...scorer import Scorer
from ...tokens import Doc, Span
from ...training import Example, validate_examples, validate_get_examples
from ...util import SimpleFrozenList
from ...vocab import Vocab
from ..pipe import deserialize_config
from ..trainable_pipe import TrainablePipe
# See #9050
BACKWARD_OVERWRITE = True
def entity_linker_score(examples, **kwargs):
return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)
class EntityLinker_v1(TrainablePipe):
"""Pipeline component for named entity linking.
DOCS: https://spacy.io/api/entitylinker
"""
NIL = "NIL" # string used to refer to a non-existing link
def __init__(
self,
vocab: Vocab,
model: Model,
name: str = "entity_linker",
*,
labels_discard: Iterable[str],
n_sents: int,
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
) -> None:
"""Initialize an entity linker.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
n_sents (int): The number of neighbouring sentences to take into account.
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
self.vocab = vocab
self.model = model
self.name = name
self.labels_discard = list(labels_discard)
self.n_sents = n_sents
self.incl_prior = incl_prior
self.incl_context = incl_context
self.get_candidates = get_candidates
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
create it using this object's vocab."""
if not callable(kb_loader):
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
self.kb = kb_loader(self.vocab)
def validate_kb(self) -> None:
# Raise an error if the knowledge base is not initialized.
if self.kb is None:
raise ValueError(Errors.E1018.format(name=self.name))
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
Note that providing this argument will overwrite all data accumulated in the current KB.
Use this only when loading an existing KB as-is from file.
DOCS: https://spacy.io/api/entitylinker#initialize
"""
validate_get_examples(get_examples, "EntityLinker_v1.initialize")
if kb_loader is not None:
self.set_kb(kb_loader)
self.validate_kb()
nO = self.kb.entity_vector_length
doc_sample = []
vector_sample = []
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
vector_sample.append(self.model.ops.alloc1f(nO))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
)
def update(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/entitylinker#update
"""
self.validate_kb()
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
if not examples:
return losses
validate_examples(examples, "EntityLinker_v1.update")
sentence_docs = []
for eg in examples:
sentences = [s for s in eg.reference.sents]
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.reference.ents:
# KB ID of the first token is the same as the whole span
kb_id = kb_ids[ent.start]
if kb_id:
try:
# find the sentence in the list of sentences.
sent_index = sentences.index(ent.sent)
except AttributeError:
# Catch the exception when ent.sent is None and provide a user-friendly warning
raise RuntimeError(Errors.E030) from None
# get n previous sentences, if there are any
start_sentence = max(0, sent_index - self.n_sents)
# get n posterior sentences, or as many < n as there are
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
# get token positions
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
# append that span as a doc to training
sent_doc = eg.predicted[start_token:end_token].as_doc()
sentence_docs.append(sent_doc)
set_dropout_rate(self.model, drop)
if not sentence_docs:
warnings.warn(Warnings.W093.format(name="Entity Linker"))
return losses
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
loss, d_scores = self.get_loss(
sentence_encodings=sentence_encodings, examples=examples
)
bp_context(d_scores)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
validate_examples(examples, "EntityLinker_v1.get_loss")
entity_encodings = []
for eg in examples:
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.reference.ents:
kb_id = kb_ids[ent.start]
if kb_id:
entity_encoding = self.kb.get_vector(kb_id)
entity_encodings.append(entity_encoding)
entity_encodings = self.model.ops.asarray2f(entity_encodings)
if sentence_encodings.shape != entity_encodings.shape:
err = Errors.E147.format(
method="get_loss", msg="gold entities do not match up"
)
raise RuntimeError(err)
gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
loss = self.distance.get_loss(sentence_encodings, entity_encodings)
loss = loss / len(entity_encodings)
return float(loss), gradients
def predict(self, docs: Iterable[Doc]) -> List[str]:
"""Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is
no prediction.
docs (Iterable[Doc]): The documents to predict.
RETURNS (List[str]): The model's prediction for each document.
DOCS: https://spacy.io/api/entitylinker#predict
"""
self.validate_kb()
entity_count = 0
final_kb_ids: List[str] = []
if not docs:
return final_kb_ids
if isinstance(docs, Doc):
docs = [docs]
for i, doc in enumerate(docs):
sentences = [s for s in doc.sents]
if len(doc) > 0:
# Looping through each entity (TODO: rewrite)
for ent in doc.ents:
sent = ent.sent
sent_index = sentences.index(sent)
assert sent_index >= 0
# get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
xp = self.model.ops.xp
if self.incl_context:
sentence_encoding = self.model.predict([sent_doc])[0]
sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t)
entity_count += 1
if ent.label_ in self.labels_discard:
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
else:
candidates = list(self.get_candidates(self.kb, ent))
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
elif len(candidates) == 1:
# shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_)
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
if len(entity_encodings) != len(prior_probs):
raise RuntimeError(
Errors.E147.format(
method="predict",
msg="vectors not of equal length",
)
)
# cosine similarity
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
best_index = scores.argmax().item()
best_candidate = candidates[best_index]
final_kb_ids.append(best_candidate.entity_)
if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format(
method="predict", msg="result variables not of equal length"
)
raise RuntimeError(err)
return final_kb_ids
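The blend scores = prior_probs + sims - (prior_probs * sims) acts as a probabilistic OR of the KB prior and the context similarity; a worked example with illustrative numbers:

prior, sim = 0.6, 0.5
score = prior + sim - prior * sim
print(score)  # 0.8
# For inputs in [0, 1] the blend never drops below max(prior, sim),
# since prior + sim * (1 - prior) >= prior.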
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
DOCS: https://spacy.io/api/entitylinker#set_annotations
"""
count_ents = len([ent for doc in docs for ent in doc.ents])
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0
overwrite = self.cfg["overwrite"]
for doc in docs:
for ent in doc.ents:
kb_id = kb_ids[i]
i += 1
for token in ent:
if token.ent_kb_id == 0 or overwrite:
token.ent_kb_id_ = kb_id
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
DOCS: https://spacy.io/api/entitylinker#to_bytes
"""
self._validate_serialization_attrs()
serialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serialize["kb"] = self.kb.to_bytes
serialize["model"] = self.model.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the pipe from a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (TrainablePipe): The loaded object.
DOCS: https://spacy.io/api/entitylinker#from_bytes
"""
self._validate_serialization_attrs()
def load_model(b):
try:
self.model.from_bytes(b)
except AttributeError:
raise ValueError(Errors.E149) from None
deserialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
deserialize["kb"] = lambda b: self.kb.from_bytes(b)
deserialize["model"] = load_model
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Serialize the pipe to disk.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/entitylinker#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["kb"] = lambda p: self.kb.to_disk(p)
serialize["model"] = lambda p: self.model.to_disk(p)
util.to_disk(path, serialize, exclude)
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityLinker_v1":
"""Load the pipe from disk. Modifies the object in place and returns it.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (EntityLinker): The modified EntityLinker object.
DOCS: https://spacy.io/api/entitylinker#from_disk
"""
def load_model(p):
try:
with p.open("rb") as infile:
self.model.from_bytes(infile.read())
except AttributeError:
raise ValueError(Errors.E149) from None
deserialize: Dict[str, Callable[[Any], Any]] = {}
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
deserialize["kb"] = lambda p: self.kb.from_disk(p)
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
return self
def rehearse(self, examples, *, sgd=None, losses=None, **config):
raise NotImplementedError
def add_label(self, label):
raise NotImplementedError

View File

@ -2,6 +2,7 @@ import warnings
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import srsly
from thinc.api import Model
from .. import util
@ -155,8 +156,24 @@ class Lemmatizer(Pipe):
""" """
required_tables, optional_tables = self.get_lookups_config(self.mode) required_tables, optional_tables = self.get_lookups_config(self.mode)
if lookups is None: if lookups is None:
logger.debug("Lemmatizer: loading tables from spacy-lookups-data") logger.debug(
lookups = load_lookups(lang=self.vocab.lang, tables=required_tables) "Lemmatizer: no lemmatizer lookups tables provided, "
"trying to load tables from registered lookups (usually "
"spacy-lookups-data)"
)
lookups = load_lookups(
lang=self.vocab.lang, tables=required_tables, strict=False
)
missing_tables = set(required_tables) - set(lookups.tables)
if len(missing_tables) > 0:
raise ValueError(
Errors.E4010.format(
missing_tables=list(missing_tables),
pipe_name=self.name,
required_tables=srsly.json_dumps(required_tables),
tables=srsly.json_dumps(required_tables + optional_tables),
)
)
optional_lookups = load_lookups(
lang=self.vocab.lang, tables=optional_tables, strict=False
)
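With strict=False plus the explicit missing-table check above, a missing required table now raises E4010 instead of failing silently. A sketch of supplying tables by hand instead of relying on registered lookups; the table contents are toy values and the table names assume the rule-based mode:

import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})  # toy rule: strip plural "s"
lookups.add_table("lemma_index", {"noun": {}})
lookups.add_table("lemma_exc", {"noun": {}})
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
lemmatizer.initialize(lookups=lookups)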

View File

@ -1,6 +1,6 @@
# cython: infer_types=True, binding=True
from itertools import islice
from typing import Callable, Dict, Optional, Union
from typing import Callable, Dict, Iterable, Optional, Union
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
@ -15,7 +15,7 @@ from ..parts_of_speech import IDS as POS_IDS
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .tagger import Tagger
from .tagger import ActivationsT, Tagger
# See #9050
BACKWARD_OVERWRITE = True
@ -50,8 +50,14 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, default_config={
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0}, "model": DEFAULT_MORPH_MODEL,
"overwrite": True,
"extend": False,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
"label_smoothing": 0.0,
"save_activations": False,
},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
) )
def make_morphologizer( def make_morphologizer(
@ -62,8 +68,10 @@ def make_morphologizer(
extend: bool,
label_smoothing: float,
scorer: Optional[Callable],
save_activations: bool,
):
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer,
save_activations=save_activations)
def morphologizer_score(examples, **kwargs):
@ -99,6 +107,7 @@ class Morphologizer(Tagger):
extend: bool = BACKWARD_EXTEND,
label_smoothing: float = 0.0,
scorer: Optional[Callable] = morphologizer_score,
save_activations: bool = False,
):
"""Initialize a morphologizer.
@ -106,9 +115,12 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
overwrite (bool): Whether to overwrite existing annotations.
extend (bool): Whether to extend existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/morphologizer#init
"""
@ -129,10 +141,11 @@ class Morphologizer(Tagger):
}
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
self.save_activations = save_activations
@property
def labels(self):
"""RETURNS (Tuple[str]): The labels currently added to the component."""
"""RETURNS (Iterable[str]): The labels currently added to the component."""
return tuple(self.cfg["labels_morph"].keys())
@property
@ -156,7 +169,7 @@ class Morphologizer(Tagger):
# normalize label
norm_label = self.vocab.morphology.normalize_features(label)
# extract separate POS and morph tags
label_dict = Morphology.feats_to_dict(label)
label_dict = Morphology.feats_to_dict(label, sort_values=False)
pos = label_dict.get(self.POS_FEAT, "")
if self.POS_FEAT in label_dict:
label_dict.pop(self.POS_FEAT)
@ -194,7 +207,7 @@ class Morphologizer(Tagger):
continue
morph = str(token.morph)
# create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph)
morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@ -211,7 +224,7 @@ class Morphologizer(Tagger):
for i, token in enumerate(example.reference):
pos = token.pos_
morph = str(token.morph)
morph_dict = Morphology.feats_to_dict(morph)
morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@ -222,39 +235,47 @@ class Morphologizer(Tagger):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
def set_annotations(self, docs, batch_tag_ids):
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
DOCS: https://spacy.io/api/morphologizer#set_annotations
"""
batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
cdef bint extend = self.cfg["extend"]
labels = self.labels
# We require random access for the upcoming ops, so we need
# to allocate a compatible container out of the iterable.
labels = tuple(self.labels)
for i, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
doc.activations[self.name][act_name] = acts[i]
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
morph = labels[tag_id]
morph = labels[int(tag_id)]
# set morph
if doc.c[j].morph == 0 or overwrite or extend:
if overwrite and extend:
# morphologizer morph overwrites any existing features
# while extending
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
elif extend:
# existing features are preserved and any new features
# are added
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
else:
# clobber
@ -296,7 +317,7 @@ class Morphologizer(Tagger):
label = None
# Otherwise, generate the combined label
else:
label_dict = Morphology.feats_to_dict(morph)
label_dict = Morphology.feats_to_dict(morph, sort_values=False)
if pos:
label_dict[self.POS_FEAT] = pos
label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
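A minimal sketch of the new save_activations flag from the user side (the blank pipeline is illustrative; the component still needs initialization or training before it can be called):

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("morphologizer", config={"save_activations": True})
# After initialization, each processed Doc then exposes the raw scores under
# doc.activations["morphologizer"], including the "label_ids" consumed above.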

View File

@ -1,215 +0,0 @@
# cython: infer_types=True, binding=True
from typing import Optional
import numpy
from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
from ..attrs import ID
from ..errors import Errors
from ..language import Language
from ..training import validate_examples
from .tagger import Tagger
from .trainable_pipe import TrainablePipe
default_model_config = """
[model]
@architectures = "spacy.MultiTask.v1"
maxout_pieces = 3
token_vector_width = 96
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
"""
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"nn_labeller",
default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
)
def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
return MultitaskObjective(nlp.vocab, model, name, target=target)
class MultitaskObjective(Tagger):
"""Experimental: Assist training of a parser or tagger, by training a
side-objective.
"""
def __init__(self, vocab, model, name="nn_labeller", *, target):
self.vocab = vocab
self.model = model
self.name = name
if target == "dep":
self.make_label = self.make_dep
elif target == "tag":
self.make_label = self.make_tag
elif target == "ent":
self.make_label = self.make_ent
elif target == "dep_tag_offset":
self.make_label = self.make_dep_tag_offset
elif target == "ent_tag":
self.make_label = self.make_ent_tag
elif target == "sent_start":
self.make_label = self.make_sent_start
elif hasattr(target, "__call__"):
self.make_label = target
else:
raise ValueError(Errors.E016)
cfg = {"labels": {}, "target": target}
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.setdefault("labels", {})
@labels.setter
def labels(self, value):
self.cfg["labels"] = value
def set_annotations(self, docs, dep_ids):
pass
def initialize(self, get_examples, nlp=None, labels=None):
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err)
if labels is not None:
self.labels = labels
else:
for example in get_examples():
for token in example.y:
label = self.make_label(token)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
self.model.initialize() # TODO: fix initialization by defining X and Y
def predict(self, docs):
tokvecs = self.model.get_ref("tok2vec")(docs)
scores = self.model.get_ref("softmax")(tokvecs)
return tokvecs, scores
def get_loss(self, examples, scores):
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
for i, eg in enumerate(examples):
# Handles alignment for tokenization differences
_doc_annots = eg.get_aligned() # TODO
for j in range(len(eg.predicted)):
tok_annots = {key: values[j] for key, values in _doc_annots.items()}
label = self.make_label(j, tok_annots)
if label is None or label not in self.labels:
correct[idx] = guesses[idx]
else:
correct[idx] = self.labels[label]
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
loss = (d_scores**2).sum()
return float(loss), d_scores
@staticmethod
def make_dep(token):
return token.dep_
@staticmethod
def make_tag(token):
return token.tag_
@staticmethod
def make_ent(token):
if token.ent_iob_ == "O":
return "O"
else:
return token.ent_iob_ + "-" + token.ent_type_
@staticmethod
def make_dep_tag_offset(token):
dep = token.dep_
tag = token.tag_
offset = token.head.i - token.i
offset = min(offset, 2)
offset = max(offset, -2)
return f"{dep}-{tag}:{offset}"
@staticmethod
def make_ent_tag(token):
if token.ent_iob_ == "O":
ent = "O"
else:
ent = token.ent_iob_ + "-" + token.ent_type_
tag = token.tag_
return f"{tag}-{ent}"
@staticmethod
def make_sent_start(token):
"""A multi-task objective for representing sentence boundaries,
using BILU scheme. (O is impossible)
"""
if token.is_sent_start and token.is_sent_end:
return "U-SENT"
elif token.is_sent_start:
return "B-SENT"
else:
return "I-SENT"
class ClozeMultitask(TrainablePipe):
def __init__(self, vocab, model, **cfg):
self.vocab = vocab
self.model = model
self.cfg = cfg
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
def set_annotations(self, docs, dep_ids):
pass
def initialize(self, get_examples, nlp=None):
self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.initialize(X)
def predict(self, docs):
tokvecs = self.model.get_ref("tok2vec")(docs)
vectors = self.model.get_ref("output_layer")(tokvecs)
return tokvecs, vectors
def get_loss(self, examples, vectors, prediction):
validate_examples(examples, "ClozeMultitask.get_loss")
# The simplest way to implement this would be to vstack the
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
# and look them up all at once. This prevents data copying.
ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
target = vectors[ids]
gradient = self.distance.get_grad(prediction, target)
loss = self.distance.get_loss(prediction, target)
return float(loss), gradient
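The comment in get_loss describes gathering rows by ID instead of stacking per-token vectors; a standalone sketch of the same pattern (shapes illustrative):

import numpy as np

vectors = np.random.rand(1000, 300).astype("float32")  # vocab vectors table
ids = np.array([5, 42, 42, 7])  # one row index per token, duplicates allowed
target = vectors[ids]  # shape (4, 300): one fancy-indexing gather, no Python loop
print(target.shape)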
def update(self, examples, *, drop=0., sgd=None, losses=None):
pass
def rehearse(self, examples, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
set_dropout_rate(self.model, drop)
validate_examples(examples, "ClozeMultitask.rehearse")
predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples])
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
bp_predictions(d_predictions)
if sgd is not None:
self.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
return losses
def add_label(self, label):
raise NotImplementedError

View File

@ -4,6 +4,11 @@ from typing import Callable, Optional
from thinc.api import Config, Model
from ..language import Language
from ..scorer import get_ner_prf
from ..training import remove_bilu_prefix
from ..util import registry
from ._parser_internals.ner import BiluoPushDown
from ._parser_internals.transition_system import TransitionSystem
from ..tokens.doc cimport Doc
from ._parser_internals.ner cimport BiluoPushDown
@ -245,8 +250,11 @@ cdef class EntityRecognizer(Parser):
def labels(self):
# Get the labels from the model by looking at the available moves, e.g.
# B-PERSON, I-PERSON, L-PERSON, U-PERSON
labels = set(remove_bilu_prefix(move) for move in self.move_names
if move[0] in ("B", "I", "L", "U"))
labels = set(
remove_bilu_prefix(move)
for move in self.move_names
if move[0] in ("B", "I", "L", "U")
)
return tuple(sorted(labels))
def scored_ents(self, beams):
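For clarity, a standalone restatement of what the rewritten comprehension computes (move names illustrative; remove_bilu_prefix simply drops the leading action marker):

move_names = ["B-PERSON", "I-PERSON", "L-PERSON", "U-GPE", "O"]
labels = {m.split("-", 1)[1] for m in move_names if m[0] in ("B", "I", "L", "U")}
print(tuple(sorted(labels)))  # ('GPE', 'PERSON')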

View File

@ -1,12 +1,11 @@
# cython: infer_types=True, binding=True
import warnings
from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
import srsly
from ..tokens.doc cimport Doc
from ..errors import Errors, Warnings
from ..errors import Errors
from ..language import Language
from ..training import Example
from ..util import raise_error
@ -21,13 +20,6 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe
"""
@classmethod
def __init_subclass__(cls, **kwargs):
"""Raise a warning if an inheriting class implements 'begin_training'
(from v2) instead of the new 'initialize' method (from v3)"""
if hasattr(cls, "begin_training"):
warnings.warn(Warnings.W088.format(name=cls.__name__))
def __call__(self, Doc doc) -> Doc:
"""Apply the pipe to one document. The document is modified in place,
and returned. This usually happens under the hood when the nlp object
@ -96,6 +88,10 @@ cdef class Pipe:
return self.scorer(examples, **scorer_kwargs)
return {}
@property
def is_distillable(self) -> bool:
return False
@property
def is_trainable(self) -> bool:
return False

View File

@ -10,9 +10,6 @@ from ..language import Language
from .pipe import Pipe
from .senter import senter_score
# see #9050
BACKWARD_OVERWRITE = False
@Language.factory(
"sentencizer",
@ -55,13 +52,14 @@ class Sentencizer(Pipe):
name="sentencizer", name="sentencizer",
*, *,
punct_chars=None, punct_chars=None,
overwrite=BACKWARD_OVERWRITE, overwrite=False,
scorer=senter_score, scorer=senter_score,
): ):
"""Initialize the sentencizer. """Initialize the sentencizer.
punct_chars (list): Punctuation characters to split on. Will be punct_chars (list): Punctuation characters to split on. Will be
serialized with the nlp object. serialized with the nlp object.
overwrite (bool): Whether to overwrite existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".

View File

@ -1,6 +1,6 @@
# cython: infer_types=True, binding=True
from itertools import islice
from typing import Callable, Optional
from typing import Callable, Iterable, Optional
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
@ -12,10 +12,7 @@ from ..language import Language
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .tagger import Tagger
from .tagger import ActivationsT, Tagger
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """ default_model_config = """
[model] [model]
@ -37,11 +34,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"senter", "senter",
assigns=["token.is_sent_start"], assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_config={
"model": DEFAULT_SENTER_MODEL,
"overwrite": False,
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
"save_activations": False,
},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
) )
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def make_senter(nlp: Language,
name: str,
model: Model,
overwrite: bool,
scorer: Optional[Callable],
save_activations: bool):
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
def senter_score(examples, **kwargs): def senter_score(examples, **kwargs):
@ -69,8 +76,9 @@ class SentenceRecognizer(Tagger):
model, model,
name="senter", name="senter",
*, *,
overwrite=BACKWARD_OVERWRITE, overwrite=False,
scorer=senter_score, scorer=senter_score,
save_activations: bool = False,
): ):
"""Initialize a sentence recognizer. """Initialize a sentence recognizer.
@ -78,8 +86,10 @@ class SentenceRecognizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
overwrite (bool): Whether to overwrite existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents". Scorer.score_spans for the attribute "sents".
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/sentencerecognizer#init DOCS: https://spacy.io/api/sentencerecognizer#init
""" """
@ -89,6 +99,7 @@ class SentenceRecognizer(Tagger):
self._rehearsal_model = None self._rehearsal_model = None
self.cfg = {"overwrite": overwrite} self.cfg = {"overwrite": overwrite}
self.scorer = scorer self.scorer = scorer
self.save_activations = save_activations
@property @property
def labels(self): def labels(self):
@ -106,19 +117,24 @@ class SentenceRecognizer(Tagger):
def label_data(self): def label_data(self):
return None return None
def set_annotations(self, docs, batch_tag_ids): def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores. """Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify. docs (Iterable[Doc]): The documents to modify.
batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict. activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
""" """
batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"] cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
doc.activations[self.name][act_name] = acts[i]
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
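To illustrate the save_activations option added here: once enabled on a sentence recognizer, the raw per-token probabilities and label IDs are mirrored onto the Doc. A rough usage sketch (the pipeline name and the enable call are illustrative; the pipeline must actually contain a trained senter):

import spacy

nlp = spacy.load("en_core_web_sm", enable=["senter"])
senter = nlp.get_pipe("senter")
senter.save_activations = True
doc = nlp("This is a sentence. Here is another one.")
acts = doc.activations["senter"]
# "probabilities" holds the per-token score rows, "label_ids" the argmax IDs.
print(acts["probabilities"].shape, acts["label_ids"][:5])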


@ -22,7 +22,7 @@ from ..errors import Errors, Warnings
from ..language import Language from ..language import Language
from ..matcher import Matcher, PhraseMatcher from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare from ..matcher.levenshtein import levenshtein_compare
from ..scorer import Scorer from ..scorer import Scorer, get_ner_prf
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..training import Example from ..training import Example
from ..util import SimpleFrozenList, ensure_path, registry from ..util import SimpleFrozenList, ensure_path, registry
@ -33,7 +33,7 @@ DEFAULT_SPANS_KEY = "ruler"
@Language.factory( @Language.factory(
"future_entity_ruler", "entity_ruler",
assigns=["doc.ents"], assigns=["doc.ents"],
default_config={ default_config={
"phrase_matcher_attr": None, "phrase_matcher_attr": None,
@ -79,6 +79,15 @@ def make_entity_ruler(
) )
def entity_ruler_score(examples, **kwargs):
return get_ner_prf(examples)
@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
return entity_ruler_score
@Language.factory( @Language.factory(
"span_ruler", "span_ruler",
assigns=["doc.spans"], assigns=["doc.spans"],
@ -136,7 +145,7 @@ def prioritize_new_ents_filter(
) -> List[Span]: ) -> List[Span]:
"""Merge entities and spans into one list without overlaps by allowing """Merge entities and spans into one list without overlaps by allowing
spans to overwrite any entities that they overlap with. Intended to spans to overwrite any entities that they overlap with. Intended to
replicate the overwrite_ents=True behavior from the EntityRuler. replicate the overwrite_ents=True behavior from the v3 EntityRuler.
entities (Iterable[Span]): The entities, already filtered for overlaps. entities (Iterable[Span]): The entities, already filtered for overlaps.
spans (Iterable[Span]): The spans to merge, may contain overlaps. spans (Iterable[Span]): The spans to merge, may contain overlaps.
@ -167,7 +176,7 @@ def prioritize_existing_ents_filter(
) -> List[Span]: ) -> List[Span]:
"""Merge entities and spans into one list without overlaps by prioritizing """Merge entities and spans into one list without overlaps by prioritizing
existing entities. Intended to replicate the overwrite_ents=False behavior existing entities. Intended to replicate the overwrite_ents=False behavior
from the EntityRuler. from the v3 EntityRuler.
entities (Iterable[Span]): The entities, already filtered for overlaps. entities (Iterable[Span]): The entities, already filtered for overlaps.
spans (Iterable[Span]): The spans to merge, may contain overlaps. spans (Iterable[Span]): The spans to merge, may contain overlaps.


@ -1,12 +1,23 @@
from dataclasses import dataclass from dataclasses import dataclass
from functools import partial from functools import partial
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Protocol,
Tuple,
Union,
cast,
runtime_checkable,
)
import numpy import numpy
from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
from thinc.types import Floats2d, Ints1d, Ints2d, Ragged from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
from ..compat import Protocol, runtime_checkable
from ..errors import Errors from ..errors import Errors
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import Scorer
@ -16,6 +27,9 @@ from ..util import registry
from ..vocab import Vocab from ..vocab import Vocab
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
ActivationsT = Dict[str, Union[Floats2d, Ragged]]
spancat_default_config = """ spancat_default_config = """
[model] [model]
@architectures = "spacy.SpanCategorizer.v1" @architectures = "spacy.SpanCategorizer.v1"
@ -170,6 +184,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
"model": DEFAULT_SPANCAT_MODEL, "model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"}, "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
"save_activations": False,
}, },
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
) )
@ -182,6 +197,7 @@ def make_spancat(
scorer: Optional[Callable], scorer: Optional[Callable],
threshold: float, threshold: float,
max_positive: Optional[int], max_positive: Optional[int],
save_activations: bool,
) -> "SpanCategorizer": ) -> "SpanCategorizer":
"""Create a SpanCategorizer component and configure it for multi-label """Create a SpanCategorizer component and configure it for multi-label
classification to be able to assign multiple labels for each span. classification to be able to assign multiple labels for each span.
@ -209,6 +225,7 @@ def make_spancat(
0.5. 0.5.
max_positive (Optional[int]): Maximum number of labels to consider positive max_positive (Optional[int]): Maximum number of labels to consider positive
per span. Defaults to None, indicating no limit. per span. Defaults to None, indicating no limit.
save_activations (bool): save model activations in Doc when annotating.
""" """
return SpanCategorizer( return SpanCategorizer(
nlp.vocab, nlp.vocab,
@ -222,6 +239,7 @@ def make_spancat(
threshold=threshold, threshold=threshold,
scorer=scorer, scorer=scorer,
add_negative_label=False, add_negative_label=False,
save_activations=save_activations,
) )
@ -235,6 +253,7 @@ def make_spancat(
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"}, "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
"allow_overlap": True, "allow_overlap": True,
"save_activations": False,
}, },
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
) )
@ -247,6 +266,7 @@ def make_spancat_singlelabel(
negative_weight: float, negative_weight: float,
allow_overlap: bool, allow_overlap: bool,
scorer: Optional[Callable], scorer: Optional[Callable],
save_activations: bool,
) -> "SpanCategorizer": ) -> "SpanCategorizer":
"""Create a SpanCategorizer component and configure it for multi-class """Create a SpanCategorizer component and configure it for multi-class
classification. With this configuration each span can get at most one classification. With this configuration each span can get at most one
@ -274,6 +294,7 @@ def make_spancat_singlelabel(
allow_overlap (bool): If True the data is assumed to contain overlapping spans. allow_overlap (bool): If True the data is assumed to contain overlapping spans.
Otherwise it produces non-overlapping spans greedily prioritizing Otherwise it produces non-overlapping spans greedily prioritizing
higher assigned label scores. higher assigned label scores.
save_activations (bool): save model activations in Doc when annotating.
""" """
return SpanCategorizer( return SpanCategorizer(
nlp.vocab, nlp.vocab,
@ -287,6 +308,7 @@ def make_spancat_singlelabel(
add_negative_label=True, add_negative_label=True,
threshold=None, threshold=None,
scorer=scorer, scorer=scorer,
save_activations=save_activations,
) )
@ -349,6 +371,7 @@ class SpanCategorizer(TrainablePipe):
max_positive: Optional[int] = None, max_positive: Optional[int] = None,
threshold: Optional[float] = 0.5, threshold: Optional[float] = 0.5,
scorer: Optional[Callable] = spancat_score, scorer: Optional[Callable] = spancat_score,
save_activations: bool = False,
) -> None: ) -> None:
"""Initialize the multi-label or multi-class span categorizer. """Initialize the multi-label or multi-class span categorizer.
@ -398,6 +421,7 @@ class SpanCategorizer(TrainablePipe):
self.model = model self.model = model
self.name = name self.name = name
self.scorer = scorer self.scorer = scorer
self.save_activations = save_activations
self.add_negative_label = add_negative_label self.add_negative_label = add_negative_label
if not allow_overlap and max_positive is not None and max_positive > 1: if not allow_overlap and max_positive is not None and max_positive > 1:
raise ValueError(Errors.E1051.format(max_positive=max_positive)) raise ValueError(Errors.E1051.format(max_positive=max_positive))
@ -479,7 +503,7 @@ class SpanCategorizer(TrainablePipe):
else: else:
return None return None
def predict(self, docs: Iterable[Doc]): def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them. """Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict. docs (Iterable[Doc]): The documents to predict.
@ -492,7 +516,7 @@ class SpanCategorizer(TrainablePipe):
scores = self.model.ops.alloc2f(0, 0) scores = self.model.ops.alloc2f(0, 0)
else: else:
scores = self.model.predict((docs, indices)) # type: ignore scores = self.model.predict((docs, indices)) # type: ignore
return indices, scores return {"indices": indices, "scores": scores}
def set_candidates( def set_candidates(
self, docs: Iterable[Doc], *, candidates_key: str = "candidates" self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@ -512,18 +536,27 @@ class SpanCategorizer(TrainablePipe):
for index in candidates.dataXd: for index in candidates.dataXd:
doc.spans[candidates_key].append(doc[index[0] : index[1]]) doc.spans[candidates_key].append(doc[index[0] : index[1]])
def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of Doc objects, using pre-computed scores. """Modify a batch of Doc objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify. docs (Iterable[Doc]): The documents to modify.
scores: The scores to set, produced by SpanCategorizer.predict. activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
DOCS: https://spacy.io/api/spancategorizer#set_annotations DOCS: https://spacy.io/api/spancategorizer#set_annotations
""" """
indices, scores = indices_scores indices = activations["indices"]
assert isinstance(indices, Ragged)
scores = cast(Floats2d, activations["scores"])
offset = 0 offset = 0
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
indices_i = indices[i].dataXd indices_i = cast(Ints2d, indices[i].dataXd)
if self.save_activations:
doc.activations[self.name] = {}
doc.activations[self.name]["indices"] = indices_i
doc.activations[self.name]["scores"] = scores[
offset : offset + indices.lengths[i]
]
allow_overlap = cast(bool, self.cfg["allow_overlap"]) allow_overlap = cast(bool, self.cfg["allow_overlap"])
if self.cfg["max_positive"] == 1: if self.cfg["max_positive"] == 1:
doc.spans[self.key] = self._make_span_group_singlelabel( doc.spans[self.key] = self._make_span_group_singlelabel(
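Sketch of what the spancat half of this change stores on the Doc when save_activations is enabled (the model path is a placeholder; a trained "spancat" component is assumed):

import spacy

nlp = spacy.load("./my_spancat_pipeline")
spancat = nlp.get_pipe("spancat")
spancat.save_activations = True
doc = nlp("The quick brown fox jumps over the lazy dog.")
acts = doc.activations["spancat"]
# "indices" is an (n_spans, 2) array of suggested token offsets for this doc;
# "scores" has one row of label scores per suggested span.
print(acts["indices"].shape, acts["scores"].shape)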


@ -1,9 +1,10 @@
# cython: infer_types=True, binding=True # cython: infer_types=True, binding=True
from itertools import islice from itertools import islice
from typing import Callable, Optional from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy import numpy
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
from thinc.types import Floats2d, Ints1d
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
@ -15,8 +16,7 @@ from ..training import validate_examples, validate_get_examples
from ..util import registry from ..util import registry
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
# See #9050 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
BACKWARD_OVERWRITE = False
default_model_config = """ default_model_config = """
[model] [model]
@ -38,7 +38,14 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"tagger", "tagger",
assigns=["token.tag"], assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0}, default_config={
"model": DEFAULT_TAGGER_MODEL,
"overwrite": False,
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
"neg_prefix": "!",
"label_smoothing": 0.0,
"save_activations": False,
},
default_score_weights={"tag_acc": 1.0}, default_score_weights={"tag_acc": 1.0},
) )
def make_tagger( def make_tagger(
@ -49,6 +56,7 @@ def make_tagger(
scorer: Optional[Callable], scorer: Optional[Callable],
neg_prefix: str, neg_prefix: str,
label_smoothing: float, label_smoothing: float,
save_activations: bool,
): ):
"""Construct a part-of-speech tagger component. """Construct a part-of-speech tagger component.
@ -57,7 +65,8 @@ def make_tagger(
in size, and be normalized as probabilities (all scores between 0 and 1, in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1). with the rows summing to 1).
""" """
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
label_smoothing=label_smoothing, save_activations=save_activations)
def tagger_score(examples, **kwargs): def tagger_score(examples, **kwargs):
@ -80,10 +89,11 @@ class Tagger(TrainablePipe):
model, model,
name="tagger", name="tagger",
*, *,
overwrite=BACKWARD_OVERWRITE, overwrite=False,
scorer=tagger_score, scorer=tagger_score,
neg_prefix="!", neg_prefix="!",
label_smoothing=0.0, label_smoothing=0.0,
save_activations: bool = False,
): ):
"""Initialize a part-of-speech tagger. """Initialize a part-of-speech tagger.
@ -91,8 +101,10 @@ class Tagger(TrainablePipe):
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
overwrite (bool): Whether to overwrite existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag". Scorer.score_token_attr for the attribute "tag".
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/tagger#init DOCS: https://spacy.io/api/tagger#init
""" """
@ -100,9 +112,15 @@ class Tagger(TrainablePipe):
self.model = model self.model = model
self.name = name self.name = name
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing} cfg = {
"labels": [],
"overwrite": overwrite,
"neg_prefix": neg_prefix,
"label_smoothing": label_smoothing
}
self.cfg = dict(sorted(cfg.items())) self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer self.scorer = scorer
self.save_activations = save_activations
@property @property
def labels(self): def labels(self):
@ -121,7 +139,7 @@ class Tagger(TrainablePipe):
"""Data about the labels currently added to the component.""" """Data about the labels currently added to the component."""
return tuple(self.cfg["labels"]) return tuple(self.cfg["labels"])
def predict(self, docs): def predict(self, docs) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them. """Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict. docs (Iterable[Doc]): The documents to predict.
@ -134,12 +152,12 @@ class Tagger(TrainablePipe):
n_labels = len(self.labels) n_labels = len(self.labels)
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
assert len(guesses) == len(docs) assert len(guesses) == len(docs)
return guesses return {"probabilities": guesses, "label_ids": guesses}
scores = self.model.predict(docs) scores = self.model.predict(docs)
assert len(scores) == len(docs), (len(scores), len(docs)) assert len(scores) == len(docs), (len(scores), len(docs))
guesses = self._scores2guesses(scores) guesses = self._scores2guesses(scores)
assert len(guesses) == len(docs) assert len(guesses) == len(docs)
return guesses return {"probabilities": scores, "label_ids": guesses}
def _scores2guesses(self, scores): def _scores2guesses(self, scores):
guesses = [] guesses = []
@ -150,20 +168,25 @@ class Tagger(TrainablePipe):
guesses.append(doc_guesses) guesses.append(doc_guesses)
return guesses return guesses
def set_annotations(self, docs, batch_tag_ids): def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores. """Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify. docs (Iterable[Doc]): The documents to modify.
batch_tag_ids: The IDs to set, produced by Tagger.predict. activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
DOCS: https://spacy.io/api/tagger#set_annotations DOCS: https://spacy.io/api/tagger#set_annotations
""" """
batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"] cdef bint overwrite = self.cfg["overwrite"]
labels = self.labels labels = self.labels
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
doc.activations[self.name][act_name] = acts[i]
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
@ -219,7 +242,6 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger#rehearse DOCS: https://spacy.io/api/tagger#rehearse
""" """
loss_func = SequenceCategoricalCrossentropy()
if losses is None: if losses is None:
losses = {} losses = {}
losses.setdefault(self.name, 0.0) losses.setdefault(self.name, 0.0)
@ -233,12 +255,32 @@ class Tagger(TrainablePipe):
set_dropout_rate(self.model, drop) set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update(docs) tag_scores, bp_tag_scores = self.model.begin_update(docs)
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs) tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
grads, loss = loss_func(tag_scores, tutor_tag_scores) loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
bp_tag_scores(grads) bp_tag_scores(grads)
if sgd is not None:
self.finish_update(sgd) self.finish_update(sgd)
losses[self.name] += loss losses[self.name] += loss
return losses return losses
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
"""
loss_func = SequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
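A small, self-contained sketch of the teacher-student loss introduced above, using toy per-token distributions (the arrays are made up; in the pipeline the scores come from the teacher and student models):

import numpy
from thinc.api import SequenceCategoricalCrossentropy

teacher_scores = [numpy.asarray([[0.9, 0.1], [0.2, 0.8]], dtype="f")]
student_scores = [numpy.asarray([[0.6, 0.4], [0.5, 0.5]], dtype="f")]
loss_func = SequenceCategoricalCrossentropy(normalize=False)
# The gradient pulls the student's distributions towards the teacher's.
d_scores, loss = loss_func(student_scores, teacher_scores)
print(float(loss), d_scores[0].shape)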
def get_loss(self, examples, scores): def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and """Find the loss and gradient of loss for the batch of documents and
their predicted scores. their predicted scores.
@ -250,7 +292,12 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger#get_loss DOCS: https://spacy.io/api/tagger#get_loss
""" """
validate_examples(examples, "Tagger.get_loss") validate_examples(examples, "Tagger.get_loss")
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
loss_func = SequenceCategoricalCrossentropy(
names=self.labels,
normalize=False,
neg_prefix=self.cfg["neg_prefix"],
label_smoothing=self.cfg["label_smoothing"]
)
# Convert empty tag "" to missing value None so that both misaligned # Convert empty tag "" to missing value None so that both misaligned
# tokens and tokens with missing annotation have the default missing # tokens and tokens with missing annotation have the default missing
# value None. # value None.


@ -1,5 +1,5 @@
from itertools import islice from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy import numpy
from thinc.api import Config, Model, Optimizer, get_array_module, set_dropout_rate from thinc.api import Config, Model, Optimizer, get_array_module, set_dropout_rate
@ -14,6 +14,9 @@ from ..util import registry
from ..vocab import Vocab from ..vocab import Vocab
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
ActivationsT = Dict[str, Floats2d]
single_label_default_config = """ single_label_default_config = """
[model] [model]
@architectures = "spacy.TextCatEnsemble.v2" @architectures = "spacy.TextCatEnsemble.v2"
@ -81,6 +84,7 @@ subword_features = true
"threshold": 0.0, "threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL, "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v2"}, "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
"save_activations": False,
}, },
default_score_weights={ default_score_weights={
"cats_score": 1.0, "cats_score": 1.0,
@ -101,6 +105,7 @@ def make_textcat(
model: Model[List[Doc], List[Floats2d]], model: Model[List[Doc], List[Floats2d]],
threshold: float, threshold: float,
scorer: Optional[Callable], scorer: Optional[Callable],
save_activations: bool,
) -> "TextCategorizer": ) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories """Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered over a whole document. It can learn one or more labels, and the labels are considered
@ -110,8 +115,16 @@ def make_textcat(
scores for each category. scores for each category.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method. scorer (Optional[Callable]): The scoring method.
save_activations (bool): save model activations in Doc when annotating.
""" """
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
return TextCategorizer(
nlp.vocab,
model,
name,
threshold=threshold,
scorer=scorer,
save_activations=save_activations,
)
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
@ -142,6 +155,7 @@ class TextCategorizer(TrainablePipe):
*, *,
threshold: float, threshold: float,
scorer: Optional[Callable] = textcat_score, scorer: Optional[Callable] = textcat_score,
save_activations: bool = False,
) -> None: ) -> None:
"""Initialize a text categorizer for single-label classification. """Initialize a text categorizer for single-label classification.
@ -167,6 +181,7 @@ class TextCategorizer(TrainablePipe):
} }
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.scorer = scorer self.scorer = scorer
self.save_activations = save_activations
@property @property
def support_missing_values(self): def support_missing_values(self):
@ -191,7 +206,7 @@ class TextCategorizer(TrainablePipe):
""" """
return self.labels # type: ignore[return-value] return self.labels # type: ignore[return-value]
def predict(self, docs: Iterable[Doc]): def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them. """Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict. docs (Iterable[Doc]): The documents to predict.
@ -204,12 +219,12 @@ class TextCategorizer(TrainablePipe):
tensors = [doc.tensor for doc in docs] tensors = [doc.tensor for doc in docs]
xp = self.model.ops.xp xp = self.model.ops.xp
scores = xp.zeros((len(list(docs)), len(self.labels))) scores = xp.zeros((len(list(docs)), len(self.labels)))
return scores return {"probabilities": scores}
scores = self.model.predict(docs) scores = self.model.predict(docs)
scores = self.model.ops.asarray(scores) scores = self.model.ops.asarray(scores)
return scores return {"probabilities": scores}
def set_annotations(self, docs: Iterable[Doc], scores) -> None: def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of Doc objects, using pre-computed scores. """Modify a batch of Doc objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify. docs (Iterable[Doc]): The documents to modify.
@ -217,9 +232,13 @@ class TextCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/textcategorizer#set_annotations DOCS: https://spacy.io/api/textcategorizer#set_annotations
""" """
probs = activations["probabilities"]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
doc.activations[self.name]["probabilities"] = probs[i]
for j, label in enumerate(self.labels): for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j]) doc.cats[label] = float(probs[i, j])
def update( def update(
self, self,
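To make the effect of this change concrete, a usage sketch for reading the saved textcat activations (the pipeline path is a placeholder; a trained "textcat" component is assumed):

import spacy

nlp = spacy.load("./my_textcat_pipeline")
textcat = nlp.get_pipe("textcat")
textcat.save_activations = True
doc = nlp("This movie was surprisingly good.")
# doc.cats is filled as before; the raw probabilities are now also available.
print(doc.cats)
print(doc.activations["textcat"]["probabilities"])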


@ -1,5 +1,5 @@
from itertools import islice from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional from typing import Any, Callable, Dict, Iterable, List, Optional, Union
from thinc.api import Config, Model from thinc.api import Config, Model
from thinc.types import Floats2d from thinc.types import Floats2d
@ -79,6 +79,7 @@ subword_features = true
"threshold": 0.5, "threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL, "model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"}, "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
"save_activations": False,
}, },
default_score_weights={ default_score_weights={
"cats_score": 1.0, "cats_score": 1.0,
@ -99,8 +100,9 @@ def make_multilabel_textcat(
model: Model[List[Doc], List[Floats2d]], model: Model[List[Doc], List[Floats2d]],
threshold: float, threshold: float,
scorer: Optional[Callable], scorer: Optional[Callable],
save_activations: bool,
) -> "MultiLabel_TextCategorizer": ) -> "MultiLabel_TextCategorizer":
"""Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories """Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered over a whole document. It can learn one or more labels, and the labels are considered
to be non-mutually exclusive, which means that there can be zero or more labels to be non-mutually exclusive, which means that there can be zero or more labels
per doc). per doc).
@ -111,7 +113,12 @@ def make_multilabel_textcat(
scorer (Optional[Callable]): The scoring method. scorer (Optional[Callable]): The scoring method.
""" """
return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
)
return MultiLabel_TextCategorizer(
nlp.vocab,
model,
name,
threshold=threshold,
scorer=scorer,
save_activations=save_activations,
)
@ -143,6 +150,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
*, *,
threshold: float, threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score, scorer: Optional[Callable] = textcat_multilabel_score,
save_activations: bool = False,
) -> None: ) -> None:
"""Initialize a text categorizer for multi-label classification. """Initialize a text categorizer for multi-label classification.
@ -152,6 +160,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
losses during training. losses during training.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method. scorer (Optional[Callable]): The scoring method.
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/textcategorizer#init DOCS: https://spacy.io/api/textcategorizer#init
""" """
@ -162,6 +171,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
cfg = {"labels": [], "threshold": threshold} cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.scorer = scorer self.scorer = scorer
self.save_activations = save_activations
@property @property
def support_missing_values(self): def support_missing_values(self):


@ -1,7 +1,8 @@
from itertools import islice from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
from thinc.api import Config, Model, Optimizer, set_dropout_rate from thinc.api import Config, Model, Optimizer, set_dropout_rate
from thinc.types import Floats2d
from ..errors import Errors from ..errors import Errors
from ..language import Language from ..language import Language
@ -158,39 +159,9 @@ class Tok2Vec(TrainablePipe):
DOCS: https://spacy.io/api/tok2vec#update DOCS: https://spacy.io/api/tok2vec#update
""" """
if losses is None:
losses = {}
validate_examples(examples, "Tok2Vec.update") validate_examples(examples, "Tok2Vec.update")
docs = [eg.predicted for eg in examples] docs = [eg.predicted for eg in examples]
set_dropout_rate(self.model, drop)
return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses)
tokvecs, bp_tokvecs = self.model.begin_update(docs)
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
losses.setdefault(self.name, 0.0)
def accumulate_gradient(one_d_tokvecs):
"""Accumulate tok2vec loss and gradient. This is passed as a callback
to all but the last listener. Only the last one does the backprop.
"""
nonlocal d_tokvecs
for i in range(len(one_d_tokvecs)):
d_tokvecs[i] += one_d_tokvecs[i]
losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
def backprop(one_d_tokvecs):
"""Callback to actually do the backprop. Passed to last listener."""
accumulate_gradient(one_d_tokvecs)
d_docs = bp_tokvecs(d_tokvecs)
if sgd is not None:
self.finish_update(sgd)
return d_docs
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners[:-1]:
listener.receive(batch_id, tokvecs, accumulate_gradient)
if self.listeners:
self.listeners[-1].receive(batch_id, tokvecs, backprop)
return losses
def get_loss(self, examples, scores) -> None: def get_loss(self, examples, scores) -> None:
pass pass
@ -220,6 +191,96 @@ class Tok2Vec(TrainablePipe):
def add_label(self, label): def add_label(self, label):
raise NotImplementedError raise NotImplementedError
def distill(
self,
teacher_pipe: Optional["TrainablePipe"],
examples: Iterable["Example"],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Performs an update of the student pipe's model using the
student's distillation examples and sets the annotations
of the teacher's distillation examples using the teacher pipe.
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
for prediction.
examples (Iterable[Example]): Distillation examples. The reference (teacher)
and predicted (student) docs must have the same number of tokens and the
same orthography.
drop (float): dropout rate.
sgd (Optional[Optimizer]): An optimizer. Will be created via
create_optimizer if not set.
losses (Optional[Dict[str, float]]): Optional record of loss during
distillation.
RETURNS: The updated losses dictionary.
DOCS: https://spacy.io/api/tok2vec#distill
"""
# By default we require a teacher pipe, but there are downstream
# implementations that don't require a pipe.
if teacher_pipe is None:
raise ValueError(Errors.E4002.format(name=self.name))
teacher_docs = [eg.reference for eg in examples]
student_docs = [eg.predicted for eg in examples]
teacher_preds = teacher_pipe.predict(teacher_docs)
teacher_pipe.set_annotations(teacher_docs, teacher_preds)
return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses)
def _update_with_docs(
self,
docs: Iterable[Doc],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
):
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
set_dropout_rate(self.model, drop)
tokvecs, accumulate_gradient, backprop = self._create_backprops(
docs, losses, sgd=sgd
)
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners[:-1]:
listener.receive(batch_id, tokvecs, accumulate_gradient)
if self.listeners:
self.listeners[-1].receive(batch_id, tokvecs, backprop)
return losses
def _create_backprops(
self,
docs: Iterable[Doc],
losses: Dict[str, float],
*,
sgd: Optional[Optimizer] = None,
) -> Tuple[Floats2d, Callable, Callable]:
tokvecs, bp_tokvecs = self.model.begin_update(docs)
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
def accumulate_gradient(one_d_tokvecs):
"""Accumulate tok2vec loss and gradient. This is passed as a callback
to all but the last listener. Only the last one does the backprop.
"""
nonlocal d_tokvecs
for i in range(len(one_d_tokvecs)):
d_tokvecs[i] += one_d_tokvecs[i]
losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
def backprop(one_d_tokvecs):
"""Callback to actually do the backprop. Passed to last listener."""
accumulate_gradient(one_d_tokvecs)
d_docs = bp_tokvecs(d_tokvecs)
if sgd is not None:
self.finish_update(sgd)
return d_docs
return tokvecs, accumulate_gradient, backprop
class Tok2VecListener(Model): class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection, """A layer that gets fed its answers from an upstream connection,


@ -7,3 +7,4 @@ cdef class TrainablePipe(Pipe):
cdef public object model cdef public object model
cdef public object cfg cdef public object cfg
cdef public object scorer cdef public object scorer
cdef bint _save_activations

View File

@ -9,7 +9,7 @@ from ..tokens.doc cimport Doc
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors
from ..language import Language from ..language import Language
from ..training import Example, validate_examples from ..training import Example, validate_distillation_examples, validate_examples
from ..vocab import Vocab from ..vocab import Vocab
from .pipe import Pipe, deserialize_config from .pipe import Pipe, deserialize_config
@ -55,6 +55,53 @@ cdef class TrainablePipe(Pipe):
except Exception as e: except Exception as e:
error_handler(self.name, self, [doc], e) error_handler(self.name, self, [doc], e)
def distill(self,
teacher_pipe: Optional["TrainablePipe"],
examples: Iterable["Example"],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None
) -> Dict[str, float]:
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is typically trained on the probability
distribution of the teacher, but details may differ per pipe.
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
from.
examples (Iterable[Example]): Distillation examples. The reference
(teacher) and predicted (student) docs must have the same number of
tokens and the same orthography.
drop (float): dropout rate.
sgd (Optional[Optimizer]): An optimizer. Will be created via
create_optimizer if not set.
losses (Optional[Dict[str, float]]): Optional record of loss during
distillation.
RETURNS: The updated losses dictionary.
DOCS: https://spacy.io/api/pipe#distill
"""
# By default we require a teacher pipe, but there are downstream
# implementations that don't require a pipe.
if teacher_pipe is None:
raise ValueError(Errors.E4002.format(name=self.name))
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_distillation_examples(examples, "TrainablePipe.distill")
set_dropout_rate(self.model, drop)
for node in teacher_pipe.model.walk():
if node.name == "softmax":
node.attrs["softmax_normalize"] = True
teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
bp_student_scores(d_scores)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
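A rough sketch of driving this method directly for a single component, e.g. distilling a student tagger from a teacher pipeline (paths, component names and texts are hypothetical; normally the distillation training loop orchestrates this):

import spacy
from spacy.training import Example
from thinc.api import Adam

teacher = spacy.load("./teacher_pipeline")
student = spacy.load("./student_pipeline")
texts = ["Distillation only needs raw text.", "Another example sentence."]
# Example(predicted, reference): the student annotates `predicted`,
# the teacher's predictions go onto `reference`.
examples = [Example(student.make_doc(t), teacher.make_doc(t)) for t in texts]
optimizer = Adam(learn_rate=0.001)
losses = student.get_pipe("tagger").distill(
    teacher.get_pipe("tagger"), examples, sgd=optimizer
)
print(losses)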
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under """Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are the hood when the nlp object is called on a text and all components are
@ -168,6 +215,19 @@ cdef class TrainablePipe(Pipe):
""" """
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name)) raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
def get_teacher_student_loss(self, teacher_scores, student_scores):
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
"""
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
def create_optimizer(self) -> Optimizer: def create_optimizer(self) -> Optimizer:
"""Create an optimizer for the pipeline component. """Create an optimizer for the pipeline component.
@ -204,6 +264,14 @@ cdef class TrainablePipe(Pipe):
""" """
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name)) raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
@property
def is_distillable(self) -> bool:
# Normally a pipe overrides `get_teacher_student_loss` to implement
# distillation. In more exceptional cases, a pipe can provide its
# own `distill` implementation. If neither of these methods is
# overridden, the pipe does not implement distillation.
return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
@property @property
def is_trainable(self) -> bool: def is_trainable(self) -> bool:
return True return True
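As the comment above explains, a pipe advertises distillation support by overriding either distill or get_teacher_student_loss. A minimal sketch of a custom pipe that picks up the inherited distill implementation (the class name and the loss choice are illustrative only):

from thinc.api import SequenceCategoricalCrossentropy
from spacy.pipeline import TrainablePipe

class MyTagger(TrainablePipe):
    def get_teacher_student_loss(self, teacher_scores, student_scores):
        loss_func = SequenceCategoricalCrossentropy(normalize=False)
        d_scores, loss = loss_func(student_scores, teacher_scores)
        return float(loss), d_scores

# Instances of MyTagger now report is_distillable == True, because
# get_teacher_student_loss is overridden while distill() is inherited.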
@ -342,3 +410,11 @@ cdef class TrainablePipe(Pipe):
deserialize["model"] = load_model deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self
@property
def save_activations(self):
return self._save_activations
@save_activations.setter
def save_activations(self, save_activations: bool):
self._save_activations = save_activations


@ -12,19 +12,10 @@ cdef class Parser(TrainablePipe):
cdef public object _rehearsal_model cdef public object _rehearsal_model
cdef readonly TransitionSystem moves cdef readonly TransitionSystem moves
cdef public object _multitasks cdef public object _multitasks
cdef object _cpu_ops
cdef void _parseC(
self,
CBlas cblas,
StateC** states,
WeightsC weights,
SizesC sizes
) nogil
cdef void _parseC(self, CBlas cblas, StateC** states,
WeightsC weights, SizesC sizes) nogil
cdef void c_transition_batch(
self,
StateC** states,
const float* scores,
int nr_class,
int batch_size
) nogil
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil


@ -2,6 +2,8 @@
# cython: profile=False # cython: profile=False
from __future__ import print_function from __future__ import print_function
from typing import Dict, Iterable, List, Optional, Tuple
cimport numpy as np cimport numpy as np
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -16,7 +18,18 @@ import random
import numpy import numpy
import numpy.random import numpy.random
import srsly import srsly
from thinc.api import CupyOps, NumpyOps, set_dropout_rate
from thinc.api import (
CupyOps,
NumpyOps,
Optimizer,
SequenceCategoricalCrossentropy,
chain,
get_ops,
set_dropout_rate,
softmax_activation,
use_ops,
)
from thinc.types import Floats2d
from ..ml.parser_model cimport ( from ..ml.parser_model cimport (
ActivationsC, ActivationsC,
@ -37,9 +50,22 @@ from .trainable_pipe import TrainablePipe
from ._parser_internals cimport _beam_utils from ._parser_internals cimport _beam_utils
from ._parser_internals import _beam_utils
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ._parser_internals cimport _beam_utils
from ._parser_internals.stateclass cimport StateC, StateClass
from ._parser_internals.transition_system cimport Transition
from .trainable_pipe cimport TrainablePipe
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors
from ..training import validate_examples, validate_get_examples
from ..training import (
validate_distillation_examples,
validate_examples,
validate_get_examples,
)
from ._parser_internals import _beam_utils from ._parser_internals import _beam_utils
NUMPY_OPS = NumpyOps() NUMPY_OPS = NumpyOps()
@ -135,6 +161,7 @@ cdef class Parser(TrainablePipe):
self._rehearsal_model = None self._rehearsal_model = None
self.scorer = scorer self.scorer = scorer
self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
def __getnewargs_ex__(self): def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments""" """This allows pickling the Parser and its keyword-only init arguments"""
@ -214,6 +241,121 @@ cdef class Parser(TrainablePipe):
# Defined in subclasses, to avoid circular import # Defined in subclasses, to avoid circular import
raise NotImplementedError raise NotImplementedError
def distill(self,
teacher_pipe: Optional[TrainablePipe],
examples: Iterable["Example"],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None
):
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is trained on the transition probabilities
of the teacher.
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
from.
examples (Iterable[Example]): Distillation examples. The reference
(teacher) and predicted (student) docs must have the same number of
tokens and the same orthography.
drop (float): dropout rate.
sgd (Optional[Optimizer]): An optimizer. Will be created via
create_optimizer if not set.
losses (Optional[Dict[str, float]]): Optional record of loss during
distillation.
RETURNS: The updated losses dictionary.
DOCS: https://spacy.io/api/dependencyparser#distill
"""
if teacher_pipe is None:
raise ValueError(Errors.E4002.format(name=self.name))
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_distillation_examples(examples, "TransitionParser.distill")
set_dropout_rate(self.model, drop)
student_docs = [eg.predicted for eg in examples]
teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
# Add softmax activation, so that we can compute student losses
# with cross-entropy loss.
with use_ops("numpy"):
teacher_model = chain(teacher_step_model, softmax_activation())
student_model = chain(student_step_model, softmax_activation())
max_moves = self.cfg["update_with_oracle_cut_size"]
if max_moves >= 1:
# Chop sequences into lengths of this many words, to make the
# batch uniform length. Since we do not have a gold standard
# sequence, we use the teacher's predictions as the gold
# standard.
max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
states = self._init_batch(teacher_step_model, student_docs, max_moves)
else:
states = self.moves.init_batch(student_docs)
loss = 0.0
n_moves = 0
while states:
# We do distillation as follows: (1) for every state, we compute the
# transition softmax distributions: (2) we backpropagate the error of
# the student (compared to the teacher) into the student model; (3)
# for all states, we move to the next state using the student's
# predictions.
teacher_scores = teacher_model.predict(states)
student_scores, backprop = student_model.begin_update(states)
state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
backprop(d_scores)
loss += state_loss
self.transition_states(states, student_scores)
states = [state for state in states if not state.is_final()]
# Stop when we reach the maximum number of moves, otherwise we start
# to process the remainder of cut sequences again.
if max_moves >= 1 and n_moves >= max_moves:
break
n_moves += 1
backprop_tok2vec(student_docs)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
del backprop
del backprop_tok2vec
teacher_step_model.clear_memory()
student_step_model.clear_memory()
del teacher_model
del student_model
return losses
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
"""
loss_func = SequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
def init_multitask_objectives(self, get_examples, pipeline, **cfg): def init_multitask_objectives(self, get_examples, pipeline, **cfg):
"""Setup models for secondary objectives, to benefit from multi-task """Setup models for secondary objectives, to benefit from multi-task
learning. This method is intended to be overridden by subclasses. learning. This method is intended to be overridden by subclasses.
@ -273,12 +415,7 @@ cdef class Parser(TrainablePipe):
def greedy_parse(self, docs, drop=0.): def greedy_parse(self, docs, drop=0.):
cdef vector[StateC*] states cdef vector[StateC*] states
cdef StateClass state cdef StateClass state
ops = self.model.ops
cdef CBlas cblas
if isinstance(ops, CupyOps):
cblas = NUMPY_OPS.cblas()
else:
cblas = ops.cblas()
cdef CBlas cblas = self._cpu_ops.cblas()
self._ensure_labels_are_added(docs) self._ensure_labels_are_added(docs)
set_dropout_rate(self.model, drop) set_dropout_rate(self.model, drop)
batch = self.moves.init_batch(docs) batch = self.moves.init_batch(docs)
@ -314,18 +451,16 @@ cdef class Parser(TrainablePipe):
del model del model
return list(batch) return list(batch)
cdef void _parseC(
self, CBlas cblas, StateC** states, WeightsC weights, SizesC sizes
) nogil:
cdef void _parseC(self, CBlas cblas, StateC** states,
WeightsC weights, SizesC sizes) nogil:
cdef int i cdef int i
cdef vector[StateC*] unfinished cdef vector[StateC*] unfinished
cdef ActivationsC activations = alloc_activations(sizes) cdef ActivationsC activations = alloc_activations(sizes)
while sizes.states >= 1: while sizes.states >= 1:
predict_states(cblas, &activations, states, &weights, sizes) predict_states(cblas, &activations, states, &weights, sizes)
# Validate actions, argmax, take action. # Validate actions, argmax, take action.
self.c_transition_batch(
states, activations.scores, sizes.classes, sizes.states
)
self.c_transition_batch(states, activations.scores,
sizes.classes, sizes.states)
for i in range(sizes.states): for i in range(sizes.states):
if not states[i].is_final(): if not states[i].is_final():
unfinished.push_back(states[i]) unfinished.push_back(states[i])
@ -353,13 +488,8 @@ cdef class Parser(TrainablePipe):
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0]) self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
return [state for state in states if not state.c.is_final()] return [state for state in states if not state.c.is_final()]
cdef void c_transition_batch(
self,
StateC** states,
const float* scores,
int nr_class,
int batch_size
) nogil:
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil:
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
with gil: with gil:
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@ -497,16 +627,8 @@ cdef class Parser(TrainablePipe):
del tutor del tutor
return losses return losses
def update_beam(
self,
examples,
*,
beam_width,
drop=0.,
sgd=None,
losses=None,
beam_density=0.0
):
def update_beam(self, examples, *, beam_width, drop=0., sgd=None,
losses=None, beam_density=0.0):
states, golds, _ = self.moves.init_gold_batch(examples) states, golds, _ = self.moves.init_gold_batch(examples)
if not states: if not states:
return losses return losses
@ -536,9 +658,8 @@ cdef class Parser(TrainablePipe):
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int)) is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float)) costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
cdef np.ndarray d_scores = numpy.zeros(
(len(states), self.moves.n_moves), dtype='f', order='C'
)
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
dtype='f', order='C')
c_d_scores = <float*>d_scores.data c_d_scores = <float*>d_scores.data
unseen_classes = self.model.attrs["unseen_classes"] unseen_classes = self.model.attrs["unseen_classes"]
for i, (state, gold) in enumerate(zip(states, golds)): for i, (state, gold) in enumerate(zip(states, golds)):
@ -548,9 +669,8 @@ cdef class Parser(TrainablePipe):
for j in range(self.moves.n_moves): for j in range(self.moves.n_moves):
if costs[j] <= 0.0 and j in unseen_classes: if costs[j] <= 0.0 and j in unseen_classes:
unseen_classes.remove(j) unseen_classes.remove(j)
cpu_log_loss(
c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]
)
cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0],
d_scores.shape[1])
c_d_scores += d_scores.shape[1] c_d_scores += d_scores.shape[1]
# Note that we don't normalize this. See comment in update() for why. # Note that we don't normalize this. See comment in update() for why.
if losses is not None: if losses is not None:
@@ -652,6 +772,36 @@ cdef class Parser(TrainablePipe):
             raise ValueError(Errors.E149) from None
         return self
 
+    def _init_batch(self, teacher_step_model, docs, max_length):
+        """Make a square batch of length equal to the shortest transition
+        sequence or a cap. A long
+        doc will get multiple states. Let's say we have a doc of length 2*N,
+        where N is the shortest doc. We'll make two states, one representing
+        long_doc[:N], and another representing long_doc[N:]. In contrast to
+        _init_gold_batch, this version uses a teacher model to generate the
+        cut sequences."""
+        cdef StateClass state
+        all_states = self.moves.init_batch(docs)
+        states = []
+        to_cut = []
+        for state, doc in zip(all_states, docs):
+            if not state.is_final():
+                if len(doc) < max_length:
+                    states.append(state)
+                else:
+                    to_cut.append(state)
+        while to_cut:
+            states.extend(state.copy() for state in to_cut)
+            # Move states forward max_length actions.
+            length = 0
+            while to_cut and length < max_length:
+                teacher_scores = teacher_step_model.predict(to_cut)
+                self.transition_states(to_cut, teacher_scores)
+                # States that are completed do not need further cutting.
+                to_cut = [state for state in to_cut if not state.is_final()]
+                length += 1
+        return states
+
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long
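The cutting loop in the new _init_batch can be hard to picture from the docstring alone. Below is a toy, pure-Python sketch of the idea, not the Cython code above: it uses plain (doc_index, position) tuples instead of parser states and counts positions instead of teacher-predicted transitions, so the names and numbers are illustrative only. The point it shows is that long inputs are repeatedly copied and advanced at most max_length steps, so every starting state the student trains on covers a bounded stretch of the input.

# Toy sketch of the state-cutting idea in _init_batch (illustrative only; the
# real implementation advances parser states with teacher-predicted actions).
def cut_into_states(doc_lengths, max_length):
    # Short docs become a single state; long docs go into the cutting queue.
    states = [(i, 0) for i, n in enumerate(doc_lengths) if n < max_length]
    to_cut = [(i, 0) for i, n in enumerate(doc_lengths) if n >= max_length]
    while to_cut:
        # Record the current cut points as training start states ...
        states.extend(to_cut)
        # ... then advance every queued doc by up to max_length positions.
        advanced = []
        for i, pos in to_cut:
            new_pos = min(pos + max_length, doc_lengths[i])
            if new_pos < doc_lengths[i]:  # not finished yet: cut again later
                advanced.append((i, new_pos))
        to_cut = advanced
    return states

print(cut_into_states([4, 10], max_length=5))
# [(0, 0), (1, 0), (1, 5)] -- the length-10 doc contributes two states,
# matching the docstring's long_doc[:N] / long_doc[N:] example.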


@@ -9,6 +9,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Type,
@@ -48,7 +49,6 @@ from thinc.api import ConfigValidationError, Model, Optimizer
 from thinc.config import Promise
 
 from .attrs import NAMES
-from .compat import Literal
 from .lookups import Lookups
 from .util import is_cython_func
@@ -181,7 +181,7 @@ def validate_init_settings(
 
 def validate_token_pattern(obj: list) -> List[str]:
     # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
-    get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
+    get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k
     if isinstance(obj, list):
         converted = []
         for pattern in obj:
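The change from k < len(NAMES) to k in NAMES swaps an index-bounds test for a membership test when converting integer attribute IDs to names. A minimal sketch of the resulting behaviour, assuming for illustration that NAMES maps integer attribute IDs to attribute names; the stand-in mapping and IDs below are made up and are not the real spacy.attrs.NAMES values.

# Hypothetical stand-in for spacy.attrs.NAMES; real attribute IDs are neither
# these values nor contiguous, which is why a bounds check is not a safe guard.
NAMES = {65: "ORTH", 73: "LEMMA", 74: "POS"}

get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k

print(get_key(73))       # "LEMMA" -- known integer IDs are converted to names
print(get_key("LEMMA"))  # "LEMMA" -- string keys pass through unchanged
print(get_key(999))      # 999     -- unknown IDs are left alone instead of raising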
@@ -461,6 +461,27 @@ class ConfigSchemaInit(BaseModel):
         arbitrary_types_allowed = True
 
 
+class ConfigSchemaDistillEmpty(BaseModel):
+    class Config:
+        extra = "forbid"
+
+
+class ConfigSchemaDistill(BaseModel):
+    # fmt: off
+    batcher: Batcher = Field(..., title="Batcher for the training data")
+    corpus: StrictStr = Field(..., title="Path in the config to the distillation data")
+    dropout: StrictFloat = Field(..., title="Dropout rate")
+    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for")
+    max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for")
+    optimizer: Optimizer = Field(..., title="The optimizer to use")
+    student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe")
+    # fmt: on
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
@@ -468,6 +489,7 @@ class ConfigSchema(BaseModel):
     components: Dict[str, Dict[str, Any]]
     corpora: Dict[str, Reader]
     initialize: ConfigSchemaInit
+    distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {}  # type: ignore[assignment]
 
     class Config:
         extra = "allow"
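The distillation field defaults to an empty dict, which validates against ConfigSchemaDistillEmpty, so configs without a distillation block still pass schema validation while stray keys in that block are rejected. A minimal standalone sketch of that behaviour, redefining the empty schema locally so the snippet runs on its own and mirrors the class added in the diff above.

from pydantic import BaseModel, ValidationError


class ConfigSchemaDistillEmpty(BaseModel):
    class Config:
        extra = "forbid"


ConfigSchemaDistillEmpty()  # an empty section is fine: nothing to validate

try:
    # Unknown keys are rejected rather than silently ignored, so a config that
    # names distillation settings without matching the full schema fails fast.
    ConfigSchemaDistillEmpty(dropout=0.1)
except ValidationError as err:
    print(err.errors()[0]["type"])  # e.g. "extra_forbidden" or "value_error.extra"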
@@ -479,6 +501,7 @@ CONFIG_SCHEMAS = {
     "training": ConfigSchemaTraining,
     "pretraining": ConfigSchemaPretrain,
     "initialize": ConfigSchemaInit,
+    "distillation": ConfigSchemaDistill,
 }
 
 # Recommendations for init config workflows

Some files were not shown because too many files have changed in this diff.