Merge branch 'master' into feature/etl

Richard Hudson 2022-10-21 12:46:02 +02:00 committed by GitHub
commit 34e8bc620d
62 changed files with 1900 additions and 823 deletions


@@ -10,6 +10,7 @@ steps:
inputs:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
allowUnstable: true
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"


@@ -85,6 +85,15 @@ jobs:
Python310Mac:
imageName: "macos-latest"
python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
python.version: '3.11.0-rc.2'
Python311Windows:
imageName: 'windows-latest'
python.version: '3.11.0-rc.2'
Python311Mac:
imageName: 'macos-latest'
python.version: '3.11.0-rc.2'
maxParallel: 4
pool:
vmImage: $(imageName)


@@ -15,7 +15,7 @@ pathy>=0.3.5
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities


@@ -56,7 +56,7 @@ install_requires =
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
# Official Python utilities
setuptools


@@ -30,7 +30,9 @@ MOD_NAMES = [
"spacy.lexeme",
"spacy.vocab",
"spacy.attrs",
"spacy.kb",
"spacy.kb.candidate",
"spacy.kb.kb",
"spacy.kb.kb_in_memory",
"spacy.ml.parser_model",
"spacy.morphology",
"spacy.pipeline.dep_parser",


@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.4.1"
__version__ = "3.4.2"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"


@@ -25,6 +25,7 @@ def project_update_dvc_cli(
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
@@ -36,7 +37,7 @@ def project_update_dvc_cli(
DOCS: https://spacy.io/api/cli#project-dvc
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
def project_update_dvc(
@@ -44,6 +45,7 @@ def project_update_dvc(
workflow: Optional[str] = None,
*,
verbose: bool = False,
quiet: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
@@ -54,11 +56,12 @@ def project_update_dvc(
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
quiet (bool): Print less info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, force=force
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
@@ -72,7 +75,7 @@ def update_dvc_config(
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
silent: bool = False,
quiet: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
@@ -83,7 +86,7 @@ def update_dvc_config(
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
quiet (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
@@ -105,6 +108,14 @@ def update_dvc_config(
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
# some flags that apply to every command
flags = []
if verbose:
flags.append("--verbose")
if quiet:
flags.append("--quiet")
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
@@ -118,14 +129,26 @@ def update_dvc_config(
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
if not dvc_commands:
# If we don't check for this, then there will be an error when reading the
# config, since DVC wouldn't create it.
msg.fail(
"No usable commands for DVC found. This can happen if none of your "
"commands have dependencies or outputs.",
exits=1,
)
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, flags=dvc_flags)
for c in dvc_commands:
dvc_command = "dvc " + c
run_command(dvc_command)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
@@ -133,26 +156,6 @@ def update_dvc_config(
return True
def run_dvc_commands(
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for c in commands:
command = split_command(c)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
if is_active:
dvc_command.append(flag)
run_command(dvc_command)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.
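
For a concrete sense of the reworked flags, here is a minimal sketch using the Python API (the project path my_project is a placeholder; the CLI equivalent would be `python -m spacy project dvc my_project --quiet --force`):

from pathlib import Path
from spacy.cli.project.dvc import project_update_dvc

# Regenerate dvc.yaml for the first workflow in project.yml, passing
# --quiet through to every generated `dvc run` call; force=True rewrites
# the config even if the hashes match.
project_update_dvc(Path("my_project"), workflow=None, quiet=True, force=True)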


@@ -540,6 +540,8 @@ class Errors(metaclass=ErrorsWithCodes):
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
E200 = ("Can't set {attr} from Span.")
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
E203 = ("If the {name} embedding layer is not updated "
"during training, make sure to include it in 'annotating components'")
# New errors added in v3.x
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
@@ -711,9 +713,9 @@ class Errors(metaclass=ErrorsWithCodes):
"`nlp.enable_pipe` instead.")
E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
"property or default function argument?")
E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
E928 = ("An InMemoryLookupKB can only be serialized to/from from a directory, "
"but the provided argument {loc} points to a file.")
E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
E930 = ("Received invalid get_examples callback in `{method}`. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
@@ -944,8 +946,14 @@ class Errors(metaclass=ErrorsWithCodes):
"case pass an empty list for the previously not specified argument to avoid this error.")
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
"{value}.")
E1044 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.")
E1045 = ("Invalid rich group config '{label}'.")
E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
"method in '{name}'. If you want to use this method, make "
"sure it's overwritten on the subclass.")
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
"knowledge base, use `InMemoryLookupKB`.")
E1047 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.")
E1048 = ("Invalid rich group config '{label}'.")
# Deprecated model shortcuts, only used in errors and warnings

spacy/kb/__init__.py (new file)

@@ -0,0 +1,3 @@
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch

spacy/kb/candidate.pxd (new file)

@@ -0,0 +1,12 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
from ..typedefs cimport hash_t
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
cdef readonly KnowledgeBase kb
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector
cdef hash_t alias_hash
cdef float prior_prob

spacy/kb/candidate.pyx (new file)

@@ -0,0 +1,74 @@
# cython: infer_types=True, profile=True
from typing import Iterable
from .kb cimport KnowledgeBase
from ..tokens import Span
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init
"""
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
self.entity_vector = entity_vector
self.alias_hash = alias_hash
self.prior_prob = prior_prob
@property
def entity(self) -> int:
"""RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_hash
@property
def entity_(self) -> str:
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self) -> int:
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self) -> str:
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
def entity_freq(self) -> float:
return self.entity_freq
@property
def entity_vector(self) -> Iterable[float]:
return self.entity_vector
@property
def prior_prob(self) -> float:
return self.prior_prob
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for a given mention, fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
return kb.get_candidates(mention)
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
"""
Return candidate entities for the given mentions, fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return kb.get_candidates_batch(mentions)
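
As an illustrative sketch of the API above (the entity ID, frequency, and vector values are made up), candidates can be inspected once an InMemoryLookupKB is populated:

import spacy
from spacy.kb import InMemoryLookupKB, get_candidates

nlp = spacy.blank("en")
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 0.0, 0.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])

doc = nlp("Douglas wrote books.")
for candidate in get_candidates(kb, doc[0:1]):
    # entity_ and alias_ resolve the stored hashes back to strings.
    print(candidate.entity_, candidate.alias_, candidate.prior_prob)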

spacy/kb/kb.pxd (new file)

@@ -0,0 +1,10 @@
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from libc.stdint cimport int64_t
from ..vocab cimport Vocab
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
cdef readonly int64_t entity_vector_length

spacy/kb/kb.pyx (new file)

@@ -0,0 +1,108 @@
# cython: infer_types=True, profile=True
from pathlib import Path
from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool
from .candidate import Candidate
from ..tokens import Span
from ..util import SimpleFrozenList
from ..errors import Errors
cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
This is an abstract class and requires its operations to be implemented.
DOCS: https://spacy.io/api/kb
"""
def __init__(self, vocab: Vocab, entity_vector_length: int):
"""Create a KnowledgeBase."""
# Make sure abstract KB is not instantiated.
if self.__class__ == KnowledgeBase:
raise TypeError(
Errors.E1046.format(cls_name=self.__class__.__name__)
)
self.vocab = vocab
self.entity_vector_length = entity_vector_length
self.mem = Pool()
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
"""
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned.
mentions (Iterable[Span]): Mentions for which to get candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return [self.get_candidates(span) for span in mentions]
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned.
mention (Span): Mention for which to get candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
)
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
"""
Return vectors for entities.
entities (Iterable[str]): Entity names/IDs.
RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
"""
return [self.get_vector(entity) for entity in entities]
def get_vector(self, str entity) -> Iterable[float]:
"""
Return vector for entity.
entity (str): Entity name/ID.
RETURNS (Iterable[float]): Vector for specified entity.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
)
def to_bytes(self, **kwargs) -> bytes:
"""Serialize the current state to a binary string.
RETURNS (bytes): Current state as binary string.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
)
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
"""Load state from a binary string.
bytes_data (bytes): KB state.
exclude (Tuple[str]): Properties to exclude when restoring KB.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
)
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
"""
Write KnowledgeBase content to disk.
path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
)
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
"""
Load KnowledgeBase content from disk.
path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
)
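
Because KnowledgeBase is now abstract, custom KBs subclass it and override the lookup and serialization methods; a minimal, purely illustrative subclass (not part of this commit) could look like:

from typing import Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span
from spacy.vocab import Vocab


class NullKB(KnowledgeBase):
    """Toy KB that never returns candidates, shown only to illustrate the contract."""

    def __init__(self, vocab: Vocab, entity_vector_length: int):
        super().__init__(vocab, entity_vector_length)

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        # A real implementation would look up mention.text in an index.
        return []

Methods that are not overridden (e.g. to_disk) keep raising NotImplementedError with the new E1045 message.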


@@ -1,14 +1,12 @@
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC
from ..typedefs cimport hash_t
from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec
@@ -16,21 +14,7 @@ ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
cdef readonly KnowledgeBase kb
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector
cdef hash_t alias_hash
cdef float prior_prob
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
cdef int64_t entity_vector_length
cdef class InMemoryLookupKB(KnowledgeBase):
# This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
# The PreshMap is pretty space efficient, as it uses open addressing. So


@@ -1,8 +1,7 @@
# cython: infer_types=True, profile=True
from typing import Iterator, Iterable, Callable, Dict, Any
from typing import Iterable, Callable, Dict, Any, Union
import srsly
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
@@ -12,85 +11,28 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings
from .typedefs cimport hash_t
from .errors import Errors, Warnings
from . import util
from .util import SimpleFrozenList, ensure_path
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned to a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate_init
"""
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
self.entity_vector = entity_vector
self.alias_hash = alias_hash
self.prior_prob = prior_prob
@property
def entity(self):
"""RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_hash
@property
def entity_(self):
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self):
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self):
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
def entity_freq(self):
return self.entity_freq
@property
def entity_vector(self):
return self.entity_vector
@property
def prior_prob(self):
return self.prior_prob
from ..tokens import Span
from ..typedefs cimport hash_t
from ..errors import Errors, Warnings
from .. import util
from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate
def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
"""
Return candidate entities for a given span by using the text of the span as the alias
and fetching appropriate entries from the index.
This particular function is optimized to work with the built-in KB functionality,
but any other custom candidate generation method can be used in combination with the KB as well.
"""
return kb.get_alias_candidates(span.text)
cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
DOCS: https://spacy.io/api/kb
DOCS: https://spacy.io/api/kb_in_memory
"""
def __init__(self, Vocab vocab, entity_vector_length):
"""Create a KnowledgeBase."""
self.mem = Pool()
self.entity_vector_length = entity_vector_length
"""Create an InMemoryLookupKB."""
super().__init__(vocab, entity_vector_length)
self._entry_index = PreshMap()
self._alias_index = PreshMap()
self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
def _initialize_entities(self, int64_t nr_entities):
@@ -104,11 +46,6 @@ cdef class KnowledgeBase:
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
@property
def entity_vector_length(self):
"""RETURNS (uint64): length of the entity vectors"""
return self.entity_vector_length
def __len__(self):
return self.get_size_entities()
@@ -286,7 +223,10 @@ cdef class KnowledgeBase:
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
return self.get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.


@@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
# use lookups, and fall back to the token itself
if not forms:
forms.append(string)
forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms


@@ -280,7 +280,7 @@ _currency = (
_punct = (
r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 · । ، ۔ ؛ ٪"
)
_quotes = r'\' " ” “ ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉'
_quotes = r'\' " ” “ ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
_hyphens = "- — -- --- —— ~"
# Various symbols like dingbats, but also emoji


@@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
rules = rules_table.get(univ_pos, [])
string = string.lower()
forms = []
# first try lookup in table based on upos
if string in index:
forms.append(string)
self.cache[cache_key] = forms
return forms
# then add anything in the exceptions table
forms.extend(exceptions.get(string, []))
# if nothing found yet, use the rules
oov_forms = []
if not forms:
for old, new in rules:
@@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
forms.append(form)
else:
oov_forms.append(form)
# if still nothing, add the oov forms from rules
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
# use lookups, which fall back to the token itself
if not forms:
forms.append(string)
forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms


@@ -1,11 +1,15 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
class AncientGreekDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS


@@ -0,0 +1,46 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from ..char_classes import CONCAT_QUOTES
_prefixes = (
[
"",
"",
]
+ LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_CURRENCY
+ LIST_ICONS
)
_suffixes = (
LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ [
"",
"",
r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
]
)
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",
]
)
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes


@@ -28,7 +28,7 @@ class Russian(Language):
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "pymorphy2",
"mode": "pymorphy3",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},


@@ -19,11 +19,11 @@ class RussianLemmatizer(Lemmatizer):
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
mode: str = "pymorphy3",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
@@ -33,6 +33,16 @@ class RussianLemmatizer(Lemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
elif mode == "pymorphy3":
try:
from pymorphy3 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Russian lemmatizer mode 'pymorphy3' requires the "
"pymorphy3 library. Install it with: pip install pymorphy3"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
@@ -104,6 +114,9 @@ class RussianLemmatizer(Lemmatizer):
return [analyses[0].normal_form]
return [string]
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
return self.pymorphy2_lemmatize(token)
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
gram_map = {
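
To illustrate the switch, a pipeline can pin the lemmatizer mode explicitly; a minimal sketch, assuming the pymorphy3 package (with its Russian dictionaries) is installed:

import spacy

nlp = spacy.blank("ru")
# "pymorphy3" is the new default; pass "pymorphy2" to keep the old behaviour.
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})
nlp.initialize()
print([token.lemma_ for token in nlp("кошки спят")])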


@@ -1,9 +1,17 @@
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults
class SlovenianDefaults(BaseDefaults):
stop_words = STOP_WORDS
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
class Slovenian(Language):

spacy/lang/sl/lex_attrs.py (new file)

@@ -0,0 +1,145 @@
from ...attrs import LIKE_NUM
from ...attrs import IS_CURRENCY
import unicodedata
_num_words = set(
"""
nula ničla nič ena dva tri štiri pet šest sedem osem
devet deset enajst dvanajst trinajst štirinajst petnajst
šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset
petdeset šestdest sedemdeset osemdeset devedeset sto tisoč
milijon bilijon trilijon kvadrilijon nešteto
en eden enega enemu ennem enim enih enima enimi ene eni eno
dveh dvema dvem dvoje trije treh trem tremi troje štirje štirih štirim štirimi
petih petim petimi šestih šestim šestimi sedmih sedmim sedmimi osmih osmim osmimi
devetih devetim devetimi desetih desetim desetimi enajstih enajstim enajstimi
dvanajstih dvanajstim dvanajstimi trinajstih trinajstim trinajstimi
šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi
sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi
devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi
""".split()
)
_ordinal_words = set(
"""
prvi drugi tretji četrti peti šesti sedmi osmi
deveti deseti enajsti dvanajsti trinajsti štirinajsti
petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti
dvajseti trideseti štirideseti petdeseti šestdeseti sedemdeseti
osemdeseti devetdeseti stoti tisoči milijonti bilijonti
trilijonti kvadrilijonti nešteti
prva druga tretja četrta peta šesta sedma osma
deveta deseta enajsta dvanajsta trinajsta štirnajsta
petnajsta šestnajsta sedemnajsta osemnajsta devetnajsta
dvajseta trideseta štirideseta petdeseta šestdeseta sedemdeseta
osemdeseta devetdeseta stota tisoča milijonta bilijonta
trilijonta kvadrilijonta nešteta
prvo drugo tretje četrto peto šestro sedmo osmo
deveto deseto enajsto dvanajsto trinajsto štirnajsto
petnajsto šestnajsto sedemnajsto osemnajsto devetnajsto
dvajseto trideseto štirideseto petdeseto šestdeseto sedemdeseto
osemdeseto devetdeseto stoto tisočo milijonto bilijonto
trilijonto kvadrilijonto nešteto
prvega drugega tretjega četrtega petega šestega sedmega osmega
devega desetega enajstega dvanajstega trinajstega štirnajstega
petnajstega šestnajstega sedemnajstega osemnajstega devetnajstega
dvajsetega tridesetega štiridesetega petdesetega šestdesetega sedemdesetega
osemdesetega devetdesetega stotega tisočega milijontega bilijontega
trilijontega kvadrilijontega neštetega
prvemu drugemu tretjemu četrtemu petemu šestemu sedmemu osmemu devetemu desetemu
enajstemu dvanajstemu trinajstemu štirnajstemu petnajstemu šestnajstemu sedemnajstemu
osemnajstemu devetnajstemu dvajsetemu tridesetemu štiridesetemu petdesetemu šestdesetemu
sedemdesetemu osemdesetemu devetdesetemu stotemu tisočemu milijontemu bilijontemu
trilijontemu kvadrilijontemu neštetemu
prvem drugem tretjem četrtem petem šestem sedmem osmem devetem desetem
enajstem dvanajstem trinajstem štirnajstem petnajstem šestnajstem sedemnajstem
osemnajstem devetnajstem dvajsetem tridesetem štiridesetem petdesetem šestdesetem
sedemdesetem osemdesetem devetdesetem stotem tisočem milijontem bilijontem
trilijontem kvadrilijontem neštetem
prvim drugim tretjim četrtim petim šestim sedtim osmim devetim desetim
enajstim dvanajstim trinajstim štirnajstim petnajstim šestnajstim sedemnajstim
osemnajstim devetnajstim dvajsetim tridesetim štiridesetim petdesetim šestdesetim
sedemdesetim osemdesetim devetdesetim stotim tisočim milijontim bilijontim
trilijontim kvadrilijontim neštetim
prvih drugih tretjih četrthih petih šestih sedmih osmih deveth desetih
enajstih dvanajstih trinajstih štirnajstih petnajstih šestnajstih sedemnajstih
osemnajstih devetnajstih dvajsetih tridesetih štiridesetih petdesetih šestdesetih
sedemdesetih osemdesetih devetdesetih stotih tisočih milijontih bilijontih
trilijontih kvadrilijontih nešteth
prvima drugima tretjima četrtima petima šestima sedmima osmima devetima desetima
enajstima dvanajstima trinajstima štirnajstima petnajstima šestnajstima sedemnajstima
osemnajstima devetnajstima dvajsetima tridesetima štiridesetima petdesetima šestdesetima
sedemdesetima osemdesetima devetdesetima stotima tisočima milijontima bilijontima
trilijontima kvadrilijontima neštetima
prve druge četrte pete šeste sedme osme devete desete
enajste dvanajste trinajste štirnajste petnajste šestnajste sedemnajste
osemnajste devetnajste dvajsete tridesete štiridesete petdesete šestdesete
sedemdesete osemdesete devetdesete stote tisoče milijonte bilijonte
trilijonte kvadrilijonte neštete
prvimi drugimi tretjimi četrtimi petimi šestimi sedtimi osmimi devetimi desetimi
enajstimi dvanajstimi trinajstimi štirnajstimi petnajstimi šestnajstimi sedemnajstimi
osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi
sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi
trilijontimi kvadrilijontimi neštetimi
""".split()
)
_currency_words = set(
"""
evro evra evru evrom evrov evroma evrih evrom evre evri evr eur
cent centa centu cenom centov centoma centih centom cente centi
dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd
tolar tolarja tolarji tolarju tolarjem tolarjev tolarjema tolarjih tolarje tol
dinar dinarja dinarji dinarju dinarjem dinarjev dinarjema dinarjih dinarje din
funt funta funti funtu funtom funtov funtoma funtih funte gpb
forint forinta forinti forintu forintom forintov forintoma forintih forinte
zlot zlota zloti zlotu zlotom zlotov zlotoma zlotih zlote
rupij rupija rupiji rupiju rupijem rupijev rupijema rupijih rupije
jen jena jeni jenu jenom jenov jenoma jenih jene
kuna kuni kune kuno kun kunama kunah kunam kunami
marka marki marke markama markah markami
""".split()
)
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
if text_lower in _ordinal_words:
return True
return False
def is_currency(text):
text_lower = text.lower()
if text in _currency_words:
return True
for char in text:
if unicodedata.category(char) != "Sc":
return False
return True
LEX_ATTRS = {LIKE_NUM: like_num, IS_CURRENCY: is_currency}
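
A quick illustration of the two getters defined above, with inputs drawn from the word lists:

from spacy.lang.sl.lex_attrs import like_num, is_currency

print(like_num("petnajst"))  # True: listed in _num_words
print(like_num("3/4"))       # True: a simple fraction of digits
print(is_currency("€"))      # True: Unicode currency symbol (category Sc)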


@@ -0,0 +1,84 @@
from ..char_classes import (
LIST_ELLIPSES,
LIST_ICONS,
HYPHENS,
LIST_PUNCT,
LIST_QUOTES,
CURRENCY,
UNITS,
PUNCT,
LIST_CURRENCY,
CONCAT_QUOTES,
)
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
from ..char_classes import merge_chars
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
INCLUDE_SPECIAL = ["\\+", "\\/", "\\•", "\\¯", "\\=", "\\×"] + HYPHENS.split("|")
_prefixes = INCLUDE_SPECIAL + BASE_TOKENIZER_PREFIXES
_suffixes = (
INCLUDE_SPECIAL
+ LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ [
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{p}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
# split initials like J.K. Rowling
r"(?<=[A-Z]\.)(?:[A-Z].)",
]
)
# a list of all suffixes following a hyphen that shouldn't be split (e.g. BTC-jev)
# source: Obeliks tokenizer - https://github.com/clarinsi/obeliks/blob/master/obeliks/res/TokRulesPart1.txt
CONCAT_QUOTES = CONCAT_QUOTES.replace("'", "")
HYPHENS_PERMITTED = (
"((a)|(evemu)|(evskega)|(i)|(jevega)|(jevska)|(jevskimi)|(jinemu)|(oma)|(ovim)|"
"(ovski)|(e)|(evi)|(evskem)|(ih)|(jevem)|(jevske)|(jevsko)|(jini)|(ov)|(ovima)|"
"(ovskih)|(em)|(evih)|(evskemu)|(ja)|(jevemu)|(jevskega)|(ji)|(jinih)|(ova)|"
"(ovimi)|(ovskim)|(ema)|(evim)|(evski)|(je)|(jevi)|(jevskem)|(jih)|(jinim)|"
"(ove)|(ovo)|(ovskima)|(ev)|(evima)|(evskih)|(jem)|(jevih)|(jevskemu)|(jin)|"
"(jinima)|(ovega)|(ovska)|(ovskimi)|(eva)|(evimi)|(evskim)|(jema)|(jevim)|"
"(jevski)|(jina)|(jinimi)|(ovem)|(ovske)|(ovsko)|(eve)|(evo)|(evskima)|(jev)|"
"(jevima)|(jevskih)|(jine)|(jino)|(ovemu)|(ovskega)|(u)|(evega)|(evska)|"
"(evskimi)|(jeva)|(jevimi)|(jevskim)|(jinega)|(ju)|(ovi)|(ovskem)|(evem)|"
"(evske)|(evsko)|(jeve)|(jevo)|(jevskima)|(jinem)|(om)|(ovih)|(ovskemu)|"
"(ovec)|(ovca)|(ovcu)|(ovcem)|(ovcev)|(ovcema)|(ovcih)|(ovci)|(ovce)|(ovcimi)|"
"(evec)|(evca)|(evcu)|(evcem)|(evcev)|(evcema)|(evcih)|(evci)|(evce)|(evcimi)|"
"(jevec)|(jevca)|(jevcu)|(jevcem)|(jevcev)|(jevcema)|(jevcih)|(jevci)|(jevce)|"
"(jevcimi)|(ovka)|(ovke)|(ovki)|(ovko)|(ovk)|(ovkama)|(ovkah)|(ovkam)|(ovkami)|"
"(evka)|(evke)|(evki)|(evko)|(evk)|(evkama)|(evkah)|(evkam)|(evkami)|(jevka)|"
"(jevke)|(jevki)|(jevko)|(jevk)|(jevkama)|(jevkah)|(jevkam)|(jevkami)|(timi)|"
"(im)|(ima)|(a)|(imi)|(e)|(o)|(ega)|(ti)|(em)|(tih)|(emu)|(tim)|(i)|(tima)|"
"(ih)|(ta)|(te)|(to)|(tega)|(tem)|(temu))"
)
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?!{hp}$)(?=[{a}])".format(
a=ALPHA, h=HYPHENS, hp=HYPHENS_PERMITTED
),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes


@@ -1,326 +1,84 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
# Removed various words that are not normally considered stop words, such as months.
STOP_WORDS = set(
"""
a
ali
b
bi
bil
bila
bile
bili
bilo
biti
blizu
bo
bodo
bolj
bom
bomo
boste
bova
boš
brez
c
cel
cela
celi
celo
d
da
daleč
dan
danes
do
dober
dobra
dobri
dobro
dokler
dol
dovolj
e
eden
en
ena
ene
eni
enkrat
eno
etc.
a ali
b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo
boste bova boš brez
c cel cela celi celo
č če često četrta četrtek četrti četrto čez čigav
d da daleč dan danes datum deset deseta deseti deseto devet
deveta deveti deveto do dober dobra dobri dobro dokler dol dolg
dolga dolgi dovolj drug druga drugi drugo dva dve
e eden en ena ene eni enkrat eno etc.
f
g
g.
ga
ga.
gor
gospa
gospod
h
halo
i
idr.
ii
iii
in
iv
ix
iz
j
jaz
je
ji
jih
jim
jo
k
kadarkoli
kaj
kajti
kako
kakor
kamor
kamorkoli
kar
karkoli
katerikoli
kdaj
kdo
kdorkoli
ker
ki
kje
kjer
kjerkoli
ko
koderkoli
koga
komu
kot
l
le
lep
lepa
lepe
lepi
lepo
m
manj
me
med
medtem
mene
mi
midva
midve
mnogo
moj
moja
moje
mora
morajo
moram
moramo
morate
moraš
morem
mu
n
na
nad
naj
najina
najino
najmanj
naju
največ
nam
nas
nato
nazaj
naš
naša
naše
ne
nedavno
nek
neka
nekaj
nekatere
nekateri
nekatero
nekdo
neke
nekega
neki
nekje
neko
nekoga
nekoč
ni
nikamor
nikdar
nikjer
nikoli
nič
nje
njega
njegov
njegova
njegovo
njej
njemu
njen
njena
njeno
nji
njih
njihov
njihova
njihovo
njiju
njim
njo
njun
njuna
njuno
no
nocoj
npr.
o
ob
oba
obe
oboje
od
okoli
on
onadva
one
oni
onidve
oz.
p
pa
po
pod
pogosto
poleg
ponavadi
ponovno
potem
povsod
prbl.
precej
pred
prej
preko
pri
pribl.
približno
proti
r
redko
res
s
saj
sam
sama
same
sami
samo
se
sebe
sebi
sedaj
sem
seveda
si
sicer
skoraj
skozi
smo
so
spet
sta
ste
sva
t
ta
tak
taka
take
taki
tako
takoj
tam
te
tebe
tebi
tega
ti
tista
tiste
tisti
tisto
tj.
tja
to
toda
tu
tudi
tukaj
tvoj
tvoja
tvoje
g g. ga ga. gor gospa gospod
h halo
i idr. ii iii in iv ix iz
j jaz je ji jih jim jo jutri
k kadarkoli kaj kajti kako kakor kamor kamorkoli kar karkoli
katerikoli kdaj kdo kdorkoli ker ki kje kjer kjerkoli
ko koder koderkoli koga komu kot kratek kratka kratke kratki
l lahka lahke lahki lahko le lep lepa lepe lepi lepo leto
m majhen majhna majhni malce malo manj me med medtem mene
mesec mi midva midve mnogo moj moja moje mora morajo moram
moramo morate moraš morem mu
n na nad naj najina najino najmanj naju največ nam narobe
nas nato nazaj naš naša naše ne nedavno nedelja nek neka
nekaj nekatere nekateri nekatero nekdo neke nekega neki
nekje neko nekoga nekoč ni nikamor nikdar nikjer nikoli
nič nje njega njegov njegova njegovo njej njemu njen
njena njeno nji njih njihov njihova njihovo njiju njim
njo njun njuna njuno no nocoj npr.
o ob oba obe oboje od odprt odprta odprti okoli on
onadva one oni onidve osem osma osmi osmo oz.
p pa pet peta petek peti peto po pod pogosto poleg poln
polna polni polno ponavadi ponedeljek ponovno potem
povsod pozdravljen pozdravljeni prav prava prave pravi
pravo prazen prazna prazno prbl. precej pred prej preko
pri pribl. približno primer pripravljen pripravljena
pripravljeni proti prva prvi prvo
r ravno redko res reč
s saj sam sama same sami samo se sebe sebi sedaj sedem
sedma sedmi sedmo sem seveda si sicer skoraj skozi slab sm
so sobota spet sreda srednja srednji sta ste stran stvar sva
š šest šesta šesti šesto štiri
t ta tak taka take taki tako takoj tam te tebe tebi tega
težak težka težki težko ti tista tiste tisti tisto tj.
tja to toda torek tretja tretje tretji tri tu tudi tukaj
tvoj tvoja tvoje
u
v
vaju
vam
vas
vaš
vaša
vaše
ve
vedno
vendar
ves
več
vi
vidva
vii
viii
vsa
vsaj
vsak
vsaka
vsakdo
vsake
vsaki
vsakomur
vse
vsega
vsi
vso
včasih
v vaju vam vas vaš vaša vaše ve vedno velik velika veliki
veliko vendar ves več vi vidva vii viii visok visoka visoke
visoki vsa vsaj vsak vsaka vsakdo vsake vsaki vsakomur vse
vsega vsi vso včasih včeraj
x
z
za
zadaj
zadnji
zakaj
zdaj
zelo
zunaj
č
če
često
čez
čigav
š
ž
že
z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj
ž že
""".split()
)


@@ -0,0 +1,272 @@
from typing import Dict, List
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc: Dict[str, List[Dict]] = {}
_other_exc = {
"t.i.": [{ORTH: "t.", NORM: "tako"}, {ORTH: "i.", NORM: "imenovano"}],
"t.j.": [{ORTH: "t.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
"T.j.": [{ORTH: "T.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
"d.o.o.": [
{ORTH: "d.", NORM: "družba"},
{ORTH: "o.", NORM: "omejeno"},
{ORTH: "o.", NORM: "odgovornostjo"},
],
"D.O.O.": [
{ORTH: "D.", NORM: "družba"},
{ORTH: "O.", NORM: "omejeno"},
{ORTH: "O.", NORM: "odgovornostjo"},
],
"d.n.o.": [
{ORTH: "d.", NORM: "družba"},
{ORTH: "n.", NORM: "neomejeno"},
{ORTH: "o.", NORM: "odgovornostjo"},
],
"D.N.O.": [
{ORTH: "D.", NORM: "družba"},
{ORTH: "N.", NORM: "neomejeno"},
{ORTH: "O.", NORM: "odgovornostjo"},
],
"d.d.": [{ORTH: "d.", NORM: "delniška"}, {ORTH: "d.", NORM: "družba"}],
"D.D.": [{ORTH: "D.", NORM: "delniška"}, {ORTH: "D.", NORM: "družba"}],
"s.p.": [{ORTH: "s.", NORM: "samostojni"}, {ORTH: "p.", NORM: "podjetnik"}],
"S.P.": [{ORTH: "S.", NORM: "samostojni"}, {ORTH: "P.", NORM: "podjetnik"}],
"l.r.": [{ORTH: "l.", NORM: "lastno"}, {ORTH: "r.", NORM: "ročno"}],
"le-te": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "te"}],
"Le-te": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "te"}],
"le-ti": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ti"}],
"Le-ti": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ti"}],
"le-to": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "to"}],
"Le-to": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "to"}],
"le-ta": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ta"}],
"Le-ta": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ta"}],
"le-tega": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "tega"}],
"Le-tega": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "tega"}],
}
_exc.update(_other_exc)
for exc_data in [
{ORTH: "adm.", NORM: "administracija"},
{ORTH: "aer.", NORM: "aeronavtika"},
{ORTH: "agr.", NORM: "agronomija"},
{ORTH: "amer.", NORM: "ameriško"},
{ORTH: "anat.", NORM: "anatomija"},
{ORTH: "angl.", NORM: "angleški"},
{ORTH: "ant.", NORM: "antonim"},
{ORTH: "antr.", NORM: "antropologija"},
{ORTH: "apr.", NORM: "april"},
{ORTH: "arab.", NORM: "arabsko"},
{ORTH: "arheol.", NORM: "arheologija"},
{ORTH: "arhit.", NORM: "arhitektura"},
{ORTH: "avg.", NORM: "avgust"},
{ORTH: "avstr.", NORM: "avstrijsko"},
{ORTH: "avt.", NORM: "avtomobilizem"},
{ORTH: "bibl.", NORM: "biblijsko"},
{ORTH: "biokem.", NORM: "biokemija"},
{ORTH: "biol.", NORM: "biologija"},
{ORTH: "bolg.", NORM: "bolgarski"},
{ORTH: "bot.", NORM: "botanika"},
{ORTH: "cit.", NORM: "citat"},
{ORTH: "daj.", NORM: "dajalnik"},
{ORTH: "del.", NORM: "deležnik"},
{ORTH: "ed.", NORM: "ednina"},
{ORTH: "etn.", NORM: "etnografija"},
{ORTH: "farm.", NORM: "farmacija"},
{ORTH: "filat.", NORM: "filatelija"},
{ORTH: "filoz.", NORM: "filozofija"},
{ORTH: "fin.", NORM: "finančništvo"},
{ORTH: "fiz.", NORM: "fizika"},
{ORTH: "fot.", NORM: "fotografija"},
{ORTH: "fr.", NORM: "francoski"},
{ORTH: "friz.", NORM: "frizerstvo"},
{ORTH: "gastr.", NORM: "gastronomija"},
{ORTH: "geogr.", NORM: "geografija"},
{ORTH: "geol.", NORM: "geologija"},
{ORTH: "geom.", NORM: "geometrija"},
{ORTH: "germ.", NORM: "germanski"},
{ORTH: "gl.", NORM: "glej"},
{ORTH: "glag.", NORM: "glagolski"},
{ORTH: "glasb.", NORM: "glasba"},
{ORTH: "gled.", NORM: "gledališče"},
{ORTH: "gost.", NORM: "gostinstvo"},
{ORTH: "gozd.", NORM: "gozdarstvo"},
{ORTH: "gr.", NORM: "grški"},
{ORTH: "grad.", NORM: "gradbeništvo"},
{ORTH: "hebr.", NORM: "hebrejsko"},
{ORTH: "hrv.", NORM: "hrvaško"},
{ORTH: "ide.", NORM: "indoevropsko"},
{ORTH: "igr.", NORM: "igre"},
{ORTH: "im.", NORM: "imenovalnik"},
{ORTH: "iron.", NORM: "ironično"},
{ORTH: "it.", NORM: "italijanski"},
{ORTH: "itd.", NORM: "in tako dalje"},
{ORTH: "itn.", NORM: "in tako naprej"},
{ORTH: "ipd.", NORM: "in podobno"},
{ORTH: "jap.", NORM: "japonsko"},
{ORTH: "jul.", NORM: "julij"},
{ORTH: "jun.", NORM: "junij"},
{ORTH: "kit.", NORM: "kitajsko"},
{ORTH: "knj.", NORM: "knjižno"},
{ORTH: "knjiž.", NORM: "knjižno"},
{ORTH: "kor.", NORM: "koreografija"},
{ORTH: "lat.", NORM: "latinski"},
{ORTH: "les.", NORM: "lesna stroka"},
{ORTH: "lingv.", NORM: "lingvistika"},
{ORTH: "lit.", NORM: "literarni"},
{ORTH: "ljubk.", NORM: "ljubkovalno"},
{ORTH: "lov.", NORM: "lovstvo"},
{ORTH: "m.", NORM: "moški"},
{ORTH: "mak.", NORM: "makedonski"},
{ORTH: "mar.", NORM: "marec"},
{ORTH: "mat.", NORM: "matematika"},
{ORTH: "med.", NORM: "medicina"},
{ORTH: "meh.", NORM: "mehiško"},
{ORTH: "mest.", NORM: "mestnik"},
{ORTH: "mdr.", NORM: "med drugim"},
{ORTH: "min.", NORM: "mineralogija"},
{ORTH: "mitol.", NORM: "mitologija"},
{ORTH: "mn.", NORM: "množina"},
{ORTH: "mont.", NORM: "montanistika"},
{ORTH: "muz.", NORM: "muzikologija"},
{ORTH: "nam.", NORM: "namenilnik"},
{ORTH: "nar.", NORM: "narečno"},
{ORTH: "nav.", NORM: "navadno"},
{ORTH: "nedol.", NORM: "nedoločnik"},
{ORTH: "nedov.", NORM: "nedovršni"},
{ORTH: "neprav.", NORM: "nepravilno"},
{ORTH: "nepreh.", NORM: "neprehodno"},
{ORTH: "neskl.", NORM: "nesklonljiv(o)"},
{ORTH: "nestrok.", NORM: "nestrokovno"},
{ORTH: "num.", NORM: "numizmatika"},
{ORTH: "npr.", NORM: "na primer"},
{ORTH: "obrt.", NORM: "obrtništvo"},
{ORTH: "okt.", NORM: "oktober"},
{ORTH: "or.", NORM: "orodnik"},
{ORTH: "os.", NORM: "oseba"},
{ORTH: "otr.", NORM: "otroško"},
{ORTH: "oz.", NORM: "oziroma"},
{ORTH: "pal.", NORM: "paleontologija"},
{ORTH: "papir.", NORM: "papirništvo"},
{ORTH: "ped.", NORM: "pedagogika"},
{ORTH: "pisar.", NORM: "pisarniško"},
{ORTH: "pog.", NORM: "pogovorno"},
{ORTH: "polit.", NORM: "politika"},
{ORTH: "polj.", NORM: "poljsko"},
{ORTH: "poljud.", NORM: "poljudno"},
{ORTH: "preg.", NORM: "pregovor"},
{ORTH: "preh.", NORM: "prehodno"},
{ORTH: "pren.", NORM: "preneseno"},
{ORTH: "prid.", NORM: "pridevnik"},
{ORTH: "prim.", NORM: "primerjaj"},
{ORTH: "prisl.", NORM: "prislov"},
{ORTH: "psih.", NORM: "psihologija"},
{ORTH: "psiht.", NORM: "psihiatrija"},
{ORTH: "rad.", NORM: "radiotehnika"},
{ORTH: "rač.", NORM: "računalništvo"},
{ORTH: "rib.", NORM: "ribištvo"},
{ORTH: "rod.", NORM: "rodilnik"},
{ORTH: "rus.", NORM: "rusko"},
{ORTH: "s.", NORM: "srednji"},
{ORTH: "sam.", NORM: "samostalniški"},
{ORTH: "sed.", NORM: "sedanjik"},
{ORTH: "sep.", NORM: "september"},
{ORTH: "slabš.", NORM: "slabšalno"},
{ORTH: "slovan.", NORM: "slovansko"},
{ORTH: "slovaš.", NORM: "slovaško"},
{ORTH: "srb.", NORM: "srbsko"},
{ORTH: "star.", NORM: "starinsko"},
{ORTH: "stil.", NORM: "stilno"},
{ORTH: "sv.", NORM: "svet(i)"},
{ORTH: "teh.", NORM: "tehnika"},
{ORTH: "tisk.", NORM: "tiskarstvo"},
{ORTH: "tj.", NORM: "to je"},
{ORTH: "tož.", NORM: "tožilnik"},
{ORTH: "trg.", NORM: "trgovina"},
{ORTH: "ukr.", NORM: "ukrajinski"},
{ORTH: "um.", NORM: "umetnost"},
{ORTH: "vel.", NORM: "velelnik"},
{ORTH: "vet.", NORM: "veterina"},
{ORTH: "vez.", NORM: "veznik"},
{ORTH: "vn.", NORM: "visokonemško"},
{ORTH: "voj.", NORM: "vojska"},
{ORTH: "vrtn.", NORM: "vrtnarstvo"},
{ORTH: "vulg.", NORM: "vulgarno"},
{ORTH: "vznes.", NORM: "vzneseno"},
{ORTH: "zal.", NORM: "založništvo"},
{ORTH: "zastar.", NORM: "zastarelo"},
{ORTH: "zgod.", NORM: "zgodovina"},
{ORTH: "zool.", NORM: "zoologija"},
{ORTH: "čeb.", NORM: "čebelarstvo"},
{ORTH: "češ.", NORM: "češki"},
{ORTH: "člov.", NORM: "človeškost"},
{ORTH: "šah.", NORM: "šahovski"},
{ORTH: "šalj.", NORM: "šaljivo"},
{ORTH: "šp.", NORM: "španski"},
{ORTH: "špan.", NORM: "špansko"},
{ORTH: "šport.", NORM: "športni"},
{ORTH: "štev.", NORM: "števnik"},
{ORTH: "šved.", NORM: "švedsko"},
{ORTH: "švic.", NORM: "švicarsko"},
{ORTH: "ž.", NORM: "ženski"},
{ORTH: "žarg.", NORM: "žargonsko"},
{ORTH: "žel.", NORM: "železnica"},
{ORTH: "živ.", NORM: "živost"},
]:
_exc[exc_data[ORTH]] = [exc_data]
abbrv = """
Co. Ch. DIPL. DR. Dr. Ev. Inc. Jr. Kr. Mag. M. MR. Mr. Mt. Murr. Npr. OZ.
Opr. Osn. Prim. Roj. ST. Sim. Sp. Sred. St. Sv. Škofl. Tel. UR. Zb.
a. aa. ab. abc. abit. abl. abs. abt. acc. accel. add. adj. adv. aet. afr. akad. al. alban. all. alleg.
alp. alt. alter. alžir. am. an. andr. ang. anh. anon. ans. antrop. apoc. app. approx. apt. ar. arc. arch.
arh. arr. as. asist. assist. assoc. asst. astr. attn. aug. avstral. az. b. bab. bal. bbl. bd. belg. bioinf.
biomed. bk. bl. bn. borg. bp. br. braz. brit. bros. broš. bt. bu. c. ca. cal. can. cand. cantab. cap. capt.
cat. cath. cc. cca. cd. cdr. cdre. cent. cerkv. cert. cf. cfr. ch. chap. chem. chr. chs. cic. circ. civ. cl.
cm. cmd. cnr. co. cod. col. coll. colo. com. comp. con. conc. cond. conn. cons. cont. coop. corr. cost. cp.
cpl. cr. crd. cres. cresc. ct. cu. d. dan. dat. davč. ddr. dec. ded. def. dem. dent. dept. dia. dip. dipl.
dir. disp. diss. div. do. doc. dok. dol. doo. dop. dott. dr. dram. druž. družb. drž. dt. duh. dur. dvr. dwt. e.
ea. ecc. eccl. eccles. econ. edn. egipt. egr. ekon. eksp. el. em. enc. eng. eo. ep. err. esp. esq. est.
et. etc. etnogr. etnol. ev. evfem. evr. ex. exc. excl. exp. expl. ext. exx. f. fa. facs. fak. faks. fas.
fasc. fco. fcp. feb. febr. fec. fed. fem. ff. fff. fid. fig. fil. film. fiziol. fiziot. flam. fm. fo. fol. folk.
frag. fran. franc. fsc. g. ga. gal. gdč. ge. gen. geod. geog. geotehnol. gg. gimn. glas. glav. gnr. go. gor.
gosp. gp. graf. gram. gren. grš. gs. h. hab. hf. hist. ho. hort. i. ia. ib. ibid. id. idr. idridr. ill. imen.
imp. impf. impr. in. inc. incl. ind. indus. inf. inform. ing. init. ins. int. inv. inšp. inštr. inž. is. islam.
ist. ital. iur. iz. izbr. izd. izg. izgr. izr. izv. j. jak. jam. jan. jav. je. jez. jr. jsl. jud. jug.
jugoslovan. jur. juž. jv. jz. k. kal. kan. kand. kat. kdo. kem. kip. kmet. kol. kom. komp. konf. kont. kost. kov.
kp. kpfw. kr. kraj. krat. kub. kult. kv. kval. l. la. lab. lb. ld. let. lib. lik. litt. lj. ljud. ll. loc. log.
loč. lt. ma. madž. mag. manag. manjš. masc. mass. mater. max. maxmax. mb. md. mech. medic. medij. medn.
mehč. mem. menedž. mes. mess. metal. meteor. meteorol. mex. mi. mikr. mil. minn. mio. misc. miss. mit. mk.
mkt. ml. mlad. mlle. mlr. mm. mme. množ. mo. moj. moš. možn. mr. mrd. mrs. ms. msc. msgr. mt. murr. mus. mut.
n. na. nad. nadalj. nadom. nagl. nakl. namer. nan. naniz. nasl. nat. navt. nač. ned. nem. nik. nizoz. nm. nn.
no. nom. norv. notr. nov. novogr. ns. o. ob. obd. obj. oblač. obl. oblik. obr. obraz. obs. obst. obt. obč. oc.
oct. od. odd. odg. odn. odst. odv. oec. off. ok. okla. okr. ont. oo. op. opis. opp. opr. orch. ord. ore. oreg.
org. orient. orig. ork. ort. oseb. osn. ot. ozir. ošk. p. pag. par. para. parc. parl. part. past. pat. pdk.
pen. perf. pert. perz. pesn. pet. pev. pf. pfc. ph. pharm. phil. pis. pl. po. pod. podr. podaljš. pogl. pogoj. pojm.
pok. pokr. pol. poljed. poljub. polu. pom. pomen. pon. ponov. pop. por. port. pos. posl. posn. pov. pp. ppl. pr.
praet. prav. pravopis. pravosl. preb. pred. predl. predm. predp. preds. pref. pregib. prel. prem. premen. prep.
pres. pret. prev. pribl. prih. pril. primerj. primor. prip. pripor. prir. prist. priv. proc. prof. prog. proiz.
prom. pron. prop. prot. protest. prov. ps. pss. pt. publ. pz. q. qld. qu. quad. que. r. racc. rastl. razgl.
razl. razv. rd. red. ref. reg. rel. relig. rep. repr. rer. resp. rest. ret. rev. revol. rež. rim. rist. rkp. rm.
roj. rom. romun. rp. rr. rt. rud. ruš. ry. sal. samogl. san. sc. scen. sci. scr. sdv. seg. sek. sen. sept. ser.
sev. sg. sgt. sh. sig. sigg. sign. sim. sin. sing. sinh. skand. skl. sklad. sklanj. sklep. skr. sl. slik. slov.
slovak. slovn. sn. so. sob. soc. sociol. sod. sopomen. sopr. sor. sov. sovj. sp. spec. spl. spr. spreg. sq. sr.
sre. sred. sredoz. srh. ss. ssp. st. sta. stan. stanstar. stcsl. ste. stim. stol. stom. str. stroj. strok. stsl.
stud. sup. supl. suppl. svet. sz. t. tab. tech. ted. tehn. tehnol. tek. teks. tekst. tel. temp. ten. teol. ter.
term. test. th. theol. tim. tip. tisočl. tit. tl. tol. tolmač. tom. tor. tov. tr. trad. traj. trans. tren.
trib. tril. trop. trp. trž. ts. tt. tu. tur. turiz. tvor. tvorb. . u. ul. umet. un. univ. up. upr. ur. urad.
us. ust. utr. v. va. val. var. varn. ven. ver. verb. vest. vezal. vic. vis. viv. viz. viš. vod. vok. vol. vpr.
vrst. vrstil. vs. vv. vzd. vzg. vzh. vzor. w. wed. wg. wk. x. y. z. zah. zaim. zak. zap. zasl. zavar. zač. zb.
združ. zg. zn. znan. znanstv. zoot. zun. zv. zvd. á. é. ć. č. čas. čet. čl. člen. čustv. đ. ľ. ł. ş. ŠT. š. šir.
škofl. škot. šol. št. števil. štud. ů. ű. žen. žival.
""".split()
for orth in abbrv:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@@ -29,7 +29,7 @@ class Ukrainian(Language):
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "pymorphy2",
"mode": "pymorphy3",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},


@@ -14,11 +14,11 @@ class UkrainianLemmatizer(RussianLemmatizer):
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
mode: str = "pymorphy3",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
@@ -29,6 +29,17 @@ class UkrainianLemmatizer(RussianLemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
elif mode == "pymorphy3":
try:
from pymorphy3 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Ukrainian lemmatizer mode 'pymorphy3' requires the "
"pymorphy3 library and dictionaries. Install them with: "
"pip install pymorphy3 pymorphy3-dicts-uk"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)


@@ -1,11 +1,12 @@
from pathlib import Path
from typing import Optional, Callable, Iterable, List, Tuple
from thinc.types import Floats2d
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged
from thinc.api import chain, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear, tuplify, Ragged
from ...util import registry
from ...kb import KnowledgeBase, Candidate, get_candidates
from ...kb import KnowledgeBase, InMemoryLookupKB
from ...kb import Candidate, get_candidates, get_candidates_batch
from ...vocab import Vocab
from ...tokens import Span, Doc
from ..extract_spans import extract_spans
@@ -78,9 +79,11 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
@registry.misc("spacy.KBFromFile.v1")
def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
def kb_from_file(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=1)
def load_kb(
kb_path: Path,
) -> Callable[[Vocab], KnowledgeBase]:
def kb_from_file(vocab: Vocab):
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.from_disk(kb_path)
return kb
@@ -88,9 +91,11 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.EmptyKB.v1")
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
def empty_kb_factory(vocab):
return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
def empty_kb(
entity_vector_length: int,
) -> Callable[[Vocab], KnowledgeBase]:
def empty_kb_factory(vocab: Vocab):
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
return empty_kb_factory
@@ -98,3 +103,10 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates
@registry.misc("spacy.CandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
]:
return get_candidates_batch
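
Downstream pipelines can register their own batch generator under a different name and point the entity_linker config at it; a hypothetical sketch (the registry name custom_candidates_batch.v1 and the naive strategy are made up):

from typing import Callable, Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span
from spacy.util import registry


@registry.misc("custom_candidates_batch.v1")
def create_custom_candidates_batch() -> Callable[
    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
]:
    def get_batch(kb: KnowledgeBase, mentions: Iterable[Span]):
        # Naive strategy: fall back to the per-mention lookup.
        return [kb.get_candidates(span) for span in mentions]

    return get_batch

The component config would then set get_candidates_batch = {"@misc": "custom_candidates_batch.v1"} and, to batch in practice, a candidates_batch_size greater than 1.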


@@ -1,7 +1,6 @@
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
from typing import Sequence, Tuple, Union
from typing import Tuple
from collections import Counter
from copy import deepcopy
from itertools import islice
import numpy as np
@@ -149,9 +148,7 @@ class EditTreeLemmatizer(TrainablePipe):
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
n_labels = len(self.cfg["labels"])
guesses: List[Ints2d] = [
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
]
guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
assert len(guesses) == n_docs
return guesses
scores = self.model.predict(docs)


@@ -53,9 +53,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
"candidates_batch_size": 1,
"threshold": None,
},
default_score_weights={
@ -75,9 +77,13 @@ def make_entity_linker(
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
],
overwrite: bool,
scorer: Optional[Callable],
use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None,
):
"""Construct an EntityLinker component.
@ -90,17 +96,21 @@ def make_entity_linker(
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
"""
if not model.attrs.get("include_span_maker", False):
# The only difference in arguments here is that use_gold_ents is not available
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
model,
@ -124,9 +134,11 @@ def make_entity_linker(
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
get_candidates_batch=get_candidates_batch,
overwrite=overwrite,
scorer=scorer,
use_gold_ents=use_gold_ents,
candidates_batch_size=candidates_batch_size,
threshold=threshold,
)
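A minimal sketch of passing the two new settings through the pipe config (values are illustrative):

import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "entity_linker",
    config={
        "candidates_batch_size": 4,  # look up candidates for 4 mentions at a time
        "threshold": 0.4,            # discard predictions scoring below 0.4
    },
)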
@ -160,9 +172,13 @@ class EntityLinker(TrainablePipe):
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None,
) -> None:
"""Initialize an entity linker.
@ -178,10 +194,14 @@ class EntityLinker(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links.
get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
DOCS: https://spacy.io/api/entitylinker#init
@ -204,22 +224,27 @@ class EntityLinker(TrainablePipe):
self.incl_prior = incl_prior
self.incl_context = incl_context
self.get_candidates = get_candidates
self.get_candidates_batch = get_candidates_batch
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
# create an empty KB by default
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
self.use_gold_ents = use_gold_ents
self.candidates_batch_size = candidates_batch_size
self.threshold = threshold
if candidates_batch_size < 1:
raise ValueError(Errors.E1044)
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
create it using this object's vocab."""
if not callable(kb_loader):
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
self.kb = kb_loader(self.vocab)
self.kb = kb_loader(self.vocab) # type: ignore
def validate_kb(self) -> None:
# Raise an error if the knowledge base is not initialized.
@ -241,8 +266,8 @@ class EntityLinker(TrainablePipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
Note that providing this argument, will overwrite all data accumulated in the current KB.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab
instance. Note that providing this argument will overwrite all data accumulated in the current KB.
Use this only when loading a KB as such from file.
DOCS: https://spacy.io/api/entitylinker#initialize
@ -419,66 +444,93 @@ class EntityLinker(TrainablePipe):
if len(doc) == 0:
continue
sentences = [s for s in doc.sents]
# Looping through each entity (TODO: rewrite)
for ent in doc.ents:
sent_index = sentences.index(ent.sent)
assert sent_index >= 0
if self.incl_context:
# get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
sentence_encoding = self.model.predict([sent_doc])[0]
sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t)
entity_count += 1
if ent.label_ in self.labels_discard:
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
else:
candidates = list(self.get_candidates(self.kb, ent))
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
elif len(candidates) == 1 and self.threshold is None:
# shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_)
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
if len(entity_encodings) != len(prior_probs):
raise RuntimeError(
Errors.E147.format(
method="predict",
msg="vectors not of equal length",
)
)
# cosine similarity
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
final_kb_ids.append(
candidates[scores.argmax().item()].entity_
if self.threshold is None or scores.max() >= self.threshold
else EntityLinker.NIL
# Loop over entities in batches.
for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
# Look up candidate entities.
valid_ent_idx = [
idx
for idx in range(len(ent_batch))
if ent_batch[idx].label_ not in self.labels_discard
]
batch_candidates = list(
self.get_candidates_batch(
self.kb, [ent_batch[idx] for idx in valid_ent_idx]
)
if self.candidates_batch_size > 1
else [
self.get_candidates(self.kb, ent_batch[idx])
for idx in valid_ent_idx
]
)
# Looping through each entity in batch (TODO: rewrite)
for j, ent in enumerate(ent_batch):
sent_index = sentences.index(ent.sent)
assert sent_index >= 0
if self.incl_context:
# get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(
len(sentences) - 1, sent_index + self.n_sents
)
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
sentence_encoding = self.model.predict([sent_doc])[0]
sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t)
entity_count += 1
if ent.label_ in self.labels_discard:
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
else:
candidates = list(batch_candidates[j])
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
elif len(candidates) == 1 and self.threshold is None:
# shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_)
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
if len(entity_encodings) != len(prior_probs):
raise RuntimeError(
Errors.E147.format(
method="predict",
msg="vectors not of equal length",
)
)
# cosine similarity
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
final_kb_ids.append(
candidates[scores.argmax().item()].entity_
if self.threshold is None
or scores.max() >= self.threshold
else EntityLinker.NIL
)
if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format(
method="predict", msg="result variables not of equal length"

View File

@ -68,8 +68,7 @@ class EntityLinker_v1(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
self.vocab = vocab
@ -115,7 +114,7 @@ class EntityLinker_v1(TrainablePipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
Note that providing this argument will overwrite all data accumulated in the current KB.
Use this only when loading a KB as such from file.

View File

@ -26,17 +26,17 @@ scorer = {"@layers": "spacy.LinearLogistic.v1"}
hidden_size = 128
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
rows = [5000, 2000, 1000, 1000]
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@ -133,6 +133,9 @@ def make_spancat(
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
threshold (float): Minimum probability to consider a prediction positive.
Spans with a positive prediction will be saved on the Doc. Defaults to
0.5.

View File

@ -19,7 +19,7 @@ multi_label_default_config = """
@architectures = "spacy.TextCatEnsemble.v2"
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
@ -29,7 +29,7 @@ attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@ -96,8 +96,8 @@ def make_multilabel_textcat(
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
) -> "MultiLabel_TextCategorizer":
"""Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be non-mutually exclusive, which means that there can be zero or more labels
per doc.
@ -105,6 +105,7 @@ def make_multilabel_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
"""
return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
@ -147,6 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
DOCS: https://spacy.io/api/textcategorizer#init
"""

View File

@ -123,9 +123,6 @@ class Tok2Vec(TrainablePipe):
width = self.model.get_dim("nO")
return [self.model.ops.alloc((0, width)) for doc in docs]
tokvecs = self.model.predict(docs)
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners:
listener.receive(batch_id, tokvecs, _empty_backprop)
return tokvecs
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
@ -286,8 +283,19 @@ class Tok2VecListener(Model):
def forward(model: Tok2VecListener, inputs, is_train: bool):
"""Supply the outputs from the upstream Tok2Vec component."""
if is_train:
model.verify_inputs(inputs)
return model._outputs, model._backprop
# This might occur during training when the tok2vec layer is frozen / hasn't been updated.
# In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
if model._batch_id is None:
outputs = []
for doc in inputs:
if doc.tensor.size == 0:
raise ValueError(Errors.E203.format(name="tok2vec"))
else:
outputs.append(doc.tensor)
return outputs, _empty_backprop
else:
model.verify_inputs(inputs)
return model._outputs, model._backprop
else:
# This is pretty grim, but it's hard to do better :(.
# It's hard to avoid relying on the doc.tensor attribute, because the
@ -306,7 +314,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
outputs.append(model.ops.alloc2f(len(doc), width))
else:
outputs.append(doc.tensor)
return outputs, lambda dX: []
return outputs, _empty_backprop
def _empty_backprop(dX): # for pickling

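The new branch is exercised when the tok2vec pipe is frozen during training but still listed as an annotating component; a hedged sketch of that training call (train_examples and optimizer assumed, mirroring the tests below):

losses = {}
nlp.update(
    train_examples,
    sgd=optimizer,
    losses=losses,
    exclude=["tok2vec"],    # freeze: don't update the tok2vec weights
    annotates=["tok2vec"],  # annotate: still write doc.tensor for the listener
)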
View File

@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")
class Config:
extra = "forbid"
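These aliased fields back the rich comparison operators in Matcher token patterns; a hedged usage sketch (pattern and text are illustrative):

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
# LENGTH comparisons are validated against the schema above
matcher.add("LONG_TOKEN", [[{"LENGTH": {">=": 10}}]])
doc = nlp("unquestionably short")
matches = matcher(doc)  # matches "unquestionably"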
@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: StrictStr = Field("", title="Description of asset")
# fmt: on
@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
class ProjectConfigAssetGit(BaseModel):
# fmt: off
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: Optional[StrictStr] = Field(None, title="Description of asset")
# fmt: on
@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
None, title="Indices of sentences' start and end indices"
)
text: StrictStr = Field(..., title="Document text")
spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
None, title="Span information - end/start indices, label, KB ID"
)
spans: Optional[
Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
] = Field(None, title="Span information - end/start indices, label, KB ID")
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
..., title="Token information - ID, start, annotations"
)
@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
title="Any custom data stored in the document's _ attribute",
alias="_",
)
underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the token's _ attribute"
)
underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the span's _ attribute"
)
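For orientation, a hedged sketch of the shape the updated fields now validate, with each custom attribute mapping to a list of records rather than a single one (values are illustrative):

json_doc = {
    "text": "Hello world",
    "tokens": [{"id": 0, "start": 0, "end": 5}, {"id": 1, "start": 6, "end": 11}],
    "underscore_token": {"token_test": [{"start": 0, "value": 117}]},
    "underscore_span": {"span_test": [{"start": 0, "end": 5, "value": "span_attribute"}]},
}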

View File

@ -333,16 +333,24 @@ def ro_tokenizer():
@pytest.fixture(scope="session")
def ru_tokenizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy3")
return get_lang_class("ru")().tokenizer
@pytest.fixture
def ru_lemmatizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy3")
return get_lang_class("ru")().add_pipe("lemmatizer")
@pytest.fixture
def ru_lookup_lemmatizer():
pytest.importorskip("pymorphy2")
return get_lang_class("ru")().add_pipe(
"lemmatizer", config={"mode": "pymorphy2_lookup"}
)
@pytest.fixture(scope="session")
def sa_tokenizer():
return get_lang_class("sa")().tokenizer
@ -411,15 +419,24 @@ def ky_tokenizer():
@pytest.fixture(scope="session")
def uk_tokenizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy3")
return get_lang_class("uk")().tokenizer
@pytest.fixture
def uk_lemmatizer():
pytest.importorskip("pymorphy3")
pytest.importorskip("pymorphy3_dicts_uk")
return get_lang_class("uk")().add_pipe("lemmatizer")
@pytest.fixture
def uk_lookup_lemmatizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2_dicts_uk")
return get_lang_class("uk")().add_pipe("lemmatizer")
return get_lang_class("uk")().add_pipe(
"lemmatizer", config={"mode": "pymorphy2_lookup"}
)
@pytest.fixture(scope="session")

View File

@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
doc[1]._.token_test = 118
doc.spans["span_group"] = [doc[0:1]]
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
assert json_doc["_"]["json_test2"] == [1, 2, 3]
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
assert json_doc["_"]["json_test"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
assert json_doc["_"]["my_ext"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["my_ext"]["value"] == 117
assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
doc[0]._.token_test = 117
json_doc = doc.to_json(underscore=["span_test"])
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert "token_test" not in json_doc["underscore_token"]
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert "underscore_token" not in json_doc
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
doc[1]._.token_test = 118
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
assert new_doc._.json_test1 == "hello world"
assert new_doc._.json_test2 == [1, 2, 3]
assert new_doc[0]._.token_test == 117
assert new_doc[1]._.token_test == 118
assert new_doc[0:1]._.span_test == "span_attribute"
assert new_doc[0:2]._.span_test == "span_attribute_2"
assert new_doc.user_data == doc.user_data
assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
exclude=["user_data"]

View File

@ -0,0 +1,18 @@
import pytest
# fmt: off
GRC_TOKEN_EXCEPTION_TESTS = [
("τὸ 〈τῆς〉 φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ ⟦βαρβάρων⟧ ἄρξαι.", ["τὸ", "", "τῆς", "", "φιλοσοφίας", "ἔργον", "ἔνιοί", "φασιν", "ἀπὸ", "", "βαρβάρων", "", "ἄρξαι", "."]),
("τὴν δὲ τῶν Αἰγυπτίων φιλοσοφίαν εἶναι τοιαύτην περί τε †θεῶν† καὶ ὑπὲρ δικαιοσύνης.", ["τὴν", "δὲ", "τῶν", "Αἰγυπτίων", "φιλοσοφίαν", "εἶναι", "τοιαύτην", "περί", "τε", "", "θεῶν", "", "καὶ", "ὑπὲρ", "δικαιοσύνης", "."]),
("⸏πόσις δ' Ἐρεχθεύς ἐστί μοι σεσωσμένος⸏", ["", "πόσις", "δ'", "Ἐρεχθεύς", "ἐστί", "μοι", "σεσωσμένος", ""]),
("⸏ὔπνον ἴδωμεν⸎", ["", "ὔπνον", "ἴδωμεν", ""]),
]
# fmt: on
@pytest.mark.parametrize("text,expected_tokens", GRC_TOKEN_EXCEPTION_TESTS)
def test_grc_tokenizer(grc_tokenizer, text, expected_tokens):
tokens = grc_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
words = ["мама", "мыла", "раму"]
pos = ["NOUN", "VERB", "NOUN"]
morphs = [
"Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
"Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
"Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
]
doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
doc = ru_lookup_lemmatizer(doc)
lemmas = [token.lemma_ for token in doc]
assert lemmas == ["мама", "мыла", "раму"]

View File

@ -20,7 +20,6 @@ od katerih so te svoboščine odvisne,
assert len(tokens) == 116
@pytest.mark.xfail
def test_ordinal_number(sl_tokenizer):
text = "10. decembra 1948"
tokens = sl_tokenizer(text)

View File

@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
"""Check that the default uk lemmatizer runs."""
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
uk_lemmatizer(doc)
assert [token.lemma for token in doc]
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
"""Check that the lookup uk lemmatizer runs."""
doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
uk_lookup_lemmatizer(doc)
assert [token.lemma for token in doc]

View File

@ -6,7 +6,7 @@ from numpy.testing import assert_equal
from spacy import registry, util
from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle
from spacy.kb import Candidate, KnowledgeBase, get_candidates
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
from spacy.lang.en import English
from spacy.ml import load_kb
from spacy.pipeline import EntityLinker
@ -34,7 +34,7 @@ def assert_almost_equal(a, b):
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
@ -51,7 +51,7 @@ def test_issue4674():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.to_disk(str(file_path))
kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb2.from_disk(str(file_path))
assert kb2.get_size_entities() == 1
@ -59,9 +59,9 @@ def test_issue4674():
@pytest.mark.issue(6730)
def test_issue6730(en_vocab):
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
from spacy.kb import KnowledgeBase
from spacy.kb.kb_in_memory import InMemoryLookupKB
kb = KnowledgeBase(en_vocab, entity_vector_length=3)
kb = InMemoryLookupKB(en_vocab, entity_vector_length=3)
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
with pytest.raises(ValueError):
@ -127,7 +127,7 @@ def test_issue7065_b():
def create_kb(vocab):
# create artificial KB
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
alias="No. 8",
@ -190,7 +190,7 @@ def test_no_entities():
def create_kb(vocab):
# create artificial KB
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
return mykb
@ -231,7 +231,7 @@ def test_partial_links():
def create_kb(vocab):
# create artificial KB
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
return mykb
@ -263,7 +263,7 @@ def test_partial_links():
def test_kb_valid_entities(nlp):
"""Test the valid construction of a KB with 3 entities and two aliases"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3])
@ -292,7 +292,7 @@ def test_kb_valid_entities(nlp):
def test_kb_invalid_entities(nlp):
"""Test the invalid construction of a KB with an alias linked to a non-existing entity"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@ -308,7 +308,7 @@ def test_kb_invalid_entities(nlp):
def test_kb_invalid_probabilities(nlp):
"""Test the invalid construction of a KB with wrong prior probabilities"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@ -322,7 +322,7 @@ def test_kb_invalid_probabilities(nlp):
def test_kb_invalid_combination(nlp):
"""Test the invalid construction of a KB with non-matching entity and probability lists"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@ -338,7 +338,7 @@ def test_kb_invalid_combination(nlp):
def test_kb_invalid_entity_vector(nlp):
"""Test the invalid construction of a KB with non-matching entity vector lengths"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3])
@ -376,7 +376,7 @@ def test_kb_initialize_empty(nlp):
def test_kb_serialize(nlp):
"""Test serialization of the KB"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
with make_tempdir() as d:
# normal read-write behaviour
mykb.to_disk(d / "kb")
@ -393,12 +393,12 @@ def test_kb_serialize(nlp):
@pytest.mark.issue(9137)
def test_kb_serialize_2(nlp):
v = [5, 6, 7, 8]
kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb1.set_entities(["E1"], [1], [v])
assert kb1.get_vector("E1") == v
with make_tempdir() as d:
kb1.to_disk(d / "kb")
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb2.from_disk(d / "kb")
assert kb2.get_vector("E1") == v
@ -408,7 +408,7 @@ def test_kb_set_entities(nlp):
v = [5, 6, 7, 8]
v1 = [1, 1, 1, 0]
v2 = [2, 2, 2, 3]
kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb1.set_entities(["E0"], [1], [v])
assert kb1.get_entity_strings() == ["E0"]
kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])
@ -417,7 +417,7 @@ def test_kb_set_entities(nlp):
assert kb1.get_vector("E2") == v2
with make_tempdir() as d:
kb1.to_disk(d / "kb")
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb2.from_disk(d / "kb")
assert set(kb2.get_entity_strings()) == {"E1", "E2"}
assert kb2.get_vector("E1") == v1
@ -428,7 +428,7 @@ def test_kb_serialize_vocab(nlp):
"""Test serialization of the KB and custom strings"""
entity = "MyFunnyID"
assert entity not in nlp.vocab.strings
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
assert not mykb.contains_entity(entity)
mykb.add_entity(entity, freq=342, entity_vector=[3])
assert mykb.contains_entity(entity)
@ -436,14 +436,14 @@ def test_kb_serialize_vocab(nlp):
with make_tempdir() as d:
# normal read-write behaviour
mykb.to_disk(d / "kb")
mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1)
mykb_new = InMemoryLookupKB(Vocab(), entity_vector_length=1)
mykb_new.from_disk(d / "kb")
assert entity in mykb_new.vocab.strings
def test_candidate_generation(nlp):
"""Test correct candidate generation"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
doc = nlp("douglas adam Adam shrubbery")
douglas_ent = doc[0:1]
@ -481,7 +481,7 @@ def test_el_pipe_configuration(nlp):
ruler.add_patterns([pattern])
def create_kb(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=1)
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
@ -500,10 +500,21 @@ def test_el_pipe_configuration(nlp):
def get_lowercased_candidates(kb, span):
return kb.get_alias_candidates(span.text.lower())
def get_lowercased_candidates_batch(kb, spans):
return [get_lowercased_candidates(kb, span) for span in spans]
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
def create_candidates() -> Callable[
[InMemoryLookupKB, "Span"], Iterable[Candidate]
]:
return get_lowercased_candidates
@registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[
[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
]:
return get_lowercased_candidates_batch
# replace the pipe with a new one with a different candidate generator
entity_linker = nlp.replace_pipe(
"entity_linker",
@ -511,6 +522,9 @@ def test_el_pipe_configuration(nlp):
config={
"incl_context": False,
"get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
"get_candidates_batch": {
"@misc": "spacy.LowercaseCandidateBatchGenerator.v1"
},
},
)
entity_linker.set_kb(create_kb)
@ -532,7 +546,7 @@ def test_nel_nsents(nlp):
def test_vocab_serialization(nlp):
"""Test that string information is retained across storage"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@ -552,7 +566,7 @@ def test_vocab_serialization(nlp):
with make_tempdir() as d:
mykb.to_disk(d / "kb")
kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
kb_new_vocab.from_disk(d / "kb")
candidates = kb_new_vocab.get_alias_candidates("adam")
@ -568,7 +582,7 @@ def test_vocab_serialization(nlp):
def test_append_alias(nlp):
"""Test that we can append additional alias-entity pairs"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@ -599,7 +613,7 @@ def test_append_alias(nlp):
@pytest.mark.filterwarnings("ignore:\\[W036")
def test_append_invalid_alias(nlp):
"""Test that append an alias will throw an error if prior probs are exceeding 1"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@ -621,7 +635,7 @@ def test_preserving_links_asdoc(nlp):
vector_length = 1
def create_kb(vocab):
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
@ -723,7 +737,7 @@ def test_overfitting_IO():
# create artificial KB - assign same prior weight to the two russ cochran's
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
@ -805,7 +819,7 @@ def test_kb_serialization():
kb_dir = tmp_dir / "kb"
nlp1 = English()
assert "Q2146908" not in nlp1.vocab.strings
mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
mykb = InMemoryLookupKB(nlp1.vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
assert "Q2146908" in nlp1.vocab.strings
@ -828,7 +842,7 @@ def test_kb_serialization():
def test_kb_pickle():
# Test that the KB can be pickled
nlp = English()
kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
assert not kb_1.contains_alias("Russ Cochran")
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
@ -842,7 +856,7 @@ def test_kb_pickle():
def test_nel_pickle():
# Test that a pipeline with an EL component can be pickled
def create_kb(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=3)
kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
return kb
@ -864,7 +878,7 @@ def test_nel_pickle():
def test_kb_to_bytes():
# Test that the KB's to_bytes method works correctly
nlp = English()
kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
@ -874,7 +888,7 @@ def test_kb_to_bytes():
)
assert kb_1.contains_alias("Russ Cochran")
kb_bytes = kb_1.to_bytes()
kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb_2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
assert not kb_2.contains_alias("Russ Cochran")
kb_2 = kb_2.from_bytes(kb_bytes)
# check that both KBs are exactly the same
@ -897,7 +911,7 @@ def test_kb_to_bytes():
def test_nel_to_bytes():
# Test that a pipeline with an EL component can be converted to bytes
def create_kb(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=3)
kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
return kb
@ -987,7 +1001,7 @@ def test_legacy_architectures(name, config):
train_examples.append(Example.from_dict(doc, annotation))
def create_kb(vocab):
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
@ -1054,7 +1068,7 @@ def test_no_gold_ents(patterns):
def create_kb(vocab):
# create artificial KB
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Kirby", ["Q613241"], [0.9])
# Placeholder
@ -1104,7 +1118,7 @@ def test_tokenization_mismatch():
def create_kb(vocab):
# create placeholder KB
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Kirby", ["Q613241"], [0.9])
return mykb
@ -1121,6 +1135,12 @@ def test_tokenization_mismatch():
nlp.evaluate(train_examples)
def test_abstract_kb_instantiation():
"""Test whether instantiation of abstract KB base class fails."""
with pytest.raises(TypeError):
KnowledgeBase(None, 3)
# fmt: off
@pytest.mark.parametrize(
"meet_threshold,config",
@ -1151,7 +1171,7 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
def create_kb(vocab):
# create artificial KB
mykb = KnowledgeBase(vocab, entity_vector_length=3)
mykb = InMemoryLookupKB(vocab, entity_vector_length=3)
mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
mykb.add_alias(
alias="Mahler",

View File

@ -230,6 +230,97 @@ def test_tok2vec_listener_callback():
assert get_dX(Y) is not None
def test_tok2vec_listener_overfitting():
"""Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
orig_config = Config().from_str(cfg_string)
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses, annotates=["tok2vec"])
assert losses["tagger"] < 0.00001
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
assert doc[0].tag_ == "N"
assert doc[1].tag_ == "V"
assert doc[2].tag_ == "J"
assert doc[3].tag_ == "N"
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
assert doc2[0].tag_ == "N"
assert doc2[1].tag_ == "V"
assert doc2[2].tag_ == "J"
assert doc2[3].tag_ == "N"
def test_tok2vec_frozen_not_annotating():
"""Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
orig_config = Config().from_str(cfg_string)
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(2):
losses = {}
with pytest.raises(
ValueError, match=r"the tok2vec embedding layer is not updated"
):
nlp.update(
train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
)
def test_tok2vec_frozen_overfitting():
"""Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
orig_config = Config().from_str(cfg_string)
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(100):
losses = {}
nlp.update(
train_examples,
sgd=optimizer,
losses=losses,
exclude=["tok2vec"],
annotates=["tok2vec"],
)
assert losses["tagger"] < 0.0001
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
assert doc[0].tag_ == "N"
assert doc[1].tag_ == "V"
assert doc[2].tag_ == "J"
assert doc[3].tag_ == "N"
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
assert doc2[0].tag_ == "N"
assert doc2[1].tag_ == "V"
assert doc2[2].tag_ == "J"
assert doc2[3].tag_ == "N"
def test_replace_listeners():
orig_config = Config().from_str(cfg_string)
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)

View File

@ -3,7 +3,7 @@ from unittest import TestCase
import pytest
import srsly
from numpy import zeros
from spacy.kb import KnowledgeBase, Writer
from spacy.kb.kb_in_memory import InMemoryLookupKB, Writer
from spacy.vectors import Vectors
from spacy.language import Language
from spacy.pipeline import TrainablePipe
@ -71,7 +71,7 @@ def entity_linker():
nlp = Language()
def create_kb(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=1)
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
return kb
@ -120,7 +120,7 @@ def test_writer_with_path_py35():
def test_save_and_load_knowledge_base():
nlp = Language()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
with make_tempdir() as d:
path = d / "kb"
try:
@ -129,7 +129,7 @@ def test_save_and_load_knowledge_base():
pytest.fail(str(e))
try:
kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
kb_loaded = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
kb_loaded.from_disk(path)
except Exception as e:
pytest.fail(str(e))

View File

@ -2,7 +2,7 @@ from typing import Callable
from spacy import util
from spacy.util import ensure_path, registry, load_model_from_config
from spacy.kb import KnowledgeBase
from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.vocab import Vocab
from thinc.api import Config
@ -22,7 +22,7 @@ def test_serialize_kb_disk(en_vocab):
dir_path.mkdir()
file_path = dir_path / "kb"
kb1.to_disk(str(file_path))
kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
kb2 = InMemoryLookupKB(vocab=en_vocab, entity_vector_length=3)
kb2.from_disk(str(file_path))
# final assertions
@ -30,7 +30,7 @@ def test_serialize_kb_disk(en_vocab):
def _get_dummy_kb(vocab):
kb = KnowledgeBase(vocab, entity_vector_length=3)
kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
@ -104,7 +104,7 @@ def test_serialize_subclassed_kb():
custom_field = 666
"""
class SubKnowledgeBase(KnowledgeBase):
class SubInMemoryLookupKB(InMemoryLookupKB):
def __init__(self, vocab, entity_vector_length, custom_field):
super().__init__(vocab, entity_vector_length)
self.custom_field = custom_field
@ -112,9 +112,9 @@ def test_serialize_subclassed_kb():
@registry.misc("spacy.CustomKB.v1")
def custom_kb(
entity_vector_length: int, custom_field: int
) -> Callable[[Vocab], KnowledgeBase]:
) -> Callable[[Vocab], InMemoryLookupKB]:
def custom_kb_factory(vocab):
kb = SubKnowledgeBase(
kb = SubInMemoryLookupKB(
vocab=vocab,
entity_vector_length=entity_vector_length,
custom_field=custom_field,
@ -129,7 +129,7 @@ def test_serialize_subclassed_kb():
nlp.initialize()
entity_linker = nlp.get_pipe("entity_linker")
assert type(entity_linker.kb) == SubKnowledgeBase
assert type(entity_linker.kb) == SubInMemoryLookupKB
assert entity_linker.kb.entity_vector_length == 342
assert entity_linker.kb.custom_field == 666
@ -139,6 +139,6 @@ def test_serialize_subclassed_kb():
nlp2 = util.load_model_from_path(tmp_dir)
entity_linker2 = nlp2.get_pipe("entity_linker")
# After IO, the KB is the standard one
assert type(entity_linker2.kb) == KnowledgeBase
assert type(entity_linker2.kb) == InMemoryLookupKB
assert entity_linker2.kb.entity_vector_length == 342
assert not hasattr(entity_linker2.kb, "custom_field")

View File

@ -42,7 +42,8 @@ class SpanGroups(UserDict):
def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups":
if doc is None:
doc = self._ensure_doc()
return SpanGroups(doc).from_bytes(self.to_bytes())
data_copy = ((k, v.copy(doc=doc)) for k, v in self.items())
return SpanGroups(doc, items=data_copy)
def setdefault(self, key, default=None):
if not isinstance(default, SpanGroup):

View File

@ -1609,24 +1609,20 @@ cdef class Doc:
Doc.set_extension(attr)
self._.set(attr, doc_json["_"][attr])
if doc_json.get("underscore_token", {}):
for token_attr in doc_json["underscore_token"]:
token_start = doc_json["underscore_token"][token_attr]["token_start"]
value = doc_json["underscore_token"][token_attr]["value"]
for token_attr in doc_json.get("underscore_token", {}):
if not Token.has_extension(token_attr):
Token.set_extension(token_attr)
for token_data in doc_json["underscore_token"][token_attr]:
start = token_by_char(self.c, self.length, token_data["start"])
value = token_data["value"]
self[start]._.set(token_attr, value)
if not Token.has_extension(token_attr):
Token.set_extension(token_attr)
self[token_start]._.set(token_attr, value)
if doc_json.get("underscore_span", {}):
for span_attr in doc_json["underscore_span"]:
token_start = doc_json["underscore_span"][span_attr]["token_start"]
token_end = doc_json["underscore_span"][span_attr]["token_end"]
value = doc_json["underscore_span"][span_attr]["value"]
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
self[token_start:token_end]._.set(span_attr, value)
for span_attr in doc_json.get("underscore_span", {}):
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
for span_data in doc_json["underscore_span"][span_attr]:
value = span_data["value"]
self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
return self
def to_json(self, underscore=None):
@ -1674,30 +1670,34 @@ cdef class Doc:
if underscore:
user_keys = set()
if self.user_data:
data["_"] = {}
data["underscore_token"] = {}
data["underscore_span"] = {}
for data_key in self.user_data:
for data_key, value in self.user_data.copy().items():
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
attr = data_key[1]
start = data_key[2]
end = data_key[3]
if attr in underscore:
user_keys.add(attr)
value = self.user_data[data_key]
if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
# Check if doc attribute
if start is None:
if "_" not in data:
data["_"] = {}
data["_"][attr] = value
# Check if token attribute
elif end is None:
if "underscore_token" not in data:
data["underscore_token"] = {}
if attr not in data["underscore_token"]:
data["underscore_token"][attr] = {"token_start": start, "value": value}
data["underscore_token"][attr] = []
data["underscore_token"][attr].append({"start": start, "value": value})
# Else span attribute
else:
if "underscore_span" not in data:
data["underscore_span"] = {}
if attr not in data["underscore_span"]:
data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
data["underscore_span"][attr] = []
data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
for attr in underscore:
if attr not in user_keys:

View File

@ -1,4 +1,4 @@
from typing import Any, Dict, Iterable
from typing import Any, Dict, Iterable, Optional
from .doc import Doc
from .span import Span
@ -24,4 +24,4 @@ class SpanGroup:
def __getitem__(self, i: int) -> Span: ...
def to_bytes(self) -> bytes: ...
def from_bytes(self, bytes_data: bytes) -> SpanGroup: ...
def copy(self) -> SpanGroup: ...
def copy(self, doc: Optional[Doc] = ...) -> SpanGroup: ...

View File

@ -241,15 +241,18 @@ cdef class SpanGroup:
cdef void push_back(self, SpanC span) nogil:
self.c.push_back(span)
def copy(self) -> SpanGroup:
def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:
"""Clones the span group.
doc (Doc): New reference document to which the copy is bound.
RETURNS (SpanGroup): A copy of the span group.
DOCS: https://spacy.io/api/spangroup#copy
"""
if doc is None:
doc = self.doc
return SpanGroup(
self.doc,
doc,
name=self.name,
attrs=deepcopy(self.attrs),
spans=list(self),

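A hedged usage sketch of the new keyword (document text and span group name are illustrative):

import spacy

nlp = spacy.blank("en")
doc = nlp("Their goi ng home")
doc.spans["errors"] = [doc[0:1]]
doc_copy = doc.copy()
# Bind the copied group to the copied doc instead of the original
group_copy = doc.spans["errors"].copy(doc=doc_copy)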
View File

@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with
</Infobox>
```cli
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
```
> #### Example
@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
| `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |

View File

@ -14,7 +14,8 @@ entities) to unique identifiers, grounding the named entities into the "real
world". It requires a `KnowledgeBase`, as well as a function to generate
plausible candidates from that `KnowledgeBase` given a certain textual mention,
and a machine learning model to pick the right candidate, given the local
context of the mention.
context of the mention. `EntityLinker` defaults to using the
[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
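A minimal sketch (the pipe starts out with an empty `InMemoryLookupKB` of the configured vector length; values are illustrative):

```python
import spacy

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 64})
```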
## Assigned Attributes {#assigned-attributes}
@ -170,7 +171,7 @@ with the current vocab.
>
> ```python
> def create_kb(vocab):
> kb = KnowledgeBase(vocab, entity_vector_length=128)
> kb = InMemoryLookupKB(vocab, entity_vector_length=128)
> kb.add_entity(...)
> kb.add_alias(...)
> return kb

View File

@ -4,27 +4,45 @@ teaser:
A storage class for entities and aliases of a specific knowledge base
(ontology)
tag: class
source: spacy/kb.pyx
source: spacy/kb/kb.pyx
new: 2.2
---
The `KnowledgeBase` object provides a method to generate
[`Candidate`](/api/kb/#candidate) objects, which are plausible external
The `KnowledgeBase` object is an abstract class providing a method to generate
[`Candidate`](/api/kb#candidate) objects, which are plausible external
identifiers given a certain textual mention. Each such `Candidate` holds
information from the relevant KB entities, such as its frequency in text and
possible aliases. Each entity in the knowledge base also has a pretrained entity
vector of a fixed size.
Beyond that, `KnowledgeBase` classes have to implement a number of utility
functions called by the [`EntityLinker`](/api/entitylinker) component.
<Infobox variant="warning">
This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
implementation up to that point is available as `InMemoryLookupKB` from 3.5
onwards.
</Infobox>
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
Create the knowledge base.
`KnowledgeBase` is an abstract class and cannot be instantiated. Its child
classes should call `__init__()` to set up some necessary attributes.
> #### Example
>
> ```python
> from spacy.kb import KnowledgeBase
> from spacy.vocab import Vocab
>
> class FullyImplementedKB(KnowledgeBase):
> def __init__(self, vocab: Vocab, entity_vector_length: int):
> super().__init__(vocab, entity_vector_length)
> ...
> vocab = nlp.vocab
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
> ```
| Name | Description |
@ -40,133 +58,66 @@ The length of the fixed-size entity vectors in the knowledge base.
| ----------- | ------------------------------------------------ |
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
## KnowledgeBase.add_entity {#add_entity tag="method"}
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
Add an entity to the knowledge base, specifying its corpus frequency and entity
vector, which should be of length
[`entity_vector_length`](/api/kb#entity_vector_length).
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate).
> #### Example
>
> ```python
> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
> from spacy.lang.en import English
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
> candidates = kb.get_candidates(doc[0:2])
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------- |
| `mention` | The textual mention or alias. ~~Span~~ |
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
## KnowledgeBase.get_candidates_batch {#get_candidates_batch tag="method"}

Same as [`get_candidates()`](/api/kb#get_candidates), but for an arbitrary
number of mentions. The [`EntityLinker`](/api/entitylinker) component will call
`get_candidates_batch()` instead of `get_candidates()` if the config parameter
`candidates_batch_size` is greater than or equal to 1.
The default implementation of `get_candidates_batch()` executes
`get_candidates()` in a loop. We recommend implementing a more efficient way
to retrieve candidates for multiple mentions at once if performance is a
concern.
> #### Example
>
> ```python
> from spacy.lang.en import English
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
> candidates = kb.get_candidates_batch((doc[0:2], doc[3:]))
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------- |
| `mentions` | The textual mentions or aliases. ~~Iterable[Span]~~ |
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
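Resolving each distinct surface form only once is a simple batching
optimization for lookup-style KBs. A sketch building on
[`InMemoryLookupKB`](/api/kb_in_memory) for brevity (`DedupKB` is illustrative,
not spaCy API):

```python
from spacy.kb import InMemoryLookupKB

class DedupKB(InMemoryLookupKB):
    def get_candidates_batch(self, mentions):
        # Look up each distinct mention text once, then reuse the result
        # for repeated surface forms.
        cache = {}
        results = []
        for mention in mentions:
            if mention.text not in cache:
                cache[mention.text] = list(self.get_candidates(mention))
            results.append(cache[mention.text])
        return results
```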
## KnowledgeBase.get_alias_candidates {#get_alias_candidates tag="method"}
<Infobox variant="warning">
This method is _not_ available from spaCy 3.5 onwards.
</Infobox>
From spaCy 3.5 on, `KnowledgeBase` is an abstract class (with
[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
more flexibility in customizing knowledge bases. Some of its methods were moved
to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of them
being `get_alias_candidates()`. This method is now available as
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
Note that [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
delegates to
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
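A short migration sketch (the entity ID, frequency, vector, and prior
probability are made-up values):

```python
import numpy
from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

kb = InMemoryLookupKB(vocab=Vocab(), entity_vector_length=3)
kb.add_entity(entity="Q42", freq=32, entity_vector=numpy.asarray([1.0, 2.0, 3.0]))
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.6])

# Up to spaCy 3.4 this call was available on `KnowledgeBase` itself:
candidates = kb.get_alias_candidates("Douglas")
```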
## KnowledgeBase.get_vector {#get_vector tag="method"}
@ -178,27 +129,30 @@ Given a certain entity ID, retrieve its pretrained entity vector.
> vector = kb.get_vector("Q42")
> ```
| Name | Description |
| ----------- | -------------------------------------- |
| `entity` | The entity ID. ~~str~~ |
| **RETURNS** | The entity vector. ~~Iterable[float]~~ |
## KnowledgeBase.get_vectors {#get_vectors tag="method"}

Same as [`get_vector()`](/api/kb#get_vector), but for an arbitrary number of
entity IDs.
The default implementation of `get_vectors()` executes `get_vector()` in a
loop. We recommend implementing a more efficient way to retrieve vectors for
multiple entities at once if performance is a concern.
> #### Example
>
> ```python
> vectors = kb.get_vectors(("Q42", "Q3107329"))
> ```
| Name | Description |
| ----------- | --------------------------------------------------------- |
| `entities` | The entity IDs. ~~Iterable[str]~~ |
| **RETURNS** | The entity vectors. ~~Iterable[Iterable[numpy.ndarray]]~~ |
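If vector retrieval is a bottleneck, one option is to cache all entity vectors
in a single matrix once the KB is populated and serve batched lookups with one
indexing operation. A sketch building on
[`InMemoryLookupKB`](/api/kb_in_memory) (`CachedVectorKB` and `build_cache()`
are illustrative, not spaCy API):

```python
import numpy
from spacy.kb import InMemoryLookupKB

class CachedVectorKB(InMemoryLookupKB):
    def build_cache(self):
        # Call once after all entities have been added.
        entity_ids = list(self.get_entity_strings())
        self._row = {entity: i for i, entity in enumerate(entity_ids)}
        self._matrix = numpy.vstack([self.get_vector(e) for e in entity_ids])

    def get_vectors(self, entities):
        # Gather all requested vectors with a single indexing operation.
        return self._matrix[numpy.asarray([self._row[e] for e in entities])]
```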
## KnowledgeBase.to_disk {#to_disk tag="method"}
@ -207,12 +161,13 @@ Save the current state of the knowledge base to a directory.
> #### Example
>
> ```python
> kb.to_disk(path)
> ```
| Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
## KnowledgeBase.from_disk {#from_disk tag="method"}
@ -222,16 +177,16 @@ Restore the state of the knowledge base from a given directory. Note that the
> #### Example
>
> ```python
> from spacy.vocab import Vocab
> vocab = Vocab().from_disk("/path/to/vocab")
> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
> kb.from_disk("/path/to/kb")
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------- |
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
## Candidate {#candidate tag="class"}

View File

@ -0,0 +1,302 @@
---
title: InMemoryLookupKB
teaser:
The default implementation of the KnowledgeBase interface. Stores all
information in-memory.
tag: class
source: spacy/kb/kb_in_memory.pyx
new: 3.5
---
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
implements all of its methods. It stores all KB data in-memory and generates
[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
entity names. It's highly optimized for both a low memory footprint and speed of
retrieval.
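The snippet below builds a tiny KB and queries it end to end; the entity ID,
frequency, vector, and prior probability are made-up values:

```python
import numpy
from spacy.lang.en import English
from spacy.kb import InMemoryLookupKB

nlp = English()
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=32, entity_vector=numpy.asarray([1.0, 2.0, 3.0]))
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.8])

doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
# The mention "Douglas Adams" matches the stored alias exactly:
candidates = kb.get_candidates(doc[0:2])
```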
## InMemoryLookupKB.\_\_init\_\_ {#init tag="method"}
Create the knowledge base.
> #### Example
>
> ```python
> from spacy.kb import InMemoryLookupKB
> vocab = nlp.vocab
> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
> ```
| Name | Description |
| ---------------------- | ------------------------------------------------ |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `entity_vector_length` | Length of the fixed-size entity vectors. ~~int~~ |
## InMemoryLookupKB.entity_vector_length {#entity_vector_length tag="property"}
The length of the fixed-size entity vectors in the knowledge base.
| Name | Description |
| ----------- | ------------------------------------------------ |
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
## InMemoryLookupKB.add_entity {#add_entity tag="method"}
Add an entity to the knowledge base, specifying its corpus frequency and entity
vector, which should be of length
[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
> #### Example
>
> ```python
> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
> ```
| Name | Description |
| --------------- | ---------------------------------------------------------- |
| `entity` | The unique entity identifier. ~~str~~ |
| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
## InMemoryLookupKB.set_entities {#set_entities tag="method"}
Define the full list of entities in the knowledge base, specifying the corpus
frequency and entity vector for each entity.
> #### Example
>
> ```python
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
> ```
| Name | Description |
| ------------- | ---------------------------------------------------------------- |
| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
## InMemoryLookupKB.add_alias {#add_alias tag="method"}
Add an alias or mention to the knowledge base, specifying its potential KB
identifiers and their prior probabilities. The entity identifiers should refer
to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
probabilities should not exceed 1. Note that an empty string cannot be used as
an alias.
> #### Example
>
> ```python
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
> ```
| Name | Description |
| --------------- | --------------------------------------------------------------------------------- |
| `alias` | The textual mention or alias. Cannot be the empty string. ~~str~~ |
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
## InMemoryLookupKB.\_\_len\_\_ {#len tag="method"}
Get the total number of entities in the knowledge base.
> #### Example
>
> ```python
> total_entities = len(kb)
> ```
| Name | Description |
| ----------- | ----------------------------------------------------- |
| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
## InMemoryLookupKB.get_entity_strings {#get_entity_strings tag="method"}
Get a list of all entity IDs in the knowledge base.
> #### Example
>
> ```python
> all_entities = kb.get_entity_strings()
> ```
| Name | Description |
| ----------- | --------------------------------------------------------- |
| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
## InMemoryLookupKB.get_size_aliases {#get_size_aliases tag="method"}
Get the total number of aliases in the knowledge base.
> #### Example
>
> ```python
> total_aliases = kb.get_size_aliases()
> ```
| Name | Description |
| ----------- | ---------------------------------------------------- |
| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
## InMemoryLookupKB.get_alias_strings {#get_alias_strings tag="method"}
Get a list of all aliases in the knowledge base.
> #### Example
>
> ```python
> all_aliases = kb.get_alias_strings()
> ```
| Name | Description |
| ----------- | -------------------------------------------------------- |
| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
## InMemoryLookupKB.get_candidates {#get_candidates tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate). Wraps
[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
> #### Example
>
> ```python
> from spacy.lang.en import English
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
> candidates = kb.get_candidates(doc[0:2])
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------- |
| `mention` | The textual mention or alias. ~~Span~~ |
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
## InMemoryLookupKB.get_candidates_batch {#get_candidates_batch tag="method"}
Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
will call `get_candidates_batch()` instead of `get_candidates()` if the config
parameter `candidates_batch_size` is greater than or equal to 1.
The default implementation of `get_candidates_batch()` executes
`get_candidates()` in a loop. We recommend implementing a more efficient way
to retrieve candidates for multiple mentions at once if performance is a
concern.
> #### Example
>
> ```python
> from spacy.lang.en import English
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
> candidates = kb.get_candidates_batch((doc[0:2], doc[3:]))
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------- |
| `mentions` | The textual mentions or aliases. ~~Iterable[Span]~~ |
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
## InMemoryLookupKB.get_alias_candidates {#get_alias_candidates tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate).
> #### Example
>
> ```python
> candidates = kb.get_alias_candidates("Douglas")
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ |
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
## InMemoryLookupKB.get_vector {#get_vector tag="method"}
Given a certain entity ID, retrieve its pretrained entity vector.
> #### Example
>
> ```python
> vector = kb.get_vector("Q42")
> ```
| Name | Description |
| ----------- | ------------------------------------ |
| `entity` | The entity ID. ~~str~~ |
| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
## InMemoryLookupKB.get_vectors {#get_vectors tag="method"}
Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
number of entity IDs.
The default implementation of `get_vectors()` executes `get_vector()` in a
loop. We recommend implementing a more efficient way to retrieve vectors for
multiple entities at once if performance is a concern.
> #### Example
>
> ```python
> vectors = kb.get_vectors(("Q42", "Q3107329"))
> ```
| Name | Description |
| ----------- | --------------------------------------------------------- |
| `entities` | The entity IDs. ~~Iterable[str]~~ |
| **RETURNS** | The entity vectors. ~~Iterable[Iterable[numpy.ndarray]]~~ |
## InMemoryLookupKB.get_prior_prob {#get_prior_prob tag="method"}
Given a certain entity ID and a certain textual mention, retrieve the prior
probability that the mention links to the entity.
> #### Example
>
> ```python
> probability = kb.get_prior_prob("Q42", "Douglas")
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------- |
| `entity` | The entity ID. ~~str~~ |
| `alias` | The textual mention or alias. ~~str~~ |
| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
## InMemoryLookupKB.to_disk {#to_disk tag="method"}
Save the current state of the knowledge base to a directory.
> #### Example
>
> ```python
> kb.to_disk(path)
> ```
| Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
## InMemoryLookupKB.from_disk {#from_disk tag="method"}
Restore the state of the knowledge base from a given directory. Note that the
[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
> #### Example
>
> ```python
> from spacy.vocab import Vocab
> vocab = Vocab().from_disk("/path/to/vocab")
> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
> kb.from_disk("/path/to/kb")
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------- |
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `InMemoryLookupKB` object. ~~InMemoryLookupKB~~ |

View File

@ -70,7 +70,7 @@ lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require
[`token.pos`](/api/token) from a previous pipeline component (see example
pipeline configurations in the
[pretrained pipeline design details](/models#design-cnn)) or rely on third-party
libraries (`pymorphy3`).
| Language | Default Mode |
| -------- | ------------ |
@ -86,9 +86,9 @@ libraries (`pymorphy2`).
| `nb` | `rule` |
| `nl` | `rule` |
| `pl` | `pos_lookup` |
| `ru` | `pymorphy3` |
| `sv` | `rule` |
| `uk` | `pymorphy3` |
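For Russian and Ukrainian this means installing the third-party analyzer first
(`pip install pymorphy3`, plus `pymorphy3-dicts-uk` for Ukrainian). A minimal
sketch of enabling the mode listed in the table above:

```python
# Assumes the analyzer is installed: pip install pymorphy3
import spacy

nlp = spacy.blank("ru")
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})
nlp.initialize()
```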
```python
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py

View File

@ -255,9 +255,10 @@ Return a copy of the span group.
> new_group = doc.spans["errors"].copy()
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------- |
| `doc` | The document to which the copy is bound. Defaults to `None` for the current doc. ~~Optional[Doc]~~ |
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
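The new `doc` argument lets you bind the copy to another `Doc`. A short sketch
(the text and the "errors" span group are made up):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Their going home")
doc.spans["errors"] = [doc[0:1], doc[1:2]]

# Copy the group and bind it to a clone of the original doc:
doc2 = doc.copy()
new_group = doc.spans["errors"].copy(doc=doc2)
```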
## SpanGroup.to_bytes {#to_bytes tag="method"}

View File

@ -78,7 +78,9 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
| Name | Description |
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
| [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
| [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
| [`Morphology`](/api/morphology) | Store morphological analyses and map them to and from hash values. |

View File

@ -243,6 +243,27 @@ pipelines.
> python -m spacy project run test . --vars.foo bar
> ```
> #### Tip: Environment Variables
>
> Commands in a project file are not executed in a shell, so they don't have
> direct access to environment variables. But you can insert environment
> variables using the `env` dictionary to make values available for
> interpolation, just like values in `vars`. Here's an example `env` dict that
> makes `$PATH` available as `ENV_PATH`:
>
> ```yaml
> env:
> ENV_PATH: PATH
> ```
>
> This can be used in a project command like so:
>
> ```yaml
> - name: "echo-path"
> script:
> - "echo ${env.ENV_PATH}"
> ```
| Section | Description |
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |

View File

@ -480,7 +480,7 @@ as-is. They are also excluded when calling
> parse. So the evaluation results should always reflect what your pipeline will
> produce at runtime. If you want a frozen component to run (without updating)
> during training as well, so that downstream components can use its
> **predictions**, you should add it to the list of
> [`annotating_components`](/usage/training#annotating-components).
```ini

View File

@ -374,8 +374,8 @@
"has_examples": true,
"dependencies": [
{
"name": "pymorphy2",
"url": "https://github.com/kmike/pymorphy2"
"name": "pymorphy3",
"url": "https://github.com/no-plagiarism/pymorphy3"
}
],
"models": [
@ -480,12 +480,12 @@
],
"dependencies": [
{
"name": "pymorphy2",
"url": "https://github.com/kmike/pymorphy2"
"name": "pymorphy3",
"url": "https://github.com/no-plagiarism/pymorphy3"
},
{
"name": "pymorphy2-dicts-uk",
"url": "https://github.com/kmike/pymorphy2-dicts/"
"name": "pymorphy3-dicts-uk",
"url": "https://github.com/no-plagiarism/pymorphy3-dicts"
}
]
},

View File

@ -1,5 +1,46 @@
{
"resources": [
{
"id": "spacy-cleaner",
"title": "spacy-cleaner",
"slogan": "Easily clean text with spaCy!",
"description": "**spacy-cleaner** utilises spaCy `Language` models to replace, remove, and \n mutate spaCy tokens. Cleaning actions available are:\n\n* Remove/replace stopwords.\n* Remove/replace punctuation.\n* Remove/replace numbers.\n* Remove/replace emails.\n* Remove/replace URLs.\n* Perform lemmatisation.\n\nSee our [docs](https://ce11an.github.io/spacy-cleaner/) for more information.",
"github": "Ce11an/spacy-cleaner",
"pip": "spacy-cleaner",
"code_example": [
"import spacy",
"import spacy_cleaner",
"from spacy_cleaner.processing import removers, replacers, mutators",
"",
"model = spacy.load(\"en_core_web_sm\")",
"pipeline = spacy_cleaner.Pipeline(",
" model,",
" removers.remove_stopword_token,",
" replacers.replace_punctuation_token,",
" mutators.mutate_lemma_token,",
")",
"",
"texts = [\"Hello, my name is Cellan! I love to swim!\"]",
"",
"pipeline.clean(texts)",
"# ['hello _IS_PUNCT_ Cellan _IS_PUNCT_ love swim _IS_PUNCT_']"
],
"code_language": "python",
"url": "https://ce11an.github.io/spacy-cleaner/",
"image": "https://raw.githubusercontent.com/Ce11an/spacy-cleaner/main/docs/assets/images/spacemen.png",
"author": "Cellan Hall",
"author_links": {
"twitter": "Ce11an",
"github": "Ce11an",
"website": "https://www.linkedin.com/in/cellan-hall/"
},
"category": [
"extension"
],
"tags": [
"text-processing"
]
},
{
"id": "Zshot",
"title": "Zshot",
@ -2460,20 +2501,20 @@
"import spacy",
"from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
"",
"# Load an spacy model (supported models are \"es\" and \"en\") ",
"nlp = spacy.load('en')",
"# Spacy 3.x",
"nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
"# Spacy 2.x",
"# Load a spaCy model (supported languages are \"es\" and \"en\") ",
"nlp = spacy.load('en_core_web_sm')",
"# spaCy 3.x",
"nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
"# spaCy 2.x",
"# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
"token = nlp('prices')[0]",
"",
"# wordnet object link spacy token with nltk wordnet interface by giving acces to",
"# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
"# synsets and lemmas ",
"token._.wordnet.synsets()",
"token._.wordnet.lemmas()",
"",
"# And automatically tags with wordnet domains",
"# And automatically add info about WordNet domains",
"token._.wordnet.wordnet_domains()"
],
"author": "recognai",