Merge remote-tracking branch 'upstream/v4' into feature/lookups-tables-url

This commit is contained in:
Adriane Boyd 2023-03-27 09:40:18 +02:00
commit 41037041f0
71 changed files with 1234 additions and 524 deletions

View File

@ -69,6 +69,11 @@ steps:
# displayName: 'Test skip re-download (#12188)' # displayName: 'Test skip re-download (#12188)'
# condition: eq(variables['python_version'], '3.8') # condition: eq(variables['python_version'], '3.8')
# - script: |
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
# displayName: 'Test download_url in info CLI'
# condition: eq(variables['python_version'], '3.8')
- script: | - script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI' displayName: 'Test convert CLI'

View File

@ -16,7 +16,7 @@ jobs:
with: with:
ref: ${{ github.head_ref }} ref: ${{ github.head_ref }}
- uses: actions/setup-python@v4 - uses: actions/setup-python@v4
- run: pip install black - run: pip install black -c requirements.txt
- name: Auto-format code if needed - name: Auto-format code if needed
run: black spacy run: black spacy
# We can't run black --check here because that returns a non-zero excit # We can't run black --check here because that returns a non-zero excit

View File

@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
Python modules. If you've built spaCy from source, you'll already have both Python modules. If you've built spaCy from source, you'll already have both
tools installed. tools installed.
As a general rule of thumb, we use f-strings for any formatting of strings.
One exception is calls to Python's `logging` functionality.
To avoid unnecessary string conversions in these cases, we use string formatting
templates with `%s`, `%d`, etc.
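A minimal illustration of this convention, assuming a module-level logger (example names are placeholders):

import logging

logger = logging.getLogger(__name__)
component = "tok2vec"  # example value

# General string formatting: use f-strings.
message = f"Initializing component '{component}'"

# Exception: logging calls use %-style templates, so the string is only
# interpolated if the log level is actually enabled.
logger.debug("Initializing component '%s'", component)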
**⚠️ Note that formatting and linting is currently only possible for Python **⚠️ Note that formatting and linting is currently only possible for Python
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**

View File

@ -41,7 +41,7 @@ jobs:
inputs: inputs:
versionSpec: "3.8" versionSpec: "3.8"
- script: | - script: |
pip install black==22.3.0 pip install black -c requirements.txt
python -m black spacy --check python -m black spacy --check
displayName: "black" displayName: "black"
- script: | - script: |

View File

@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0 mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0 flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0 hypothesis>=3.27.0,<7.0.0
mypy>=0.990,<0.1000; platform_machine != "aarch64" mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1 types-mock>=0.1.1
types-setuptools>=57.0.0 types-setuptools>=57.0.0
types-requests types-requests
types-setuptools>=57.0.0 types-setuptools>=57.0.0
black>=22.0,<23.0 black==22.3.0

View File

@ -90,9 +90,9 @@ def parse_config_overrides(
cli_overrides = _parse_overrides(args, is_cli=True) cli_overrides = _parse_overrides(args, is_cli=True)
if cli_overrides: if cli_overrides:
keys = [k for k in cli_overrides if k not in env_overrides] keys = [k for k in cli_overrides if k not in env_overrides]
logger.debug(f"Config overrides from CLI: {keys}") logger.debug("Config overrides from CLI: %s", keys)
if env_overrides: if env_overrides:
logger.debug(f"Config overrides from env variables: {list(env_overrides)}") logger.debug("Config overrides from env variables: %s", list(env_overrides))
return {**cli_overrides, **env_overrides} return {**cli_overrides, **env_overrides}

View File

@ -1,10 +1,10 @@
from typing import Optional, Dict, Any, Union, List from typing import Optional, Dict, Any, Union, List
import platform import platform
import pkg_resources
import json import json
from pathlib import Path from pathlib import Path
from wasabi import Printer, MarkdownRenderer from wasabi import Printer, MarkdownRenderer
import srsly import srsly
import importlib.metadata
from ._util import app, Arg, Opt, string_to_list from ._util import app, Arg, Opt, string_to_list
from .download import get_model_filename, get_latest_version from .download import get_model_filename, get_latest_version
@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
dist-info available. dist-info available.
""" """
try: try:
dist = pkg_resources.get_distribution(model) dist = importlib.metadata.distribution(model)
data = json.loads(dist.get_metadata("direct_url.json")) text = dist.read_text("direct_url.json")
return data["url"] if isinstance(text, str):
except pkg_resources.DistributionNotFound: data = json.loads(text)
# no such package return data["url"]
return None
except Exception: except Exception:
# something else, like no file or invalid JSON pass
return None return None
def info_model_url(model: str) -> Dict[str, Any]: def info_model_url(model: str) -> Dict[str, Any]:
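For reference, a standalone sketch of the importlib.metadata pattern used here, with a hypothetical helper name (get_direct_url):

import importlib.metadata
import json
from typing import Optional

def get_direct_url(package: str) -> Optional[str]:
    # Read the PEP 610 direct_url.json metadata for an installed package,
    # if present, and return its "url" field; otherwise return None.
    try:
        dist = importlib.metadata.distribution(package)
        text = dist.read_text("direct_url.json")
        if isinstance(text, str):
            return json.loads(text)["url"]
    except Exception:
        pass
    return None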

View File

@ -252,7 +252,7 @@ def get_third_party_dependencies(
raise regerr from None raise regerr from None
module_name = func_info.get("module") # type: ignore[attr-defined] module_name = func_info.get("module") # type: ignore[attr-defined]
if module_name: # the code is part of a module, not a --code file if module_name: # the code is part of a module, not a --code file
modules.add(func_info["module"].split(".")[0]) # type: ignore[index] modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
dependencies = [] dependencies = []
for module_name in modules: for module_name in modules:
if module_name in distributions: if module_name in distributions:

View File

@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# in the list. # in the list.
while commands: while commands:
for i, cmd in enumerate(list(commands)): for i, cmd in enumerate(list(commands)):
logger.debug(f"CMD: {cmd['name']}.") logger.debug("CMD: %s.", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])] deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps): if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"]) cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []): for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash) url = storage.pull(output_path, command_hash=cmd_hash)
logger.debug( logger.debug(
f"URL: {url} for {output_path} with command hash {cmd_hash}" "URL: %s for %s with command hash %s",
url,
output_path,
cmd_hash,
) )
yield url, output_path yield url, output_path
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
commands.pop(i) commands.pop(i)
break break
else: else:
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.") logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
else: else:
# If we didn't break the for loop, break the while loop. # If we didn't break the for loop, break the while loop.
break break

View File

@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
remote = config["remotes"][remote] remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote) storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []): for cmd in config.get("commands", []):
logger.debug(f"CMD: cmd['name']") logger.debug("CMD: %s", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])] deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps): if any(not dep.exists() for dep in deps):
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs") logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
continue continue
cmd_hash = get_command_hash( cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
) )
logger.debug(f"CMD_HASH: {cmd_hash}") logger.debug("CMD_HASH: %s", cmd_hash)
for output_path in cmd.get("outputs", []): for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path output_loc = project_dir / output_path
if output_loc.exists() and _is_not_empty_dir(output_loc): if output_loc.exists() and _is_not_empty_dir(output_loc):
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
content_hash=get_content_hash(output_loc), content_hash=get_content_hash(output_loc),
) )
logger.debug( logger.debug(
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
) )
yield output_path, url yield output_path, url

View File

@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path import os.path
from pathlib import Path from pathlib import Path
import pkg_resources
from wasabi import msg from wasabi import msg
from wasabi.util import locale_escape from wasabi.util import locale_escape
import sys import sys
@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
exist. exist.
""" """
import pkg_resources
failed_pkgs_msgs: List[str] = [] failed_pkgs_msgs: List[str] = []
conflicting_pkgs_msgs: List[str] = [] conflicting_pkgs_msgs: List[str] = []

View File

@ -84,7 +84,7 @@ class Warnings(metaclass=ErrorsWithCodes):
"ignoring the duplicate entry.") "ignoring the duplicate entry.")
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
"incorrect. Modify PhraseMatcher._terminal_hash to fix.") "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
"the Knowledge Base.") "the Knowledge Base.")
W026 = ("Unable to set all sentence boundaries from dependency parses. If " W026 = ("Unable to set all sentence boundaries from dependency parses. If "
"you are constructing a parse tree incrementally by setting " "you are constructing a parse tree incrementally by setting "
@ -212,7 +212,11 @@ class Warnings(metaclass=ErrorsWithCodes):
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
# v4 warning strings
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
"lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
"to return `True` in `.supports_prior_probs`.")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):
@ -440,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes):
E133 = ("The sum of prior probabilities for alias '{alias}' should not " E133 = ("The sum of prior probabilities for alias '{alias}' should not "
"exceed 1, but found {sum}.") "exceed 1, but found {sum}.")
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E139 = ("Knowledge base for component '{name}' is empty. Use the methods " E139 = ("Knowledge base for component '{name}' is empty.")
"`kb.add_entity` and `kb.add_alias` to add entries.")
E140 = ("The list of entities, prior probabilities and entity vectors " E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.") "should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the " E141 = ("Entity vectors should be of length {required} instead of the "
@ -954,7 +957,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
"with `displacy.serve(doc, port=port)`") "with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_switch_port=True` to pick an available port automatically.") "or use `auto_select_port=True` to pick an available port automatically.")
# v4 error strings # v4 error strings
E4000 = ("Expected a Doc as input, but got: '{type}'") E4000 = ("Expected a Doc as input, but got: '{type}'")
@ -964,7 +967,9 @@ class Errors(metaclass=ErrorsWithCodes):
E4003 = ("Training examples for distillation must have the exact same tokens in the " E4003 = ("Training examples for distillation must have the exact same tokens in the "
"reference and predicted docs.") "reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.") E4004 = ("Backprop is not supported when is_train is not set.")
E4005 = ("Required lemmatizer table(s) {missing_tables} not found in " E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
E4007 = ("Required lemmatizer table(s) {missing_tables} not found in "
"[initialize] or in registered lookups (spacy-lookups-data). An " "[initialize] or in registered lookups (spacy-lookups-data). An "
"example for how to load lemmatizer tables in [initialize]:\n\n" "example for how to load lemmatizer tables in [initialize]:\n\n"
"[initialize.components]\n\n" "[initialize.components]\n\n"
@ -975,7 +980,8 @@ class Errors(metaclass=ErrorsWithCodes):
f'url = "{about.__lookups_url__}"\n' f'url = "{about.__lookups_url__}"\n'
"tables = {tables}\n" "tables = {tables}\n"
"# or required tables only: tables = {required_tables}\n") "# or required tables only: tables = {required_tables}\n")
E4006 = ("Server error ({status_code}), couldn't fetch {url}") E4008 = ("Server error ({status_code}), couldn't fetch {url}")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

View File

@ -1,3 +1,5 @@
from .kb import KnowledgeBase from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch from .candidate import Candidate, InMemoryCandidate
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

View File

@ -1,12 +1,15 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector from libcpp.vector cimport vector
from .kb_in_memory cimport InMemoryLookupKB
from ..typedefs cimport hash_t from ..typedefs cimport hash_t
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate: cdef class Candidate:
cdef readonly KnowledgeBase kb pass
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector cdef class InMemoryCandidate(Candidate):
cdef hash_t alias_hash cdef readonly hash_t _entity_hash
cdef float prior_prob cdef readonly hash_t _alias_hash
cpdef vector[float] _entity_vector
cdef float _prior_prob
cdef readonly InMemoryLookupKB _kb
cdef float _entity_freq

View File

@ -1,74 +1,96 @@
# cython: infer_types=True, profile=True # cython: infer_types=True, profile=True
from typing import Iterable from .kb_in_memory cimport InMemoryLookupKB
from .kb cimport KnowledgeBase from ..errors import Errors
from ..tokens import Span
cdef class Candidate: cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved """A `Candidate` object refers to a textual mention that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking to a specific entity from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one. algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability. Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init DOCS: https://spacy.io/api/kb/#candidate-init
""" """
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): def __init__(self):
self.kb = kb # Make sure abstract Candidate is not instantiated.
self.entity_hash = entity_hash if self.__class__ == Candidate:
self.entity_freq = entity_freq raise TypeError(
self.entity_vector = entity_vector Errors.E1046.format(cls_name=self.__class__.__name__)
self.alias_hash = alias_hash )
self.prior_prob = prior_prob
@property @property
def entity(self) -> int: def entity_id(self) -> int:
"""RETURNS (uint64): hash of the entity's KB ID/name""" """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
return self.entity_hash otherwise the hash of the entity ID string)."""
raise NotImplementedError
@property @property
def entity_(self) -> str: def entity_id_(self) -> str:
"""RETURNS (str): ID/name of this entity in the KB""" """RETURNS (str): String representation of entity ID."""
return self.kb.vocab.strings[self.entity_hash] raise NotImplementedError
@property @property
def alias(self) -> int: def entity_vector(self) -> vector[float]:
"""RETURNS (uint64): hash of the alias""" """RETURNS (vector[float]): Entity vector."""
return self.alias_hash raise NotImplementedError
cdef class InMemoryCandidate(Candidate):
"""Candidate for InMemoryLookupKB."""
def __init__(
self,
kb: InMemoryLookupKB,
entity_hash: int,
alias_hash: int,
entity_vector: vector[float],
prior_prob: float,
entity_freq: float
):
"""
kb (InMemoryLookupKB): InMemoryLookupKB instance.
entity_hash (int): Entity ID as a hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__().
entity_freq (int): Entity frequency in KB corpus.
entity_vector (List[float]): Entity embedding.
alias_hash (int): Alias hash.
prior_prob (float): Prior probability of the entity for this alias, i.e. the probability that, independent of
the context, this alias resolves to this entity.
"""
super().__init__()
self._entity_hash = entity_hash
self._entity_vector = entity_vector
self._prior_prob = prior_prob
self._kb = kb
self._alias_hash = alias_hash
self._entity_freq = entity_freq
@property @property
def alias_(self) -> str: def entity_id(self) -> int:
"""RETURNS (str): ID of the original alias""" return self._entity_hash
return self.kb.vocab.strings[self.alias_hash]
@property @property
def entity_freq(self) -> float: def entity_vector(self) -> vector[float]:
return self.entity_freq return self._entity_vector
@property
def entity_vector(self) -> Iterable[float]:
return self.entity_vector
@property @property
def prior_prob(self) -> float: def prior_prob(self) -> float:
return self.prior_prob """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
this entity."""
return self._prior_prob
@property
def alias(self) -> str:
"""RETURNS (str): Alias."""
return self._kb.vocab.strings[self._alias_hash]
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: @property
""" def entity_id_(self) -> str:
Return candidate entities for a given mention and fetching appropriate entries from the index. return self._kb.vocab.strings[self._entity_hash]
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
return kb.get_candidates(mention)
@property
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: def entity_freq(self) -> float:
""" """RETURNS (float): Entity frequency in KB corpus."""
Return candidate entities for the given mentions and fetching appropriate entries from the index. return self._entity_freq
kb (KnowledgeBase): Knowledge base to query.
mention (Iterable[Span]): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return kb.get_candidates_batch(mentions)
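A rough usage sketch of the refactored candidate API (spaCy v4 branch assumed): candidates returned by an InMemoryLookupKB are InMemoryCandidate objects exposing entity_id_, alias, prior_prob and entity_freq.

import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q1", freq=10, entity_vector=[1.0, 0.0, 0.0])
kb.add_alias(alias="one", entities=["Q1"], probabilities=[0.9])

doc = nlp("one")
for candidate in kb.get_candidates(doc[0:1]):
    # entity_id_ is the string entity ID, prior_prob the alias->entity probability
    print(candidate.entity_id_, candidate.alias, candidate.prior_prob)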

View File

@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from .candidate import Candidate from .candidate import Candidate
from ..tokens import Span from ..tokens import Span, SpanGroup
from ..util import SimpleFrozenList from ..util import SimpleFrozenList
from ..errors import Errors from ..errors import Errors
@ -30,21 +30,23 @@ cdef class KnowledgeBase:
self.entity_vector_length = entity_vector_length self.entity_vector_length = entity_vector_length
self.mem = Pool() self.mem = Pool()
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
""" """
Return candidate entities for specified texts. Each candidate defines the entity, the original alias, Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
and the prior probability of that alias resolving to that entity. entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
If no candidate is found for a given text, an empty list is returned. probability of the specified mention text resolving to that entity - might be included.
mentions (Iterable[Span]): Mentions for which to get candidates. If no candidates are found for a given mention, an empty list is returned.
mentions (SpanGroup): Mentions for which to get candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
""" """
return [self.get_candidates(span) for span in mentions] return [self.get_candidates(span) for span in mentions]
def get_candidates(self, mention: Span) -> Iterable[Candidate]: def get_candidates(self, mention: Span) -> Iterable[Candidate]:
""" """
Return candidate entities for specified text. Each candidate defines the entity, the original alias, Return candidate entities for a specific mention. Each candidate defines at least the entity and the
and the prior probability of that alias resolving to that entity. entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
If the no candidate is found for a given text, an empty list is returned. probability of the specified mention text resolving to that entity - might be included.
If no candidate is found for the given mention, an empty list is returned.
mention (Span): Mention for which to get candidates. mention (Span): Mention for which to get candidates.
RETURNS (Iterable[Candidate]): Identified candidates. RETURNS (Iterable[Candidate]): Identified candidates.
""" """
@ -106,3 +108,10 @@ cdef class KnowledgeBase:
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
) )
@property
def supports_prior_probs(self) -> bool:
"""RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
)
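A hedged sketch of what a custom KB honoring the extended base API might look like (class name and method bodies are placeholders, only the hooks relevant to this hunk are shown):

from typing import Iterable
from spacy.kb import KnowledgeBase

class MyRemoteKB(KnowledgeBase):
    def get_candidates(self, mention) -> Iterable:
        # Look up candidates for a single Span mention.
        return []

    def get_candidates_batch(self, mentions) -> Iterable[Iterable]:
        # mentions is a SpanGroup; reuse the per-mention lookup.
        return [self.get_candidates(span) for span in mentions]

    @property
    def supports_prior_probs(self) -> bool:
        # This KB does not store prior probabilities.
        return False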

View File

@ -18,7 +18,7 @@ from .. import util
from ..util import SimpleFrozenList, ensure_path from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab from ..vocab cimport Vocab
from .kb cimport KnowledgeBase from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate from .candidate import InMemoryCandidate
cdef class InMemoryLookupKB(KnowledgeBase): cdef class InMemoryLookupKB(KnowledgeBase):
@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._alias_index = PreshMap(nr_aliases + 1) self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1)
def is_empty(self):
return len(self) == 0
def __len__(self): def __len__(self):
return self.get_size_entities() return self.get_size_entities()
@ -223,10 +226,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_entry.probs = probs alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry self._aliases_table[alias_index] = alias_entry
def get_candidates(self, mention: Span) -> Iterable[Candidate]: def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
return self.get_alias_candidates(mention.text) # type: ignore return self._get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[Candidate]: def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
""" """
Return candidate entities for an alias. Each candidate defines the entity, the original alias, Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity. and the prior probability of that alias resolving to that entity.
@ -238,14 +241,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_index = <int64_t>self._alias_index.get(alias_hash) alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
return [Candidate(kb=self, return [
entity_hash=self._entries[entry_index].entity_hash, InMemoryCandidate(
entity_freq=self._entries[entry_index].freq, kb=self,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index], entity_hash=self._entries[entry_index].entity_hash,
alias_hash=alias_hash, alias_hash=alias_hash,
prior_prob=prior_prob) entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) prior_prob=prior_prob,
if entry_index != 0] entity_freq=self._entries[entry_index].freq
)
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0
]
def get_vector(self, str entity): def get_vector(self, str entity):
cdef hash_t entity_hash = self.vocab.strings[entity] cdef hash_t entity_hash = self.vocab.strings[entity]
@ -276,6 +283,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return 0.0 return 0.0
def supports_prior_probs(self) -> bool:
return True
def to_bytes(self, **kwargs): def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
""" """

View File

@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
class SwedishDefaults(BaseDefaults): class SwedishDefaults(BaseDefaults):

View File

@ -0,0 +1,33 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
]
)
_suffixes = [
suffix
for suffix in TOKENIZER_SUFFIXES
if suffix not in ["'s", "'S", "s", "S", r"\'"]
]
_suffixes += [r"(?<=[^sSxXzZ])\'"]
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
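A small, hedged illustration of the intent of the custom suffix rule above (actual tokenization also depends on the remaining suffix rules and any tokenizer exceptions):

import spacy

nlp = spacy.blank("sv")
# A trailing apostrophe after s/S/x/X/z/Z stays attached (e.g. genitive "Lars'"),
# otherwise it is split off as its own token.
print([t.text for t in nlp("Lars' bok")])
print([t.text for t in nlp("hon sa'")])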

View File

@ -2065,7 +2065,7 @@ class Language:
pipe = self.get_pipe(pipe_name) pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name] pipe_cfg = self._pipe_configs[pipe_name]
if listeners: if listeners:
util.logger.debug(f"Replacing listeners of component '{pipe_name}'") util.logger.debug("Replacing listeners of component '%s'", pipe_name)
if len(list(listeners)) != len(pipe_listeners): if len(list(listeners)) != len(pipe_listeners):
# The number of listeners defined in the component model doesn't # The number of listeners defined in the component model doesn't
# match the listeners to replace, so we won't be able to update # match the listeners to replace, so we won't be able to update

View File

@ -30,7 +30,7 @@ def load_lookups_data_from_url(lang, tables, url):
r = requests.get(table_url) r = requests.get(table_url)
if r.status_code != 200: if r.status_code != 200:
raise ValueError( raise ValueError(
Errors.E4006.format(status_code=r.status_code, url=table_url) Errors.E4008.format(status_code=r.status_code, url=table_url)
) )
table_data = r.json() table_data = r.json()
lookups.add_table(table, table_data) lookups.add_table(table, table_data)

View File

@ -82,8 +82,12 @@ cdef class DependencyMatcher:
"$-": self._imm_left_sib, "$-": self._imm_left_sib,
"$++": self._right_sib, "$++": self._right_sib,
"$--": self._left_sib, "$--": self._left_sib,
">+": self._imm_right_child,
">-": self._imm_left_child,
">++": self._right_child, ">++": self._right_child,
">--": self._left_child, ">--": self._left_child,
"<+": self._imm_right_parent,
"<-": self._imm_left_parent,
"<++": self._right_parent, "<++": self._right_parent,
"<--": self._left_parent, "<--": self._left_parent,
} }
@ -427,12 +431,34 @@ cdef class DependencyMatcher:
def _left_sib(self, doc, node): def _left_sib(self, doc, node):
return [doc[child.i] for child in doc[node].head.children if child.i < node] return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _imm_right_child(self, doc, node):
for child in doc[node].children:
if child.i == node + 1:
return [doc[child.i]]
return []
def _imm_left_child(self, doc, node):
for child in doc[node].children:
if child.i == node - 1:
return [doc[child.i]]
return []
def _right_child(self, doc, node): def _right_child(self, doc, node):
return [doc[child.i] for child in doc[node].children if child.i > node] return [doc[child.i] for child in doc[node].children if child.i > node]
def _left_child(self, doc, node): def _left_child(self, doc, node):
return [doc[child.i] for child in doc[node].children if child.i < node] return [doc[child.i] for child in doc[node].children if child.i < node]
def _imm_right_parent(self, doc, node):
if doc[node].head.i == node + 1:
return [doc[node].head]
return []
def _imm_left_parent(self, doc, node):
if doc[node].head.i == node - 1:
return [doc[node].head]
return []
def _right_parent(self, doc, node): def _right_parent(self, doc, node):
if doc[node].head.i > node: if doc[node].head.i > node:
return [doc[node].head] return [doc[node].head]
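A hedged usage sketch for one of the new operators (">+", the anchor's immediate right child); the pipeline name is an assumption:

import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # assumed installed pipeline
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">+",  # immediate right child of the anchor token
        "RIGHT_ID": "next_child",
        "RIGHT_ATTRS": {},
    },
]
matcher.add("IMM_RIGHT_CHILD", [pattern])
doc = nlp("She quickly wrote letters.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])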

View File

@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store):
return attr_values return attr_values
def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
# tuple order affects performance
return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
# These predicate helper classes are used to match the REGEX, IN, >= etc # These predicate helper classes are used to match the REGEX, IN, >= etc
# extensions to the matcher introduced in #3173. # extensions to the matcher introduced in #3173.
@ -848,7 +853,7 @@ class _FuzzyPredicate:
fuzz = self.predicate[len("FUZZY"):] # number after prefix fuzz = self.predicate[len("FUZZY"):] # number after prefix
self.fuzzy = int(fuzz) if fuzz else -1 self.fuzzy = int(fuzz) if fuzz else -1
self.fuzzy_compare = fuzzy_compare self.fuzzy_compare = fuzzy_compare
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
def __call__(self, Token token): def __call__(self, Token token):
if self.is_extension: if self.is_extension:
@ -870,7 +875,7 @@ class _RegexPredicate:
self.value = re.compile(value) self.value = re.compile(value)
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = _predicate_cache_key(self.attr, self.predicate, value)
if self.predicate not in self.operators: if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -906,7 +911,7 @@ class _SetPredicate:
self.value = set(get_string_id(v) for v in value) self.value = set(get_string_id(v) for v in value)
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
if self.predicate not in self.operators: if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -978,7 +983,7 @@ class _ComparisonPredicate:
self.value = value self.value = value
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = _predicate_cache_key(self.attr, self.predicate, value)
if self.predicate not in self.operators: if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
if isinstance(value, dict): if isinstance(value, dict):
for type_, cls in predicate_types.items(): for type_, cls in predicate_types.items():
if type_ in value: if type_ in value:
key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) key = _predicate_cache_key(attr, type_, value[type_])
if key in seen_predicates: if key in seen_predicates:
output.append(seen_predicates[key]) output.append(seen_predicates[key])
else: else:

View File

@ -6,9 +6,9 @@ from thinc.api import Model, Maxout, Linear, tuplify, Ragged
from ...util import registry from ...util import registry
from ...kb import KnowledgeBase, InMemoryLookupKB from ...kb import KnowledgeBase, InMemoryLookupKB
from ...kb import Candidate, get_candidates, get_candidates_batch from ...kb import Candidate
from ...vocab import Vocab from ...vocab import Vocab
from ...tokens import Span, Doc from ...tokens import Doc, Span, SpanGroup
from ..extract_spans import extract_spans from ..extract_spans import extract_spans
from ...errors import Errors from ...errors import Errors
@ -89,6 +89,14 @@ def load_kb(
return kb_from_file return kb_from_file
@registry.misc("spacy.EmptyKB.v2")
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
return empty_kb_factory
@registry.misc("spacy.EmptyKB.v1") @registry.misc("spacy.EmptyKB.v1")
def empty_kb( def empty_kb(
entity_vector_length: int, entity_vector_length: int,
@ -106,6 +114,28 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
@registry.misc("spacy.CandidateBatchGenerator.v1") @registry.misc("spacy.CandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[ def create_candidates_batch() -> Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
]: ]:
return get_candidates_batch return get_candidates_batch
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for a given mention, fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
return kb.get_candidates(mention)
def get_candidates_batch(
kb: KnowledgeBase, mentions: SpanGroup
) -> Iterable[Iterable[Candidate]]:
"""
Return candidate entities for the given mentions, fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mentions (SpanGroup): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return kb.get_candidates_batch(mentions)
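A brief sketch of resolving the newly registered "spacy.EmptyKB.v2" factory directly from the registry (the vector length 64 is an arbitrary example value):

import spacy
from spacy.util import registry

nlp = spacy.blank("en")
create_empty_kb = registry.misc.get("spacy.EmptyKB.v2")()
kb = create_empty_kb(nlp.vocab, 64)  # vocab, entity_vector_length
print(type(kb).__name__, len(kb))  # an empty InMemoryLookupKB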

View File

@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
cdef np.ndarray step_actions cdef np.ndarray step_actions
scores = [] scores = []
while sizes.states >= 1: while sizes.states >= 1 and (actions is None or len(actions) > 0):
step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f") step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
step_actions = actions[0] if actions is not None else None step_actions = actions[0] if actions is not None else None
assert step_actions is None or step_actions.size == sizes.states, \
f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
with nogil: with nogil:
_predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes) _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
if actions is None: if actions is None:

View File

@ -1,5 +1,5 @@
from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any import warnings
from typing import cast from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
from numpy import dtype from numpy import dtype
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
from pathlib import Path from pathlib import Path
@ -10,14 +10,15 @@ from thinc.api import CosineDistance, Model, Optimizer, Config
from thinc.api import set_dropout_rate from thinc.api import set_dropout_rate
from ..kb import KnowledgeBase, Candidate from ..kb import KnowledgeBase, Candidate
from ..ml import empty_kb
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..ml import empty_kb
from ..tokens import Doc, Span, SpanGroup
from .pipe import deserialize_config from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
from ..language import Language from ..language import Language
from ..vocab import Vocab from ..vocab import Vocab
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors from ..errors import Errors, Warnings
from ..util import SimpleFrozenList, registry from ..util import SimpleFrozenList, registry
from .. import util from .. import util
from ..scorer import Scorer from ..scorer import Scorer
@ -58,6 +59,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"overwrite": False, "overwrite": False,
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True, "use_gold_ents": True,
"candidates_batch_size": 1, "candidates_batch_size": 1,
@ -82,8 +84,9 @@ def make_entity_linker(
entity_vector_length: int, entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[ get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
], ],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool, overwrite: bool,
scorer: Optional[Callable], scorer: Optional[Callable],
use_gold_ents: bool, use_gold_ents: bool,
@ -104,8 +107,9 @@ def make_entity_linker(
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch ( get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method. scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations. component must provide entity annotations.
@ -114,28 +118,9 @@ def make_entity_linker(
prediction is discarded. If None, predictions are not filtered by any threshold. prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating. save_activations (bool): save model activations in Doc when annotating.
""" """
if not model.attrs.get("include_span_maker", False): if not model.attrs.get("include_span_maker", False):
try: raise ValueError(Errors.E4005)
from spacy_legacy.components.entity_linker import EntityLinker_v1
except:
raise ImportError(
"In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
)
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
model,
name,
labels_discard=labels_discard,
n_sents=n_sents,
incl_prior=incl_prior,
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
overwrite=overwrite,
scorer=scorer,
)
return EntityLinker( return EntityLinker(
nlp.vocab, nlp.vocab,
model, model,
@ -147,6 +132,7 @@ def make_entity_linker(
entity_vector_length=entity_vector_length, entity_vector_length=entity_vector_length,
get_candidates=get_candidates, get_candidates=get_candidates,
get_candidates_batch=get_candidates_batch, get_candidates_batch=get_candidates_batch,
generate_empty_kb=generate_empty_kb,
overwrite=overwrite, overwrite=overwrite,
scorer=scorer, scorer=scorer,
use_gold_ents=use_gold_ents, use_gold_ents=use_gold_ents,
@ -186,8 +172,9 @@ class EntityLinker(TrainablePipe):
entity_vector_length: int, entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[ get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
], ],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool = False, overwrite: bool = False,
scorer: Optional[Callable] = entity_linker_score, scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool, use_gold_ents: bool,
@ -209,9 +196,10 @@ class EntityLinker(TrainablePipe):
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch ( get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
Iterable[Candidate]] Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
overwrite (bool): Whether to overwrite existing non-empty annotations. overwrite (bool): Whether to overwrite existing non-empty annotations.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
@ -219,6 +207,7 @@ class EntityLinker(TrainablePipe):
candidates_batch_size (int): Size of batches for entity candidate generation. candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold. threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/entitylinker#init DOCS: https://spacy.io/api/entitylinker#init
""" """
@ -235,6 +224,7 @@ class EntityLinker(TrainablePipe):
self.model = model self.model = model
self.name = name self.name = name
self.labels_discard = list(labels_discard) self.labels_discard = list(labels_discard)
# how many neighbour sentences to take into account
self.n_sents = n_sents self.n_sents = n_sents
self.incl_prior = incl_prior self.incl_prior = incl_prior
self.incl_context = incl_context self.incl_context = incl_context
@ -242,9 +232,7 @@ class EntityLinker(TrainablePipe):
self.get_candidates_batch = get_candidates_batch self.get_candidates_batch = get_candidates_batch
self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False) self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account self.kb = generate_empty_kb(self.vocab, entity_vector_length)
# create an empty KB by default
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer self.scorer = scorer
self.use_gold_ents = use_gold_ents self.use_gold_ents = use_gold_ents
self.candidates_batch_size = candidates_batch_size self.candidates_batch_size = candidates_batch_size
@ -253,6 +241,8 @@ class EntityLinker(TrainablePipe):
if candidates_batch_size < 1: if candidates_batch_size < 1:
raise ValueError(Errors.E1044) raise ValueError(Errors.E1044)
if self.incl_prior and not self.kb.supports_prior_probs:
warnings.warn(Warnings.W401)
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will """Define the KB of this pipe by providing a function that will
@ -266,7 +256,7 @@ class EntityLinker(TrainablePipe):
# Raise an error if the knowledge base is not initialized. # Raise an error if the knowledge base is not initialized.
if self.kb is None: if self.kb is None:
raise ValueError(Errors.E1018.format(name=self.name)) raise ValueError(Errors.E1018.format(name=self.name))
if len(self.kb) == 0: if hasattr(self.kb, "is_empty") and self.kb.is_empty():
raise ValueError(Errors.E139.format(name=self.name)) raise ValueError(Errors.E139.format(name=self.name))
def initialize( def initialize(
@ -485,7 +475,8 @@ class EntityLinker(TrainablePipe):
batch_candidates = list( batch_candidates = list(
self.get_candidates_batch( self.get_candidates_batch(
self.kb, [ent_batch[idx] for idx in valid_ent_idx] self.kb,
SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
) )
if self.candidates_batch_size > 1 if self.candidates_batch_size > 1
else [ else [
@ -535,18 +526,19 @@ class EntityLinker(TrainablePipe):
) )
elif len(candidates) == 1 and self.threshold is None: elif len(candidates) == 1 and self.threshold is None:
# shortcut for efficiency reasons: take the 1 candidate # shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_) final_kb_ids.append(candidates[0].entity_id_)
self._add_activations( self._add_activations(
doc_scores=doc_scores, doc_scores=doc_scores,
doc_ents=doc_ents, doc_ents=doc_ents,
scores=[1.0], scores=[1.0],
ents=[candidates[0].entity_], ents=[candidates[0].entity_id],
) )
else: else:
random.shuffle(candidates) random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False # set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray([c.prior_prob for c in candidates]) if self.incl_prior and self.kb.supports_prior_probs:
if not self.incl_prior: prior_probs = xp.asarray([c.prior_prob for c in candidates]) # type: ignore
else:
prior_probs = xp.asarray([0.0 for _ in candidates]) prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs scores = prior_probs
# add in similarity from the context # add in similarity from the context
@ -570,7 +562,7 @@ class EntityLinker(TrainablePipe):
raise ValueError(Errors.E161) raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims) scores = prior_probs + sims - (prior_probs * sims)
final_kb_ids.append( final_kb_ids.append(
candidates[scores.argmax().item()].entity_ candidates[scores.argmax().item()].entity_id_
if self.threshold is None if self.threshold is None
or scores.max() >= self.threshold or scores.max() >= self.threshold
else EntityLinker.NIL else EntityLinker.NIL
@ -579,7 +571,7 @@ class EntityLinker(TrainablePipe):
doc_scores=doc_scores, doc_scores=doc_scores,
doc_ents=doc_ents, doc_ents=doc_ents,
scores=scores, scores=scores,
ents=[c.entity for c in candidates], ents=[c.entity_id for c in candidates],
) )
self._add_doc_activations( self._add_doc_activations(
docs_scores=docs_scores, docs_scores=docs_scores,
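A hedged end-to-end sketch of configuring the component with the new generate_empty_kb setting and then installing a populated KB via set_kb (vector length and KB contents are placeholders):

import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe(
    "entity_linker",
    config={
        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
        "incl_prior": True,
    },
)

def create_kb(vocab):
    kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
    # add entities and aliases here before training/inference
    return kb

entity_linker.set_kb(create_kb)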

View File

@ -167,7 +167,7 @@ class Lemmatizer(Pipe):
missing_tables = set(required_tables) - set(lookups.tables) missing_tables = set(required_tables) - set(lookups.tables)
if len(missing_tables) > 0: if len(missing_tables) > 0:
raise ValueError( raise ValueError(
Errors.E4005.format( Errors.E4007.format(
missing_tables=list(missing_tables), missing_tables=list(missing_tables),
pipe_name=self.name, pipe_name=self.name,
required_tables=srsly.json_dumps(required_tables), required_tables=srsly.json_dumps(required_tables),

View File

@ -1,5 +1,6 @@
from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
from thinc.api import Model, set_dropout_rate, Optimizer, Config from thinc.api import Model, set_dropout_rate, Optimizer, Config
from thinc.types import Floats2d
from itertools import islice from itertools import islice
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
@ -157,39 +158,9 @@ class Tok2Vec(TrainablePipe):
DOCS: https://spacy.io/api/tok2vec#update DOCS: https://spacy.io/api/tok2vec#update
""" """
if losses is None:
losses = {}
validate_examples(examples, "Tok2Vec.update") validate_examples(examples, "Tok2Vec.update")
docs = [eg.predicted for eg in examples] docs = [eg.predicted for eg in examples]
set_dropout_rate(self.model, drop) return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses)
tokvecs, bp_tokvecs = self.model.begin_update(docs)
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
losses.setdefault(self.name, 0.0)
def accumulate_gradient(one_d_tokvecs):
"""Accumulate tok2vec loss and gradient. This is passed as a callback
to all but the last listener. Only the last one does the backprop.
"""
nonlocal d_tokvecs
for i in range(len(one_d_tokvecs)):
d_tokvecs[i] += one_d_tokvecs[i]
losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
def backprop(one_d_tokvecs):
"""Callback to actually do the backprop. Passed to last listener."""
accumulate_gradient(one_d_tokvecs)
d_docs = bp_tokvecs(d_tokvecs)
if sgd is not None:
self.finish_update(sgd)
return d_docs
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners[:-1]:
listener.receive(batch_id, tokvecs, accumulate_gradient)
if self.listeners:
self.listeners[-1].receive(batch_id, tokvecs, backprop)
return losses
def get_loss(self, examples, scores) -> None: def get_loss(self, examples, scores) -> None:
pass pass
@ -219,6 +190,96 @@ class Tok2Vec(TrainablePipe):
def add_label(self, label): def add_label(self, label):
raise NotImplementedError raise NotImplementedError
def distill(
self,
teacher_pipe: Optional["TrainablePipe"],
examples: Iterable["Example"],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Performs an update of the student pipe's model using the
student's distillation examples and sets the annotations
of the teacher's distillation examples using the teacher pipe.
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
for prediction.
examples (Iterable[Example]): Distillation examples. The reference (teacher)
and predicted (student) docs must have the same number of tokens and the
same orthography.
drop (float): dropout rate.
sgd (Optional[Optimizer]): An optimizer. Will be created via
create_optimizer if not set.
losses (Optional[Dict[str, float]]): Optional record of loss during
distillation.
RETURNS: The updated losses dictionary.
DOCS: https://spacy.io/api/tok2vec#distill
"""
# By default we require a teacher pipe, but there are downstream
# implementations that don't require a pipe.
if teacher_pipe is None:
raise ValueError(Errors.E4002.format(name=self.name))
teacher_docs = [eg.reference for eg in examples]
student_docs = [eg.predicted for eg in examples]
teacher_preds = teacher_pipe.predict(teacher_docs)
teacher_pipe.set_annotations(teacher_docs, teacher_preds)
return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses)
def _update_with_docs(
self,
docs: Iterable[Doc],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
):
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
set_dropout_rate(self.model, drop)
tokvecs, accumulate_gradient, backprop = self._create_backprops(
docs, losses, sgd=sgd
)
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners[:-1]:
listener.receive(batch_id, tokvecs, accumulate_gradient)
if self.listeners:
self.listeners[-1].receive(batch_id, tokvecs, backprop)
return losses
def _create_backprops(
self,
docs: Iterable[Doc],
losses: Dict[str, float],
*,
sgd: Optional[Optimizer] = None,
) -> Tuple[Floats2d, Callable, Callable]:
tokvecs, bp_tokvecs = self.model.begin_update(docs)
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
def accumulate_gradient(one_d_tokvecs):
"""Accumulate tok2vec loss and gradient. This is passed as a callback
to all but the last listener. Only the last one does the backprop.
"""
nonlocal d_tokvecs
for i in range(len(one_d_tokvecs)):
d_tokvecs[i] += one_d_tokvecs[i]
losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
def backprop(one_d_tokvecs):
"""Callback to actually do the backprop. Passed to last listener."""
accumulate_gradient(one_d_tokvecs)
d_docs = bp_tokvecs(d_tokvecs)
if sgd is not None:
self.finish_update(sgd)
return d_docs
return tokvecs, accumulate_gradient, backprop
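The refactor above splits the old Tok2Vec.update body into _update_with_docs and _create_backprops so that both update and the new distill entry point share the same listener/backprop wiring: distill only runs the teacher pipe's predict/set_annotations over the reference docs and then updates the student on the predicted docs. A minimal sketch of driving the new method directly (the blank pipelines and the single toy example below are illustrative, not part of this commit):

from spacy.lang.en import English
from spacy.training import Example

teacher = English()
teacher.add_pipe("tok2vec")
student = English()
student.add_pipe("tok2vec")

# Toy distillation examples; reference and predicted docs share the same tokens.
teacher_examples = [Example.from_dict(teacher.make_doc("I like London."), {})]
student_examples = [Example.from_dict(student.make_doc("I like London."), {})]
teacher.initialize(lambda: teacher_examples)
optimizer = student.initialize(lambda: student_examples)

losses = {}
# The teacher tok2vec annotates the reference docs; the student is updated on
# the predicted docs via _update_with_docs().
student.get_pipe("tok2vec").distill(
    teacher.get_pipe("tok2vec"), student_examples, sgd=optimizer, losses=losses
)
print(losses)  # {'tok2vec': 0.0} here, since no listeners feed gradients back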
class Tok2VecListener(Model): class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection, """A layer that gets fed its answers from an upstream connection,

View File

@ -36,6 +36,11 @@ from ..errors import Errors, Warnings
from .. import util from .. import util
# TODO: Remove when we switch to Cython 3.
cdef extern from "<algorithm>" namespace "std" nogil:
bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
NUMPY_OPS = NumpyOps() NUMPY_OPS = NumpyOps()
@ -253,8 +258,8 @@ class Parser(TrainablePipe):
# batch uniform length. Since we do not have a gold standard # batch uniform length. Since we do not have a gold standard
# sequence, we use the teacher's predictions as the gold # sequence, we use the teacher's predictions as the gold
# standard. # standard.
max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
states = self._init_batch(teacher_pipe, student_docs, max_moves) states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
else: else:
states = self.moves.init_batch(student_docs) states = self.moves.init_batch(student_docs)
@ -265,12 +270,12 @@ class Parser(TrainablePipe):
# gradients of the student's transition distributions relative to the # gradients of the student's transition distributions relative to the
# teacher's distributions. # teacher's distributions.
student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves, student_inputs = TransitionModelInputs(docs=student_docs,
max_moves=max_moves) states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = states2actions(student_states) actions = _states_diff_to_actions(states, student_states)
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples], teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
moves=self.moves, actions=actions) states=states, moves=teacher_pipe.moves, actions=actions)
(_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs) (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@ -522,7 +527,7 @@ class Parser(TrainablePipe):
set_dropout_rate(self.model, 0.0) set_dropout_rate(self.model, 0.0)
student_inputs = TransitionModelInputs(docs=docs, moves=self.moves) student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = states2actions(student_states) actions = _states_to_actions(student_states)
teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions) teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
_, teacher_scores = self._rehearsal_model.predict(teacher_inputs) _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
@ -642,7 +647,7 @@ class Parser(TrainablePipe):
raise ValueError(Errors.E149) from None raise ValueError(Errors.E149) from None
return self return self
def _init_batch(self, teacher_step_model, docs, max_length): def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
"""Make a square batch of length equal to the shortest transition """Make a square batch of length equal to the shortest transition
sequence or a cap. A long sequence or a cap. A long
doc will get multiple states. Let's say we have a doc of length 2*N, doc will get multiple states. Let's say we have a doc of length 2*N,
@ -651,10 +656,12 @@ class Parser(TrainablePipe):
_init_gold_batch, this version uses a teacher model to generate the _init_gold_batch, this version uses a teacher model to generate the
cut sequences.""" cut sequences."""
cdef: cdef:
StateClass start_state
StateClass state StateClass state
Transition action TransitionSystem moves = teacher_pipe.moves
all_states = self.moves.init_batch(docs)
# Start with the same heuristic as in supervised training: exclude
# docs that are within the maximum length.
all_states = moves.init_batch(docs)
states = [] states = []
to_cut = [] to_cut = []
for state, doc in zip(all_states, docs): for state, doc in zip(all_states, docs):
@ -663,18 +670,28 @@ class Parser(TrainablePipe):
states.append(state) states.append(state)
else: else:
to_cut.append(state) to_cut.append(state)
if not to_cut:
return states
# Parse the states that are too long with the teacher's parsing model.
teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
states=[state.copy() for state in to_cut])
(teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
# Step through the teacher's actions and store every state after
# each multiple of max_length.
teacher_actions = _states_to_actions(teacher_states)
while to_cut: while to_cut:
states.extend(state.copy() for state in to_cut) states.extend(state.copy() for state in to_cut)
# Move states forward max_length actions. for step_actions in teacher_actions[:max_length]:
length = 0 to_cut = moves.apply_actions(to_cut, step_actions)
while to_cut and length < max_length: teacher_actions = teacher_actions[max_length:]
teacher_scores = teacher_step_model.predict(to_cut)
self.transition_states(to_cut, teacher_scores)
# States that are completed do not need further cutting.
to_cut = [state for state in to_cut if not state.is_final()]
length += 1
return states
if len(teacher_actions) < max_length:
break
return states
def _init_gold_batch(self, examples, max_length): def _init_gold_batch(self, examples, max_length):
"""Make a square batch, of length equal to the shortest transition """Make a square batch, of length equal to the shortest transition
@ -736,7 +753,7 @@ def _change_attrs(model, **kwargs):
model.attrs[key] = value model.attrs[key] = value
def states2actions(states: List[StateClass]) -> List[Ints1d]: def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
cdef int step cdef int step
cdef StateClass state cdef StateClass state
cdef StateC* c_state cdef StateC* c_state
@ -757,3 +774,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]:
actions.append(numpy.array(step_actions, dtype="i")) actions.append(numpy.array(step_actions, dtype="i"))
return actions return actions
def _states_diff_to_actions(
before_states: List[StateClass],
after_states: List[StateClass]
) -> List[Ints1d]:
"""
Return for two sets of states the actions to go from the first set of
states to the second set of states. The histories of the first set of
states must be a prefix of the second set of states.
"""
cdef StateClass before_state, after_state
cdef StateC* c_state_before
cdef StateC* c_state_after
assert len(before_states) == len(after_states)
# Check invariant: before states histories must be prefixes of after states.
for before_state, after_state in zip(before_states, after_states):
c_state_before = before_state.c
c_state_after = after_state.c
assert equal(c_state_before.history.begin(), c_state_before.history.end(),
c_state_after.history.begin())
actions = []
while True:
step = len(actions)
step_actions = []
for before_state, after_state in zip(before_states, after_states):
c_state_before = before_state.c
c_state_after = after_state.c
if step < c_state_after.history.size() - c_state_before.history.size():
step_actions.append(c_state_after.history[c_state_before.history.size() + step])
# We are done if we have exhausted all histories.
if len(step_actions) == 0:
break
actions.append(numpy.array(step_actions, dtype="i"))
return actions
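Taken together, the parser changes above rework distillation so that teacher and student start from the same (optionally cut) states: _init_batch_from_teacher now parses over-long docs with the teacher's model and steps the cut states forward with moves.apply_actions, the max(max_moves // 2, 1) tweak keeps the randomized cut size from collapsing to zero, and _states_diff_to_actions extracts only the actions the student appended on top of the shared history prefix so the teacher can replay exactly the same transition sequence. A pure-Python sketch of that prefix-diff logic, with plain lists standing in for the Cython StateC.history vectors (illustrative only):

from typing import List
import numpy

def states_diff_to_actions(before: List[List[int]], after: List[List[int]]) -> List[numpy.ndarray]:
    assert len(before) == len(after)
    # Invariant from the Cython version: each "before" history must be a
    # prefix of the corresponding "after" history.
    for b, a in zip(before, after):
        assert a[: len(b)] == b
    actions = []
    while True:
        step = len(actions)
        # One action per state per step, skipping states whose extra history
        # is already exhausted.
        step_actions = [
            a[len(b) + step] for b, a in zip(before, after) if step < len(a) - len(b)
        ]
        if not step_actions:
            break
        actions.append(numpy.array(step_actions, dtype="i"))
    return actions

print(states_diff_to_actions([[1, 2], [1]], [[1, 2, 3, 4], [1, 3]]))
# -> [array([3, 3], dtype=int32), array([4], dtype=int32)]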

View File

@ -2,7 +2,7 @@ from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overlo
from pathlib import Path from pathlib import Path
class StringStore: class StringStore:
def __init__(self, strings: Optional[Iterable[str]]) -> None: ... def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
@overload @overload
def __getitem__(self, string_or_hash: str) -> int: ... def __getitem__(self, string_or_hash: str) -> int: ...
@overload @overload
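The stub fix above gives the strings argument a default, matching the runtime constructor, so an empty store can be created without arguments. A trivial sketch (not from this commit):

from spacy.strings import StringStore

ss = StringStore()          # valid per the updated stub
key = ss.add("coffee")      # returns the 64-bit hash of the string
assert ss[key] == "coffee"  # hash -> string lookup
assert "coffee" in ss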

View File

@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
@pytest.mark.issue(12311)
@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 1

View File

@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
("the", "brown", "$--", 0), ("the", "brown", "$--", 0),
("brown", "the", "$--", 1), ("brown", "the", "$--", 1),
("brown", "brown", "$--", 0), ("brown", "brown", "$--", 0),
("over", "jumped", "<+", 0),
("quick", "fox", "<+", 0),
("the", "quick", "<+", 0),
("brown", "fox", "<+", 1),
("quick", "fox", "<++", 1), ("quick", "fox", "<++", 1),
("quick", "over", "<++", 0), ("quick", "over", "<++", 0),
("over", "jumped", "<++", 0), ("over", "jumped", "<++", 0),
("the", "fox", "<++", 2), ("the", "fox", "<++", 2),
("brown", "fox", "<-", 0),
("fox", "over", "<-", 0),
("the", "over", "<-", 0),
("over", "jumped", "<-", 1),
("brown", "fox", "<--", 0), ("brown", "fox", "<--", 0),
("fox", "jumped", "<--", 0), ("fox", "jumped", "<--", 0),
("fox", "over", "<--", 1), ("fox", "over", "<--", 1),
("fox", "brown", ">+", 0),
("over", "fox", ">+", 0),
("over", "the", ">+", 0),
("jumped", "over", ">+", 1),
("jumped", "over", ">++", 1), ("jumped", "over", ">++", 1),
("fox", "lazy", ">++", 0), ("fox", "lazy", ">++", 0),
("over", "the", ">++", 0), ("over", "the", ">++", 0),
("jumped", "over", ">-", 0),
("fox", "quick", ">-", 0),
("brown", "quick", ">-", 0),
("fox", "brown", ">-", 1),
("brown", "fox", ">--", 0), ("brown", "fox", ">--", 0),
("fox", "brown", ">--", 1), ("fox", "brown", ">--", 1),
("jumped", "fox", ">--", 1), ("jumped", "fox", ">--", 1),

View File

@ -0,0 +1,61 @@
import numpy
import pytest
from spacy.lang.en import English
from spacy.ml.tb_framework import TransitionModelInputs
from spacy.training import Example
TRAIN_DATA = [
(
"They trade mortgage-backed securities.",
{
"heads": [1, 1, 4, 4, 5, 1, 1],
"deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
},
),
(
"I like London and Berlin.",
{
"heads": [1, 1, 1, 2, 2, 1],
"deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
},
),
]
@pytest.fixture
def nlp_parser():
nlp = English()
parser = nlp.add_pipe("parser")
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for dep in annotations["deps"]:
parser.add_label(dep)
nlp.initialize()
return nlp, parser
def test_incorrect_number_of_actions(nlp_parser):
nlp, parser = nlp_parser
doc = nlp.make_doc("test")
# Too many actions for the number of docs
with pytest.raises(AssertionError):
parser.model.predict(
TransitionModelInputs(
docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
)
)
# Too few actions for the number of docs
with pytest.raises(AssertionError):
parser.model.predict(
TransitionModelInputs(
docs=[doc, doc],
moves=parser.moves,
actions=[numpy.array([0], dtype="i")],
)
)

View File

@ -623,7 +623,9 @@ def test_is_distillable():
assert ner.is_distillable assert ner.is_distillable
def test_distill(): @pytest.mark.slow
@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
def test_distill(max_moves):
teacher = English() teacher = English()
teacher_ner = teacher.add_pipe("ner") teacher_ner = teacher.add_pipe("ner")
train_examples = [] train_examples = []
@ -641,6 +643,7 @@ def test_distill():
student = English() student = English()
student_ner = student.add_pipe("ner") student_ner = student.add_pipe("ner")
student_ner.cfg["update_with_oracle_cut_size"] = max_moves
student_ner.initialize( student_ner.initialize(
get_examples=lambda: train_examples, labels=teacher_ner.label_data get_examples=lambda: train_examples, labels=teacher_ner.label_data
) )

View File

@ -463,7 +463,9 @@ def test_is_distillable():
assert parser.is_distillable assert parser.is_distillable
def test_distill(): @pytest.mark.slow
@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
def test_distill(max_moves):
teacher = English() teacher = English()
teacher_parser = teacher.add_pipe("parser") teacher_parser = teacher.add_pipe("parser")
train_examples = [] train_examples = []
@ -481,6 +483,7 @@ def test_distill():
student = English() student = English()
student_parser = student.add_pipe("parser") student_parser = student.add_pipe("parser")
student_parser.cfg["update_with_oracle_cut_size"] = max_moves
student_parser.initialize( student_parser.initialize(
get_examples=lambda: train_examples, labels=teacher_parser.label_data get_examples=lambda: train_examples, labels=teacher_parser.label_data
) )

View File

@ -7,10 +7,10 @@ from thinc.types import Ragged
from spacy import registry, util from spacy import registry, util
from spacy.attrs import ENT_KB_ID from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle from spacy.compat import pickle
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
from spacy.lang.en import English from spacy.lang.en import English
from spacy.ml import load_kb from spacy.ml import load_kb
from spacy.ml.models.entity_linker import build_span_maker from spacy.ml.models.entity_linker import build_span_maker, get_candidates
from spacy.pipeline import EntityLinker, TrainablePipe from spacy.pipeline import EntityLinker, TrainablePipe
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer from spacy.scorer import Scorer
@ -353,6 +353,9 @@ def test_kb_default(nlp):
"""Test that the default (empty) KB is loaded upon construction""" """Test that the default (empty) KB is loaded upon construction"""
entity_linker = nlp.add_pipe("entity_linker", config={}) entity_linker = nlp.add_pipe("entity_linker", config={})
assert len(entity_linker.kb) == 0 assert len(entity_linker.kb) == 0
with pytest.raises(ValueError, match="E139"):
# this raises an error because the KB is empty
entity_linker.validate_kb()
assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0 assert entity_linker.kb.get_size_aliases() == 0
# 64 is the default value from pipeline.entity_linker # 64 is the default value from pipeline.entity_linker
@ -462,16 +465,17 @@ def test_candidate_generation(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
# test the size of the relevant candidates # test the size of the relevant candidates
adam_ent_cands = get_candidates(mykb, adam_ent)
assert len(get_candidates(mykb, douglas_ent)) == 2 assert len(get_candidates(mykb, douglas_ent)) == 2
assert len(get_candidates(mykb, adam_ent)) == 1 assert len(adam_ent_cands) == 1
assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive
assert len(get_candidates(mykb, shrubbery_ent)) == 0 assert len(get_candidates(mykb, shrubbery_ent)) == 0
# test the content of the candidates # test the content of the candidates
assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" assert adam_ent_cands[0].entity_id_ == "Q2"
assert get_candidates(mykb, adam_ent)[0].alias_ == "adam" assert adam_ent_cands[0].alias == "adam"
assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)
def test_el_pipe_configuration(nlp): def test_el_pipe_configuration(nlp):
@ -499,7 +503,7 @@ def test_el_pipe_configuration(nlp):
assert doc[2].ent_kb_id_ == "Q2" assert doc[2].ent_kb_id_ == "Q2"
def get_lowercased_candidates(kb, span): def get_lowercased_candidates(kb, span):
return kb.get_alias_candidates(span.text.lower()) return kb._get_alias_candidates(span.text.lower())
def get_lowercased_candidates_batch(kb, spans): def get_lowercased_candidates_batch(kb, spans):
return [get_lowercased_candidates(kb, span) for span in spans] return [get_lowercased_candidates(kb, span) for span in spans]
@ -558,24 +562,22 @@ def test_vocab_serialization(nlp):
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
candidates = mykb.get_alias_candidates("adam") candidates = mykb._get_alias_candidates("adam")
assert len(candidates) == 1 assert len(candidates) == 1
assert candidates[0].entity == q2_hash assert candidates[0].entity_id == q2_hash
assert candidates[0].entity_ == "Q2" assert candidates[0].entity_id_ == "Q2"
assert candidates[0].alias == adam_hash assert candidates[0].alias == "adam"
assert candidates[0].alias_ == "adam"
with make_tempdir() as d: with make_tempdir() as d:
mykb.to_disk(d / "kb") mykb.to_disk(d / "kb")
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1) kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
kb_new_vocab.from_disk(d / "kb") kb_new_vocab.from_disk(d / "kb")
candidates = kb_new_vocab.get_alias_candidates("adam") candidates = kb_new_vocab._get_alias_candidates("adam")
assert len(candidates) == 1 assert len(candidates) == 1
assert candidates[0].entity == q2_hash assert candidates[0].entity_id == q2_hash
assert candidates[0].entity_ == "Q2" assert candidates[0].entity_id_ == "Q2"
assert candidates[0].alias == adam_hash assert candidates[0].alias == "adam"
assert candidates[0].alias_ == "adam"
assert kb_new_vocab.get_vector("Q2") == [2] assert kb_new_vocab.get_vector("Q2") == [2]
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@ -595,20 +597,20 @@ def test_append_alias(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
# test the size of the relevant candidates # test the size of the relevant candidates
assert len(mykb.get_alias_candidates("douglas")) == 2 assert len(mykb._get_alias_candidates("douglas")) == 2
# append an alias # append an alias
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
# test the size of the relevant candidates has been incremented # test the size of the relevant candidates has been incremented
assert len(mykb.get_alias_candidates("douglas")) == 3 assert len(mykb._get_alias_candidates("douglas")) == 3
# append the same alias-entity pair again should not work (will throw a warning) # append the same alias-entity pair again should not work (will throw a warning)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3) mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
# test the size of the relevant candidates remained unchanged # test the size of the relevant candidates remained unchanged
assert len(mykb.get_alias_candidates("douglas")) == 3 assert len(mykb._get_alias_candidates("douglas")) == 3
@pytest.mark.filterwarnings("ignore:\\[W036") @pytest.mark.filterwarnings("ignore:\\[W036")
@ -905,11 +907,11 @@ def test_kb_to_bytes():
assert kb_2.contains_alias("Russ Cochran") assert kb_2.contains_alias("Russ Cochran")
assert kb_1.get_size_aliases() == kb_2.get_size_aliases() assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
assert kb_1.get_alias_strings() == kb_2.get_alias_strings() assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
kb_2.get_alias_candidates("Russ Cochran") kb_2._get_alias_candidates("Russ Cochran")
) )
assert len(kb_1.get_alias_candidates("Randomness")) == len( assert len(kb_1._get_alias_candidates("Randomness")) == len(
kb_2.get_alias_candidates("Randomness") kb_2._get_alias_candidates("Randomness")
) )
@ -990,14 +992,11 @@ def test_scorer_links():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,config", "name,config",
[ [
("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
], ],
) )
# fmt: on # fmt: on
def test_legacy_architectures(name, config): def test_legacy_architectures(name, config):
from spacy_legacy.components.entity_linker import EntityLinker_v1
# Ensure that the legacy architectures still work # Ensure that the legacy architectures still work
vector_length = 3 vector_length = 3
nlp = English() nlp = English()
@ -1019,10 +1018,7 @@ def test_legacy_architectures(name, config):
return mykb return mykb
entity_linker = nlp.add_pipe(name, config={"model": config}) entity_linker = nlp.add_pipe(name, config={"model": config})
if config["@architectures"] == "spacy.EntityLinker.v1": assert isinstance(entity_linker, EntityLinker)
assert isinstance(entity_linker, EntityLinker_v1)
else:
assert isinstance(entity_linker, EntityLinker)
entity_linker.set_kb(create_kb) entity_linker.set_kb(create_kb)
optimizer = nlp.initialize(get_examples=lambda: train_examples) optimizer = nlp.initialize(get_examples=lambda: train_examples)
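The edits in this test file follow an API change in the KB/candidate interface: get_candidates is now imported from spacy.ml.models.entity_linker, InMemoryLookupKB.get_alias_candidates has become the private _get_alias_candidates, and candidates expose entity_id/entity_id_ plus a plain-string alias instead of the old entity/entity_ and alias/alias_ pairs. A short sketch of querying candidates under the new names (the entity, frequency and probability are made up for illustration):

from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

kb = InMemoryLookupKB(Vocab(), entity_vector_length=3)
kb.add_entity(entity="Q2", freq=12, entity_vector=[1, 2, 3])
kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

candidates = kb._get_alias_candidates("adam")   # renamed, now private
cand = candidates[0]
print(cand.entity_id_, cand.alias, cand.prior_prob)  # Q2 adam 0.9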

View File

@ -9,6 +9,7 @@ from spacy.lang.en import English
from spacy.lang.en.syntax_iterators import noun_chunks from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.language import Language from spacy.language import Language
from spacy.pipeline import TrainablePipe from spacy.pipeline import TrainablePipe
from spacy.strings import StringStore
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.training import Example from spacy.training import Example
from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir
@ -131,7 +132,7 @@ def test_issue5458():
# Test that the noun chunker does not generate overlapping spans # Test that the noun chunker does not generate overlapping spans

# fmt: off # fmt: off
words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
vocab = Vocab(strings=words) vocab = Vocab(strings=StringStore(words))
deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]

View File

@ -540,3 +540,86 @@ def test_tok2vec_listeners_textcat():
assert cats1["imperative"] < 0.9 assert cats1["imperative"] < 0.9
assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] assert [t.tag_ for t in docs[0]] == ["V", "J", "N"]
assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
cfg_string_distillation = """
[nlp]
lang = "en"
pipeline = ["tok2vec","tagger"]
[components]
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
rows = [2000, 1000, 1000, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
"""
def test_tok2vec_distillation_teacher_annotations():
orig_config = Config().from_str(cfg_string_distillation)
teacher_nlp = util.load_model_from_config(
orig_config, auto_fill=True, validate=True
)
student_nlp = util.load_model_from_config(
orig_config, auto_fill=True, validate=True
)
train_examples_teacher = []
train_examples_student = []
for t in TRAIN_DATA:
train_examples_teacher.append(
Example.from_dict(teacher_nlp.make_doc(t[0]), t[1])
)
train_examples_student.append(
Example.from_dict(student_nlp.make_doc(t[0]), t[1])
)
optimizer = teacher_nlp.initialize(lambda: train_examples_teacher)
student_nlp.initialize(lambda: train_examples_student)
# Since Language.distill creates a copy of the examples to use as
# its internal teacher/student docs, we'll need to monkey-patch the
# tok2vec pipe's distill method.
student_tok2vec = student_nlp.get_pipe("tok2vec")
student_tok2vec._old_distill = student_tok2vec.distill
def tok2vec_distill_wrapper(
self,
teacher_pipe,
examples,
**kwargs,
):
assert all(not eg.reference.tensor.any() for eg in examples)
out = self._old_distill(teacher_pipe, examples, **kwargs)
assert all(eg.reference.tensor.any() for eg in examples)
return out
student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})

View File

@ -1,7 +1,10 @@
from typing import Callable from pathlib import Path
from typing import Callable, Iterable, Any, Dict
from spacy import util import srsly
from spacy.util import ensure_path, registry, load_model_from_config
from spacy import util, Errors
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
from spacy.kb.kb_in_memory import InMemoryLookupKB from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.vocab import Vocab from spacy.vocab import Vocab
from thinc.api import Config from thinc.api import Config
@ -63,19 +66,21 @@ def _check_kb(kb):
assert alias_string not in kb.get_alias_strings() assert alias_string not in kb.get_alias_strings()
# check candidates & probabilities # check candidates & probabilities
candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_) candidates = sorted(
kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
)
assert len(candidates) == 2 assert len(candidates) == 2
assert candidates[0].entity_ == "Q007" assert candidates[0].entity_id_ == "Q007"
assert 6.999 < candidates[0].entity_freq < 7.01 assert 6.999 < candidates[0].entity_freq < 7.01
assert candidates[0].entity_vector == [0, 0, 7] assert candidates[0].entity_vector == [0, 0, 7]
assert candidates[0].alias_ == "double07" assert candidates[0].alias == "double07"
assert 0.899 < candidates[0].prior_prob < 0.901 assert 0.899 < candidates[0].prior_prob < 0.901
assert candidates[1].entity_ == "Q17" assert candidates[1].entity_id_ == "Q17"
assert 1.99 < candidates[1].entity_freq < 2.01 assert 1.99 < candidates[1].entity_freq < 2.01
assert candidates[1].entity_vector == [7, 1, 0] assert candidates[1].entity_vector == [7, 1, 0]
assert candidates[1].alias_ == "double07" assert candidates[1].alias == "double07"
assert 0.099 < candidates[1].prior_prob < 0.101 assert 0.099 < candidates[1].prior_prob < 0.101
@ -91,7 +96,10 @@ def test_serialize_subclassed_kb():
[components.entity_linker] [components.entity_linker]
factory = "entity_linker" factory = "entity_linker"
[components.entity_linker.generate_empty_kb]
@misc = "kb_test.CustomEmptyKB.v1"
[initialize] [initialize]
[initialize.components] [initialize.components]
@ -99,7 +107,7 @@ def test_serialize_subclassed_kb():
[initialize.components.entity_linker] [initialize.components.entity_linker]
[initialize.components.entity_linker.kb_loader] [initialize.components.entity_linker.kb_loader]
@misc = "spacy.CustomKB.v1" @misc = "kb_test.CustomKB.v1"
entity_vector_length = 342 entity_vector_length = 342
custom_field = 666 custom_field = 666
""" """
@ -109,10 +117,57 @@ def test_serialize_subclassed_kb():
super().__init__(vocab, entity_vector_length) super().__init__(vocab, entity_vector_length)
self.custom_field = custom_field self.custom_field = custom_field
@registry.misc("spacy.CustomKB.v1") def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
"""We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
path = ensure_path(path)
if not path.exists():
path.mkdir(parents=True)
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
def serialize_custom_fields(file_path: Path) -> None:
srsly.write_json(file_path, {"custom_field": self.custom_field})
serialize = {
"contents": lambda p: self.write_contents(p),
"strings.json": lambda p: self.vocab.strings.to_disk(p),
"custom_fields": lambda p: serialize_custom_fields(p),
}
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
"""We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
path = ensure_path(path)
if not path.exists():
raise ValueError(Errors.E929.format(loc=path))
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
def deserialize_custom_fields(file_path: Path) -> None:
self.custom_field = srsly.read_json(file_path)["custom_field"]
deserialize: Dict[str, Callable[[Any], Any]] = {
"contents": lambda p: self.read_contents(p),
"strings.json": lambda p: self.vocab.strings.from_disk(p),
"custom_fields": lambda p: deserialize_custom_fields(p),
}
util.from_disk(path, deserialize, exclude)
@registry.misc("kb_test.CustomEmptyKB.v1")
def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
return SubInMemoryLookupKB(
vocab=vocab,
entity_vector_length=entity_vector_length,
custom_field=0,
)
return empty_kb_factory
@registry.misc("kb_test.CustomKB.v1")
def custom_kb( def custom_kb(
entity_vector_length: int, custom_field: int entity_vector_length: int, custom_field: int
) -> Callable[[Vocab], InMemoryLookupKB]: ) -> Callable[[Vocab], SubInMemoryLookupKB]:
def custom_kb_factory(vocab): def custom_kb_factory(vocab):
kb = SubInMemoryLookupKB( kb = SubInMemoryLookupKB(
vocab=vocab, vocab=vocab,
@ -139,6 +194,6 @@ def test_serialize_subclassed_kb():
nlp2 = util.load_model_from_path(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir)
entity_linker2 = nlp2.get_pipe("entity_linker") entity_linker2 = nlp2.get_pipe("entity_linker")
# After IO, the KB is the standard one # After IO, the KB is the standard one
assert type(entity_linker2.kb) == InMemoryLookupKB assert type(entity_linker2.kb) == SubInMemoryLookupKB
assert entity_linker2.kb.entity_vector_length == 342 assert entity_linker2.kb.entity_vector_length == 342
assert not hasattr(entity_linker2.kb, "custom_field") assert entity_linker2.kb.custom_field == 666

View File

@ -13,8 +13,11 @@ from spacy.vocab import Vocab
from ..util import make_tempdir from ..util import make_tempdir
test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] test_strings = [
test_strings_attrs = [(["rats", "are", "cute"], "Hello")] (StringStore(), StringStore()),
(StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])),
]
test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")]
@pytest.mark.issue(599) @pytest.mark.issue(599)
@ -81,7 +84,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
vocab2 = Vocab(strings=strings2) vocab2 = Vocab(strings=strings2)
vocab1_b = vocab1.to_bytes() vocab1_b = vocab1.to_bytes()
vocab2_b = vocab2.to_bytes() vocab2_b = vocab2.to_bytes()
if strings1 == strings2: if strings1.to_bytes() == strings2.to_bytes():
assert vocab1_b == vocab2_b assert vocab1_b == vocab2_b
else: else:
assert vocab1_b != vocab2_b assert vocab1_b != vocab2_b
@ -117,11 +120,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
vocab1 = Vocab(strings=strings) vocab1 = Vocab(strings=strings)
vocab2 = Vocab() vocab2 = Vocab()
vocab1[strings[0]].norm_ = lex_attr s = next(iter(vocab1.strings))
assert vocab1[strings[0]].norm_ == lex_attr vocab1[s].norm_ = lex_attr
assert vocab2[strings[0]].norm_ != lex_attr assert vocab1[s].norm_ == lex_attr
assert vocab2[s].norm_ != lex_attr
vocab2 = vocab2.from_bytes(vocab1.to_bytes()) vocab2 = vocab2.from_bytes(vocab1.to_bytes())
assert vocab2[strings[0]].norm_ == lex_attr assert vocab2[s].norm_ == lex_attr
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@ -136,14 +140,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr): def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
vocab1 = Vocab(strings=strings) vocab1 = Vocab(strings=strings)
vocab2 = Vocab() vocab2 = Vocab()
vocab1[strings[0]].norm_ = lex_attr s = next(iter(vocab1.strings))
assert vocab1[strings[0]].norm_ == lex_attr vocab1[s].norm_ = lex_attr
assert vocab2[strings[0]].norm_ != lex_attr assert vocab1[s].norm_ == lex_attr
assert vocab2[s].norm_ != lex_attr
with make_tempdir() as d: with make_tempdir() as d:
file_path = d / "vocab" file_path = d / "vocab"
vocab1.to_disk(file_path) vocab1.to_disk(file_path)
vocab2 = vocab2.from_disk(file_path) vocab2 = vocab2.from_disk(file_path)
assert vocab2[strings[0]].norm_ == lex_attr assert vocab2[s].norm_ == lex_attr
@pytest.mark.parametrize("strings1,strings2", test_strings) @pytest.mark.parametrize("strings1,strings2", test_strings)
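These test updates reflect that Vocab(strings=...) now expects a StringStore rather than a plain list of strings. A minimal sketch (not from this commit):

from spacy.strings import StringStore
from spacy.vocab import Vocab

# The strings argument now takes a StringStore instead of a list of strings.
vocab = Vocab(strings=StringStore(["rats", "are", "cute"]))
assert "rats" in vocab.strings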

View File

@ -2,7 +2,6 @@ import os
import math import math
from collections import Counter from collections import Counter
from typing import Tuple, List, Dict, Any from typing import Tuple, List, Dict, Any
import pkg_resources
import time import time
from pathlib import Path from pathlib import Path
@ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys):
) )
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"reqs,output", "reqs,output",
[ [
@ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys):
], ],
) )
def test_project_check_requirements(reqs, output): def test_project_check_requirements(reqs, output):
import pkg_resources
# excessive guard against unlikely package name # excessive guard against unlikely package name
try: try:
pkg_resources.require("spacyunknowndoesnotexist12345") pkg_resources.require("spacyunknowndoesnotexist12345")

View File

@ -1,5 +1,7 @@
import os import os
from pathlib import Path from pathlib import Path
import pytest
import srsly
from typer.testing import CliRunner from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc from spacy.tokens import DocBin, Doc
@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
# Instead of checking specific wording of the output, which may change, # Instead of checking specific wording of the output, which may change,
# we'll check that this section of the debug output is present. # we'll check that this section of the debug output is present.
assert "= Trainable Lemmatizer =" in result_debug_data.stdout assert "= Trainable Lemmatizer =" in result_debug_data.stdout
# project tests
SAMPLE_PROJECT = {
"title": "Sample project",
"description": "This is a project for testing",
"assets": [
{
"dest": "assets/spacy-readme.md",
"url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
"checksum": "411b2c89ccf34288fae8ed126bf652f7",
},
{
"dest": "assets/citation.cff",
"url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
"checksum": "c996bfd80202d480eb2e592369714e5e",
"extra": True,
},
],
"commands": [
{
"name": "ok",
"help": "print ok",
"script": ["python -c \"print('okokok')\""],
},
{
"name": "create",
"help": "make a file",
"script": ["touch abc.txt"],
"outputs": ["abc.txt"],
},
{
"name": "clean",
"help": "remove test file",
"script": ["rm abc.txt"],
},
],
}
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
@pytest.fixture
def project_dir():
with make_tempdir() as pdir:
(pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
yield pdir
def test_project_document(project_dir):
readme_path = project_dir / "README.md"
assert not readme_path.exists(), "README already exists"
result = CliRunner().invoke(
app, ["project", "document", str(project_dir), "-o", str(readme_path)]
)
assert result.exit_code == 0
assert readme_path.is_file()
text = readme_path.read_text("utf-8")
assert SAMPLE_PROJECT["description"] in text
def test_project_assets(project_dir):
asset_dir = project_dir / "assets"
assert not asset_dir.exists(), "Assets dir is already present"
result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
assert result.exit_code == 0
assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
# check that extras work
result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
assert result.exit_code == 0
assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
def test_project_run(project_dir):
# make sure dry run works
test_file = project_dir / "abc.txt"
result = CliRunner().invoke(
app, ["project", "run", "--dry", "create", str(project_dir)]
)
assert result.exit_code == 0
assert not test_file.is_file()
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
assert result.exit_code == 0
assert test_file.is_file()
result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
assert result.exit_code == 0
assert "okokok" in result.stdout
@pytest.mark.parametrize(
"options",
[
"",
# "--sparse",
"--branch v3",
"--repo https://github.com/explosion/projects --branch v3",
],
)
def test_project_clone(options):
with make_tempdir() as workspace:
out = workspace / "project"
target = "benchmarks/ner_conll03"
if not options:
options = []
else:
options = options.split()
result = CliRunner().invoke(
app, ["project", "clone", target, *options, str(out)]
)
assert result.exit_code == 0
assert (out / "README.md").is_file()
def test_project_push_pull(project_dir):
proj = dict(SAMPLE_PROJECT)
remote = "xyz"
with make_tempdir() as remote_dir:
proj["remotes"] = {remote: str(remote_dir)}
proj_text = srsly.yaml_dumps(proj)
(project_dir / "project.yml").write_text(proj_text)
test_file = project_dir / "abc.txt"
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
assert result.exit_code == 0
assert test_file.is_file()
result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
assert result.exit_code == 0
result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
assert result.exit_code == 0
assert not test_file.exists()
result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
assert result.exit_code == 0
assert test_file.is_file()

View File

@ -98,7 +98,7 @@ def assert_sents_error(doc):
def warn_error(proc_name, proc, docs, e): def warn_error(proc_name, proc, docs, e):
logger = logging.getLogger("spacy") logger = logging.getLogger("spacy")
logger.warning(f"Trouble with component {proc_name}.") logger.warning("Trouble with component %s.", proc_name)
@pytest.fixture @pytest.fixture

View File

@ -17,7 +17,7 @@ def test_issue361(en_vocab, text1, text2):
@pytest.mark.issue(600) @pytest.mark.issue(600)
def test_issue600(): def test_issue600():
vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) vocab = Vocab()
doc = Doc(vocab, words=["hello"]) doc = Doc(vocab, words=["hello"])
doc[0].tag_ = "NN" doc[0].tag_ = "NN"

View File

@ -105,6 +105,7 @@ class Doc:
start_idx: int, start_idx: int,
end_idx: int, end_idx: int,
label: Union[int, str] = ..., label: Union[int, str] = ...,
*,
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
alignment_mode: str = ..., alignment_mode: str = ...,
@ -127,12 +128,12 @@ class Doc:
blocked: Optional[List[Span]] = ..., blocked: Optional[List[Span]] = ...,
missing: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ...,
outside: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ...,
default: str = ... default: str = ...,
) -> None: ... ) -> None: ...
@property @property
def noun_chunks(self) -> Iterator[Span]: ... def noun_chunks(self) -> Tuple[Span]: ...
@property @property
def sents(self) -> Iterator[Span]: ... def sents(self) -> Tuple[Span]: ...
@property @property
def lang(self) -> int: ... def lang(self) -> int: ...
@property @property

View File

@ -520,7 +520,7 @@ cdef class Doc:
def doc(self): def doc(self):
return self return self
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
"""Create a `Span` object from the slice """Create a `Span` object from the slice
`doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
created. created.
@ -657,9 +657,6 @@ cdef class Doc:
elif self.vocab.vectors.size > 0: elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self) self._vector = sum(t.vector for t in self) / len(self)
return self._vector return self._vector
elif self.tensor.size > 0:
self._vector = self.tensor.mean(axis=0)
return self._vector
else: else:
return xp.zeros((self.vocab.vectors_length,), dtype="float32") return xp.zeros((self.vocab.vectors_length,), dtype="float32")
@ -706,10 +703,10 @@ cdef class Doc:
return self.text return self.text
property ents: property ents:
"""The named entities in the document. Returns a tuple of named entity """The named entities in the document. Returns a list of named entity
`Span` objects, if the entity recognizer has been applied. `Span` objects, if the entity recognizer has been applied.
RETURNS (tuple): Entities in the document, one `Span` per entity. RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
DOCS: https://spacy.io/api/doc#ents DOCS: https://spacy.io/api/doc#ents
""" """
@ -867,7 +864,7 @@ cdef class Doc:
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
YIELDS (Span): Noun chunks in the document. RETURNS (Tuple[Span]): Noun chunks in the document.
DOCS: https://spacy.io/api/doc#noun_chunks DOCS: https://spacy.io/api/doc#noun_chunks
""" """
@ -876,36 +873,35 @@ cdef class Doc:
# Accumulate the result before beginning to iterate over it. This # Accumulate the result before beginning to iterate over it. This
# prevents the tokenization from being changed out from under us # prevents the tokenization from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts # during the iteration.
# its tokenization changing, so it's okay once we have the Span
# objects. See Issue #375.
spans = [] spans = []
for start, end, label in self.noun_chunks_iterator(self): for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label)) spans.append(Span(self, start, end, label=label))
for span in spans: return tuple(spans)
yield span
@property @property
def sents(self): def sents(self):
"""Iterate over the sentences in the document. Yields sentence `Span` """Iterate over the sentences in the document. Yields sentence `Span`
objects. Sentence spans have no label. objects. Sentence spans have no label.
YIELDS (Span): Sentences in the document. RETURNS (Tuple[Span]): Sentences in the document.
DOCS: https://spacy.io/api/doc#sents DOCS: https://spacy.io/api/doc#sents
""" """
if not self.has_annotation("SENT_START"): if not self.has_annotation("SENT_START"):
raise ValueError(Errors.E030) raise ValueError(Errors.E030)
if "sents" in self.user_hooks: if "sents" in self.user_hooks:
yield from self.user_hooks["sents"](self) return tuple(self.user_hooks["sents"](self))
else: else:
start = 0 start = 0
spans = []
for i in range(1, self.length): for i in range(1, self.length):
if self.c[i].sent_start == 1: if self.c[i].sent_start == 1:
yield Span(self, start, i) spans.append(Span(self, start, i))
start = i start = i
if start != self.length: if start != self.length:
yield Span(self, start, self.length) spans.append(Span(self, start, self.length))
return tuple(spans)
@property @property
def lang(self): def lang(self):
@ -1605,7 +1601,7 @@ cdef class Doc:
for span_group in doc_json.get("spans", {}): for span_group in doc_json.get("spans", {}):
spans = [] spans = []
for span in doc_json["spans"][span_group]: for span in doc_json["spans"][span_group]:
char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"]) char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"])
if char_span is None: if char_span is None:
raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"])) raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"]))
spans.append(char_span) spans.append(char_span)
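Two behavioural changes run through this hunk: Doc.char_span now takes kb_id, vector, alignment_mode and span_id as keyword-only arguments (hence the kb_id=span["kb_id"] fix in the JSON loading code above), and Doc.sents / Doc.noun_chunks are materialized into tuples instead of being yielded lazily; the doc.tensor fallback for Doc.vector is also gone. A short sketch of the calling-convention change, assuming a blank English pipeline with a sentencizer (illustrative, not from this commit):

from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")
doc = nlp("Alphabet owns Google. It is based in Mountain View.")

# kb_id (and vector/alignment_mode/span_id) must now be passed by keyword.
span = doc.char_span(14, 20, "ORG", kb_id="Q95")
assert span.text == "Google"
# doc.char_span(14, 20, "ORG", "Q95")  # would now raise a TypeError

# sents is a tuple rather than a generator, so it can be indexed and re-used.
sents = doc.sents
assert isinstance(sents, tuple)
assert sents[0].text == "Alphabet owns Google."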

View File

@ -74,6 +74,8 @@ class Span:
@property @property
def ents(self) -> Tuple[Span]: ... def ents(self) -> Tuple[Span]: ...
@property @property
def sents(self) -> Tuple[Span]: ...
@property
def has_vector(self) -> bool: ... def has_vector(self) -> bool: ...
@property @property
def vector(self) -> Floats1d: ... def vector(self) -> Floats1d: ...
@ -86,7 +88,7 @@ class Span:
@property @property
def text_with_ws(self) -> str: ... def text_with_ws(self) -> str: ...
@property @property
def noun_chunks(self) -> Iterator[Span]: ... def noun_chunks(self) -> Tuple[Span]: ...
@property @property
def root(self) -> Token: ... def root(self) -> Token: ...
def char_span( def char_span(
@ -94,6 +96,7 @@ class Span:
start_idx: int, start_idx: int,
end_idx: int, end_idx: int,
label: Union[int, str] = ..., label: Union[int, str] = ...,
*,
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
alignment_mode: str = ..., alignment_mode: str = ...,

View File

@ -461,20 +461,21 @@ cdef class Span:
"""Obtain the sentences that contain this span. If the given span """Obtain the sentences that contain this span. If the given span
crosses sentence boundaries, return all sentences it is a part of. crosses sentence boundaries, return all sentences it is a part of.
RETURNS (Iterable[Span]): All sentences that the span is a part of. RETURNS (Tuple[Span]): All sentences that the span is a part of.
DOCS: https://spacy.io/api/span#sents DOCS: https://spacy.io/api/span#sents
""" """
cdef int start cdef int start
cdef int i cdef int i
if "sents" in self.doc.user_span_hooks: if "sents" in self.doc.user_span_hooks:
yield from self.doc.user_span_hooks["sents"](self) return tuple(self.doc.user_span_hooks["sents"](self))
elif "sents" in self.doc.user_hooks: spans = []
if "sents" in self.doc.user_hooks:
for sentence in self.doc.user_hooks["sents"](self.doc): for sentence in self.doc.user_hooks["sents"](self.doc):
if sentence.end > self.start: if sentence.end > self.start:
if sentence.start < self.end or sentence.start == self.start == self.end: if sentence.start < self.end or sentence.start == self.start == self.end:
yield sentence spans.append(sentence)
else: else:
break break
else: else:
@ -489,12 +490,13 @@ cdef class Span:
# Now, find all the sentences in the span # Now, find all the sentences in the span
for i in range(start + 1, self.doc.length): for i in range(start + 1, self.doc.length):
if self.doc.c[i].sent_start == 1: if self.doc.c[i].sent_start == 1:
yield Span(self.doc, start, i) spans.append(Span(self.doc, start, i))
start = i start = i
if start >= self.end: if start >= self.end:
break break
if start < self.end: if start < self.end:
yield Span(self.doc, start, self.end) spans.append(Span(self.doc, start, self.end))
return tuple(spans)
@property @property
@ -502,7 +504,7 @@ cdef class Span:
"""The named entities that fall completely within the span. Returns """The named entities that fall completely within the span. Returns
a tuple of `Span` objects. a tuple of `Span` objects.
RETURNS (tuple): Entities in the span, one `Span` per entity. RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
DOCS: https://spacy.io/api/span#ents DOCS: https://spacy.io/api/span#ents
""" """
@ -517,7 +519,7 @@ cdef class Span:
ents.append(ent) ents.append(ent)
else: else:
break break
return ents return tuple(ents)
@property @property
def has_vector(self): def has_vector(self):
@ -532,8 +534,6 @@ cdef class Span:
return self.doc.user_span_hooks["has_vector"](self) return self.doc.user_span_hooks["has_vector"](self)
elif self.vocab.vectors.size > 0: elif self.vocab.vectors.size > 0:
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True
else: else:
return False return False
@ -615,13 +615,15 @@ cdef class Span:
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
YIELDS (Span): Noun chunks in the span. RETURNS (Tuple[Span]): Noun chunks in the span.
DOCS: https://spacy.io/api/span#noun_chunks DOCS: https://spacy.io/api/span#noun_chunks
""" """
spans = []
for span in self.doc.noun_chunks: for span in self.doc.noun_chunks:
if span.start >= self.start and span.end <= self.end: if span.start >= self.start and span.end <= self.end:
yield span spans.append(span)
return tuple(spans)
@property @property
def root(self): def root(self):
@ -666,11 +668,11 @@ cdef class Span:
else: else:
return self.doc[root] return self.doc[root]
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
"""Create a `Span` object from the slice `span.text[start : end]`. """Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span. start_idx (int): The index of the first character of the span.
end (int): The index of the first character after the span. end_idx (int): The index of the first character after the span.
label (Union[int, str]): A label to attach to the Span, e.g. for label (Union[int, str]): A label to attach to the Span, e.g. for
named entities. named entities.
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.

View File

@ -389,8 +389,6 @@ cdef class Token:
""" """
if "has_vector" in self.doc.user_token_hooks: if "has_vector" in self.doc.user_token_hooks:
return self.doc.user_token_hooks["has_vector"](self) return self.doc.user_token_hooks["has_vector"](self)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return True
return self.vocab.has_vector(self.c.lex.orth) return self.vocab.has_vector(self.c.lex.orth)
@property @property
@ -404,8 +402,6 @@ cdef class Token:
""" """
if "vector" in self.doc.user_token_hooks: if "vector" in self.doc.user_token_hooks:
return self.doc.user_token_hooks["vector"](self) return self.doc.user_token_hooks["vector"](self)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return self.doc.tensor[self.i]
else: else:
return self.vocab.get_vector(self.c.lex.orth) return self.vocab.get_vector(self.c.lex.orth)
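With these lines removed, Token.has_vector and Token.vector no longer fall back to rows of doc.tensor (e.g. a tok2vec output) when the vocab has no static vectors; only vocab.vectors and user hooks count. A minimal check of the new behaviour (the blank pipeline and fake tensor are illustrative):

import numpy
from spacy.lang.en import English

nlp = English()                                    # no static vectors loaded
doc = nlp("hello world")
doc.tensor = numpy.ones((2, 4), dtype="float32")   # simulate a tok2vec output

token = doc[0]
# Previously the non-empty doc.tensor made has_vector True and returned
# doc.tensor[i] as the vector; after this change it does not.
assert not token.has_vector
assert token.vector.shape == (nlp.vocab.vectors_length,)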

View File

@ -11,7 +11,7 @@ def create_copy_from_base_model(
) -> Callable[[Language], Language]: ) -> Callable[[Language], Language]:
def copy_from_base_model(nlp): def copy_from_base_model(nlp):
if tokenizer: if tokenizer:
logger.info(f"Copying tokenizer from: {tokenizer}") logger.info("Copying tokenizer from: %s", tokenizer)
base_nlp = load_model(tokenizer) base_nlp = load_model(tokenizer)
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@ -23,7 +23,7 @@ def create_copy_from_base_model(
) )
) )
if vocab: if vocab:
logger.info(f"Copying vocab from: {vocab}") logger.info("Copying vocab from: %s", vocab)
# only reload if the vocab is from a different model # only reload if the vocab is from a different model
if tokenizer != vocab: if tokenizer != vocab:
base_nlp = load_model(vocab) base_nlp = load_model(vocab)

View File

@ -29,7 +29,7 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]: ) -> Callable[["Language"], Iterable[Example]]:
if path is None: if path is None:
raise ValueError(Errors.E913) raise ValueError(Errors.E913)
util.logger.debug(f"Loading corpus from path: {path}") util.logger.debug("Loading corpus from path: %s", path)
return Corpus( return Corpus(
path, path,
gold_preproc=gold_preproc, gold_preproc=gold_preproc,

View File

@@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced if p not in frozen_components]
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    logger.info("Pipeline: %s", nlp.pipe_names)
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
-            logger.info(f"Resuming training for: {resume_components}")
+            logger.info("Resuming training for: %s", resume_components)
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()
@@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
        if T["max_epochs"] == -1:
            sample_size = 100
            logger.debug(
-                f"Due to streamed train corpus, using only first {sample_size} "
-                f"examples for initialization. If necessary, provide all labels "
-                f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+                "Due to streamed train corpus, using only first %s examples for initialization. "
+                "If necessary, provide all labels in [initialize]. "
+                "More info: https://spacy.io/api/cli#init_labels",
+                sample_size,
            )
            nlp.initialize(
                lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
            )
        else:
            nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+        logger.info("Initialized pipeline components: %s", nlp.pipe_names)
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        for listener in getattr(
@@ -109,7 +110,7 @@ def init_vocab(
) -> None:
    if lookups:
        nlp.vocab.lookups = lookups
-        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
@@ -125,11 +126,11 @@ def init_vocab(
    else:
        oov_prob = DEFAULT_OOV_PROB
    nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
    logger.info("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
-        logger.info(f"Added vectors: {vectors}")
+        logger.info("Added vectors: %s", vectors)
    # warn if source model vectors are not identical
    sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
    vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
@@ -191,7 +192,7 @@ def init_tok2vec(
    if weights_data is not None:
        layer = get_tok2vec_ref(nlp, P)
        layer.from_bytes(weights_data)
-        logger.info(f"Loaded pretrained weights from {init_tok2vec}")
+        logger.info("Loaded pretrained weights from %s", init_tok2vec)
        return True
    return False
@@ -215,13 +216,13 @@ def convert_vectors(
        nlp.vocab.deduplicate_vectors()
    else:
        if vectors_loc:
-            logger.info(f"Reading vectors from {vectors_loc}")
+            logger.info("Reading vectors from %s", vectors_loc)
            vectors_data, vector_keys, floret_settings = read_vectors(
                vectors_loc,
                truncate,
                mode=mode,
            )
-            logger.info(f"Loaded vectors from {vectors_loc}")
+            logger.info("Loaded vectors from %s", vectors_loc)
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None and mode != VectorsMode.floret:

View File

@@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
        if subdir.exists():
            try:
                shutil.rmtree(str(subdir))
-                logger.debug(f"Removed existing output directory: {subdir}")
+                logger.debug("Removed existing output directory: %s", subdir)
            except Exception as e:
                raise IOError(Errors.E901.format(path=path)) from e

View File

@@ -33,6 +33,7 @@ import inspect
import pkgutil
import logging
import socket
+import stat

try:
    import cupy.random
@@ -139,8 +140,17 @@ class registry(thinc.registry):
        return func

    @classmethod
-    def find(cls, registry_name: str, func_name: str) -> Callable:
-        """Get info about a registered function from the registry."""
+    def find(
+        cls, registry_name: str, func_name: str
+    ) -> Dict[str, Optional[Union[str, int]]]:
+        """Find information about a registered function, including the
+        module and path to the file it's defined in, the line number and the
+        docstring, if available.
+
+        registry_name (str): Name of the catalogue registry.
+        func_name (str): Name of the registered function.
+        RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
+        """
        # We're overwriting this classmethod so we're able to provide more
        # specific error messages and implement a fallback to spacy-legacy.
        if not hasattr(cls, registry_name):
@@ -1030,8 +1040,15 @@ def make_tempdir() -> Generator[Path, None, None]:
    """
    d = Path(tempfile.mkdtemp())
    yield d

+    # On Windows, git clones use read-only files, which cause permission errors
+    # when being deleted. This forcibly fixes permissions.
+    def force_remove(rmfunc, path, ex):
+        os.chmod(path, stat.S_IWRITE)
+        rmfunc(path)
+
    try:
-        shutil.rmtree(str(d))
+        shutil.rmtree(str(d), onerror=force_remove)
    except PermissionError as e:
        warnings.warn(Warnings.W091.format(dir=d, msg=e))

View File

@@ -26,7 +26,7 @@ class Vocab:
    def __init__(
        self,
        lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
-        strings: Optional[Union[List[str], StringStore]] = ...,
+        strings: Optional[StringStore] = ...,
        lookups: Optional[Lookups] = ...,
        oov_prob: float = ...,
        writing_system: Dict[str, Any] = ...,

View File

@@ -49,9 +49,8 @@ cdef class Vocab:
    DOCS: https://spacy.io/api/vocab
    """

-    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
-                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
-                 **deprecated_kwargs):
+    def __init__(self, lex_attr_getters=None, strings=None, lookups=None,
+                 oov_prob=-20., writing_system=None, get_noun_chunks=None):
        """Create the vocabulary.

        lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -69,16 +68,19 @@ cdef class Vocab:
        self.cfg = {'oov_prob': oov_prob}
        self.mem = Pool()
        self._by_orth = PreshMap()
-        self.strings = StringStore()
        self.length = 0
-        if strings:
-            for string in strings:
-                _ = self[string]
+        if strings is None:
+            self.strings = StringStore()
+        else:
+            self.strings = strings
        self.lex_attr_getters = lex_attr_getters
        self.morphology = Morphology(self.strings)
        self.vectors = Vectors(strings=self.strings)
        self.lookups = lookups
-        self.writing_system = writing_system
+        if writing_system is None:
+            self.writing_system = {}
+        else:
+            self.writing_system = writing_system
        self.get_noun_chunks = get_noun_chunks

    property vectors:

View File

@@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
| `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.EmptyKB.v1 {id="EmptyKB.v1"}

A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance.

| Name                   | Description |
| ---------------------- | ----------- |
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
### spacy.EmptyKB.v2 {id="EmptyKB"}
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created. It
returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
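
As a rough sketch (not part of this commit), the default factory can be swapped via the entity linker's config; the `@misc` registry name used for `spacy.EmptyKB.v2` below is an assumption:

```python
# Hedged sketch: overriding the empty-KB factory through the component config.
# The "@misc" registry name for "spacy.EmptyKB.v2" is assumed here.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "entity_linker",
    config={"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}},
)
```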
### spacy.KBFromFile.v1 {id="KBFromFile"}

A function that reads an existing `KnowledgeBase` from file.

@@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
`CandidateGenerator` uses the text of a mention to find its potential aliases in
the `KnowledgeBase`. Note that this function is case-dependent.
### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
[`Span`](/api/span) objects denoting named entities, and returns a list of
plausible [`Candidate`](/api/kb/#candidate) objects per specified
[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
mention to find its potential aliases in the `KnowledgeBase`. Note that this
function is case-dependent.
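
For illustration, a custom batch generator with the signature described above could simply delegate to the per-mention lookup; the function name and implementation below are assumptions, not spaCy internals:

```python
from typing import Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span


def get_candidates_batch(
    kb: KnowledgeBase, mentions: Iterable[Span]
) -> Iterable[Iterable[Candidate]]:
    # Look up each mention independently and return the nested results.
    return [kb.get_candidates(mention) for mention in mentions]
```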
## Coreference {id="coref-architectures",tag="experimental"}

A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to

View File

@@ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir]

### project pull {id="project-pull",tag="command"}

Download all files or directories listed as `outputs` for commands, unless they
are already present locally. When searching for files in the remote, `pull`
won't just look at the output path, but will also consider the **command
string** and the **hashes of the dependencies**. For instance, let's say you've
previously pushed a checkpoint to the remote, but now you've changed some

View File

@@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):

| Symbol                                   | Description |
| ---------------------------------------- | ----------- |
| `A < B`                                  | `A` is the immediate dependent of `B`. |
| `A > B`                                  | `A` is the immediate head of `B`. |
| `A << B`                                 | `A` is the dependent in a chain to `B` following dep &rarr; head paths. |
| `A >> B`                                 | `A` is the head in a chain to `B` following head &rarr; dep paths. |
| `A . B`                                  | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B`                                 | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B`                                  | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B`                                 | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B`                                 | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B`                                 | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B`                                | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B`                                | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >+ B` <Tag variant="new">3.5.1</Tag>  | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A >- B` <Tag variant="new">3.5.1</Tag>  | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A >++ B`                                | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A >-- B`                                | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}

View File

@@ -214,6 +214,7 @@ alignment mode `"strict"`.
| `start`          | The index of the first character of the span. ~~int~~ |
| `end`            | The index of the last character after the span. ~~int~~ |
| `label`          | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| _keyword-only_   |  |
| `kb_id`          | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector`         | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
@@ -653,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).

## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}

Returns a tuple of the base noun phrases in the doc, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
does not permit other NPs to be nested within it so no NP-level coordination,
no prepositional phrases, and no relative clauses.

To customize the noun chunk iterator in a loaded pipeline, modify
[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -674,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
> assert chunks[1].text == "another phrase"
> ```

| Name        | Description                                  |
| ----------- | -------------------------------------------- |
| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |

## Doc.sents {id="sents",tag="property",model="sentences"}

Returns a tuple of the sentences in the document. Sentence spans have no label.

This property is only available when
[sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -696,9 +696,9 @@ will raise an error otherwise.
> assert [s.root.text for s in sents] == ["is", "'s"]
> ```

| Name        | Description                                |
| ----------- | ------------------------------------------ |
| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |

## Doc.has_vector {id="has_vector",tag="property",model="vectors"}

View File

@@ -53,20 +53,22 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("entity_linker", config=config)
> ```

| Setting                                              | Description |
| ---------------------------------------------------- | ----------- |
| `labels_discard`                                      | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `n_sents`                                             | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
| `incl_prior`                                          | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context`                                        | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model`                                               | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length`                                | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents`                                       | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
| `get_candidates`                                      | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `get_candidates_batch` <Tag variant="new">3.5</Tag>   | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
| `generate_empty_kb` <Tag variant="new">3.6</Tag>      | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
| `overwrite` <Tag variant="new">3.2</Tag>              | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag>                 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `save_activations` <Tag variant="new">4.0</Tag>       | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
| `threshold` <Tag variant="new">3.4</Tag>              | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |

```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py

View File

@@ -10,9 +10,9 @@ version: 3.5

The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
implements all of its methods. It stores all KB data in-memory and generates
[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
with entity names. It's highly optimized for both a low memory footprint and
speed of retrieval.

## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}

@@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.

## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}

Given a certain textual mention as input, retrieve a list of candidate entities
of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).

> #### Example
@@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps
> candidates = kb.get_candidates(doc[0:2])
> ```

| Name        | Description |
| ----------- | ----------- |
| `mention`   | The textual mention or alias. ~~Span~~ |
| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |

## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}

@@ -189,31 +189,16 @@ to you.
>
> ```python
> from spacy.lang.en import English
> from spacy.tokens import SpanGroup
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
> ```

| Name        | Description |
| ----------- | ----------- |
| `mentions`  | The textual mentions. ~~SpanGroup~~ |
| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate).
> #### Example
>
> ```python
> candidates = kb.get_alias_candidates("Douglas")
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ |
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}

View File

@@ -93,33 +93,17 @@ to you.
>
> ```python
> from spacy.lang.en import English
> from spacy.tokens import SpanGroup
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
> candidates = kb.get_candidates(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
> ```

| Name        | Description |
| ----------- | ----------- |
| `mentions`  | The textual mentions. ~~SpanGroup~~ |
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
<Infobox variant="warning">
This method is _not_ available from spaCy 3.5 onwards.
</Infobox>
From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
allow more flexibility in customizing knowledge bases. Some of its methods were
moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
one of those being `get_alias_candidates()`. This method is now available as
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
Note:
[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
defaults to
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
## KnowledgeBase.get_vector {id="get_vector",tag="method"}

Given a certain entity ID, retrieve its pretrained entity vector.

@@ -190,25 +174,25 @@ Restore the state of the knowledge base from a given directory. Note that the
| `exclude`   | List of components to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |

## InMemoryCandidate {id="candidate",tag="class"}

An `InMemoryCandidate` object refers to a textual mention (alias) that may or
may not be resolved to a specific entity from a `KnowledgeBase`. This will be
used as input for the entity linking algorithm which will disambiguate the
various candidates to the correct one. Each candidate `(alias, entity)` pair is
assigned to a certain prior probability.

### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}

Construct an `InMemoryCandidate` object. Usually this constructor is not called
directly, but instead these objects are returned by the `get_candidates` method
of the [`entity_linker`](/api/entitylinker) pipe.

> #### Example
>
> ```python
> from spacy.kb import InMemoryCandidate
> candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
> ```

| Name | Description |
@@ -216,10 +200,10 @@ but instead these objects are returned by the `get_candidates` method of the
| `kb`          | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
| `alias_hash`  | The hash of the entity alias. ~~int~~ |
| `prior_prob`  | The prior probability of the `alias` referring to the `entity`. ~~float~~ |

## InMemoryCandidate attributes {id="candidate-attributes"}

| Name            | Description |
| --------------- | ------------------------------------------------------------------------ |

View File

@@ -188,9 +188,10 @@ the character indices don't map to a valid span.
| Name                                            | Description |
| ----------------------------------------------- | ----------- |
| `start_idx`                                     | The index of the first character of the span. ~~int~~ |
| `end_idx`                                       | The index of the last character after the span. ~~int~~ |
| `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| _keyword-only_                                  |  |
| `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
@@ -274,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of
> assert ents[0].text == "Mr. Best"
> ```

| Name        | Description                                                   |
| ----------- | ------------------------------------------------------------- |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |

## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}

Returns a tuple of the base noun phrases in the span if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
does not permit other NPs to be nested within it so no NP-level coordination,
no prepositional phrases, and no relative clauses.

If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
has not been implemented for the given language, a `NotImplementedError` is
@@ -300,9 +300,9 @@ raised.
> assert chunks[0].text == "another phrase"
> ```

| Name        | Description                              |
| ----------- | ---------------------------------------- |
| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |

## Span.as_doc {id="as_doc",tag="method"}

@@ -524,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]

## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}

Returns a tuple of the sentences the span belongs to. This property is only
available when [sentence boundaries](/usage/linguistic-features#sbd) have been
set on the document by the `parser`, `senter`, `sentencizer` or some custom
function. It will raise an error otherwise.

If the span happens to cross sentence boundaries, all sentences the span
@@ -540,9 +540,9 @@ overlaps with will be returned.
> assert len(span.sents) == 2
> ```

| Name        | Description                                                    |
| ----------- | -------------------------------------------------------------- |
| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |

## Attributes {id="attributes"}

View File

@@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
integer IDs. This ensures that strings always map to the same ID, even from
different `StringStores`.
<Infobox variant="warning">
Note that a `StringStore` instance is not static. It increases in size as texts
with new tokens are processed.
</Infobox>
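
A minimal illustration of this behaviour, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab.strings)
nlp("A sentence with some previously unseen tokens.")
# Tokenizing the text has added new entries to the StringStore.
assert len(nlp.vocab.strings) > n_before
```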
## StringStore.\_\_init\_\_ {id="init",tag="method"}

Create the `StringStore`.

View File

@@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both
| `doc`       | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ |
## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}
Performs an update of the student pipe's model using the student's distillation
examples and sets the annotations of the teacher's distillation examples using
the teacher pipe.
Unlike other trainable pipes, the student pipe doesn't directly learn its
representations from the teacher. However, since downstream pipes that do
perform distillation expect the tok2vec annotations to be present on the
correct distillation examples, we need to ensure that they are set beforehand.
The distillation is performed on ~~Example~~ objects. The `Example.reference`
and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
same orthography. Even though the reference does not need to have gold
annotations, the teacher can add its own annotations when necessary.
This feature is experimental.
> #### Example
>
> ```python
> teacher_pipe = teacher.add_pipe("tok2vec")
> student_pipe = student.add_pipe("tok2vec")
> optimizer = nlp.resume_training()
> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~ |
| `examples` | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | Dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Tok2Vec.pipe {id="pipe",tag="method"}

Apply the pipe to a stream of documents. This usually happens under the hood

View File

@@ -355,22 +355,22 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="dep", options=options)
> ```

| Name               | Description |
| ------------------ | ----------- |
| `fine_grained`     | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma`        | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact`          | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color`            | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg`               | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font`             | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x`         | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
| `arrow_stroke`     | Width of arrow path in px. Defaults to `2`. ~~int~~ |
| `arrow_width`      | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
| `arrow_spacing`    | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
| `word_spacing`     | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
| `distance`         | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |

#### Named Entity Visualizer options {id="displacy_options-ent"}

View File

@@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
[`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
between `Doc` objects.
<Infobox variant="warning">
Note that a `Vocab` instance is not static. It increases in size as texts with
new tokens are processed.
</Infobox>
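
A minimal illustration of this behaviour, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab)
nlp("Processing text creates lexemes for previously unseen tokens.")
# The vocabulary has grown as a side effect of processing the text.
assert len(nlp.vocab) > n_before
```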
## Vocab.\_\_init\_\_ {id="init",tag="method"}

Create the vocabulary.

@@ -17,14 +24,15 @@ Create the vocabulary.
> #### Example
>
> ```python
> from spacy.strings import StringStore
> from spacy.vocab import Vocab
> vocab = Vocab(strings=StringStore(["hello", "world"]))
> ```

| Name               | Description |
| ------------------ | ----------- |
| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ |
| `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~ |
| `lookups`          | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
| `oov_prob`         | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `writing_system`   | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |

View File

@@ -22,17 +22,20 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,

<Infobox title="Important note" variant="warning">

To make them compact and fast, spaCy's small [pipeline packages](/models) (all
packages that end in `sm`) **don't ship with word vectors**. In order to use
`similarity()`, you need to download a larger pipeline package that includes
vectors:

```diff
- python -m spacy download en_core_web_sm
+ python -m spacy download en_core_web_md
```

In spaCy v3 and earlier, small pipeline packages supported `similarity()` by
backing off to context-sensitive tensors from the `tok2vec` component. These
tensors do not work well for this purpose and this backoff has been removed in
spaCy v4.

</Infobox>
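
A short usage sketch, assuming `en_core_web_md` has been downloaded as shown above:

```python
import spacy

nlp = spacy.load("en_core_web_md")
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")
# Vector-based similarity between the two documents.
print(doc1.similarity(doc2))
```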
Pipeline packages that come with built-in word vectors make them available as

View File

@@ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):

| Symbol                                   | Description |
| ---------------------------------------- | ----------- |
| `A < B`                                  | `A` is the immediate dependent of `B`. |
| `A > B`                                  | `A` is the immediate head of `B`. |
| `A << B`                                 | `A` is the dependent in a chain to `B` following dep &rarr; head paths. |
| `A >> B`                                 | `A` is the head in a chain to `B` following head &rarr; dep paths. |
| `A . B`                                  | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B`                                 | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B`                                  | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B`                                 | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B`                                 | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B`                                 | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B`                                | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B`                                | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
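
As a hedged sketch of how the new immediate-child operator can be used in a pattern (the example sentence and attribute values are illustrative only):

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # assumes this pipeline is installed
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">+",  # child of the verb that immediately follows it
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"},
    },
]
matcher.add("VERB_OBJECT", [pattern])
doc = nlp("She found it")
matches = matcher(doc)  # expected: one match covering "found" and "it"
```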
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}

@@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline")

The saved pipeline now includes the `"entity_ruler"` in its
[`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
file `patterns.jsonl` with the patterns. When you load the pipeline back in, all
pipeline components will be restored and deserialized including the entity
ruler. This lets you ship powerful pipeline packages with binary weights _and_
rules included!
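
For illustration, loading the saved pipeline back in (reusing the path from the example above) restores the ruler along with its patterns:

```python
import spacy

nlp = spacy.load("/path/to/pipeline")
assert "entity_ruler" in nlp.pipe_names
```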

View File

@@ -58,12 +58,12 @@ arcs.

</Infobox>

| Argument  | Description |
| --------- | ----------- |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color`   | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg`      | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |

For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).