Merge branch 'master' into pr/13418

2025-09-05 03:45:00 +03:00 · 2024-09-10 14:27:01 +02:00 · 2024-09-10 14:27:01 +02:00 · 37dd13a96b
commit 37dd13a96b
parent a8bf7d9036 7fbbb2002a
69 changed files with 5515 additions and 1063 deletions
--- a/.github/workflows/cibuildwheel.yml
+++ b/.github/workflows/cibuildwheel.yml
@ -0,0 +1,92 @@
+name: Build
+
+on:
+  push:
+    tags:
+      # GitHub's tag filters are glob-like patterns, not regular expressions:
+      # '+' means one or more of the preceding character, '.' is literal,
+      # and '**' matches zero or more of any character.
+      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
+      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
+jobs:
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        # macos-13 is an intel runner, macos-14 is apple silicon
+        os: [ubuntu-latest, windows-latest, macos-13]
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.19.1
+        env:
+          CIBW_SOME_OPTION: value
+        with:
+          package-dir: .
+          output-dir: wheelhouse
+          config-file: "{package}/pyproject.toml"
+      - uses: actions/upload-artifact@v4
+        with:
+          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
+          path: ./wheelhouse/*.whl
+
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build sdist
+        run: pipx run build --sdist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: cibw-sdist
+          path: dist/*.tar.gz
+  create_release:
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      checks: write
+      actions: read
+      issues: read
+      packages: write
+      pull-requests: read
+      repository-projects: read
+      statuses: read
+    steps:
+      - name: Get the tag name and determine if it's a prerelease
+        id: get_tag_info
+        # Splits the pushed tag into its kind prefix and its version, then
+        # exports FULL_TAG, TAG_NAME and IS_PRERELEASE for the later steps.
+        run: |
+          # Strip the ref prefix: refs/tags/release-v1.2.3 -> release-v1.2.3
+          FULL_TAG=${GITHUB_REF#refs/tags/}
+          if [[ $FULL_TAG == release-* ]]; then
+            TAG_NAME=${FULL_TAG#release-}
+            IS_PRERELEASE=false
+          elif [[ $FULL_TAG == prerelease-* ]]; then
+            TAG_NAME=${FULL_TAG#prerelease-}
+            IS_PRERELEASE=true
+          else
+            echo "Tag does not match expected patterns" >&2
+            exit 1
+          fi
+          # Lines appended to the GITHUB_ENV file become env.* in later steps.
+          # FULL_TAG keeps the release-/prerelease- prefix; TAG_NAME does not.
+          echo "FULL_TAG=$FULL_TAG" >> "$GITHUB_ENV"
+          echo "TAG_NAME=$TAG_NAME" >> "$GITHUB_ENV"
+          echo "IS_PRERELEASE=$IS_PRERELEASE" >> "$GITHUB_ENV"
+      - uses: actions/download-artifact@v4
+        with:
+          # unpacks all CIBW artifacts into dist/
+          pattern: cibw-*
+          path: dist
+          merge-multiple: true
+      - name: Create Draft Release
+        id: create_release
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          name: ${{ env.TAG_NAME }}
+          draft: true
+          prerelease: ${{ env.IS_PRERELEASE }}
+          files: "./dist/*" 
--- a/.github/workflows/explosionbot.yml
+++ b/.github/workflows/explosionbot.yml
@ -15,7 +15,7 @@ jobs:
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
      - name: Install and run explosion-bot
        run: |
--- a/.github/workflows/gputests.yml.disabled
+++ b/.github/workflows/gputests.yml.disabled
--- a/.github/workflows/lock.yml
+++ b/.github/workflows/lock.yml
@ -16,7 +16,7 @@ jobs:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
-      - uses: dessant/lock-threads@v4
+      - uses: dessant/lock-threads@v5
        with:
          process-only: 'issues'
          issue-inactive-days: '30'
--- a/.github/workflows/publish_pypi.yml
+++ b/.github/workflows/publish_pypi.yml
@ -0,0 +1,29 @@
+# The cibuildwheel workflow triggers when a release tag is pushed and
+# attaches the built wheels and sdist to a draft release; this workflow
+# triggers when that release is published.
+# The expected workflow is to push the tag, let the wheels upload to the
+# draft release, and then hit 'publish', which uploads to PyPI.
+
+on:
+  release:
+    types:
+      - published
+
+jobs:
+  upload_pypi:
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/spacy
+    permissions:
+      id-token: write
+      contents: read
+    if: github.event_name == 'release' && github.event.action == 'published'
+    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
+    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    steps:
+      - uses: robinraju/release-downloader@v1
+        with:
+          tag: ${{ github.event.release.tag_name }}
+          fileName: '*'
+          out-file-path: 'dist'
+      - uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/slowtests.yml.disabled
+++ b/.github/workflows/slowtests.yml.disabled
@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          ref: ${{ matrix.branch }}
      - name: Get commits from past 24 hours
--- a/.github/workflows/spacy_universe_alert.yml
+++ b/.github/workflows/spacy_universe_alert.yml
@ -18,7 +18,7 @@ jobs:
        run: |
          echo "$GITHUB_CONTEXT"

-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -25,13 +25,12 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"
-          architecture: x64

      - name: black
        run: |
@ -75,13 +74,12 @@ jobs:

    steps:
      - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
-          architecture: x64

      - name: Install dependencies
        run: |
--- a/.github/workflows/universe_validation.yml
+++ b/.github/workflows/universe_validation.yml
@ -20,13 +20,12 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"
-          architecture: x64

      - name: Validate website/meta/universe.json
        run: |
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/pyproject.toml
+++ b/pyproject.toml
@ -11,5 +11,58 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"

+[tool.cibuildwheel]
+build = "*"
+skip = "pp* cp36* cp37* cp38* *-win32"
+test-skip = ""
+free-threaded-support = false
+
+archs = ["native"]
+
+build-frontend = "default"
+config-settings = {}
+dependency-versions = "pinned"
+environment = { PIP_CONSTRAINT = "build-constraints.txt" }
+
+environment-pass = []
+build-verbosity = 0
+
+before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
+before-build = "pip install -r requirements.txt && python setup.py clean"
+repair-wheel-command = ""
+
+test-command = ""
+before-test = ""
+test-requires = []
+test-extras = []
+
+container-engine = "docker"
+
+manylinux-x86_64-image = "manylinux2014"
+manylinux-i686-image = "manylinux2014"
+manylinux-aarch64-image = "manylinux2014"
+manylinux-ppc64le-image = "manylinux2014"
+manylinux-s390x-image = "manylinux2014"
+manylinux-pypy_x86_64-image = "manylinux2014"
+manylinux-pypy_i686-image = "manylinux2014"
+manylinux-pypy_aarch64-image = "manylinux2014"
+
+musllinux-x86_64-image = "musllinux_1_2"
+musllinux-i686-image = "musllinux_1_2"
+musllinux-aarch64-image = "musllinux_1_2"
+musllinux-ppc64le-image = "musllinux_1_2"
+musllinux-s390x-image = "musllinux_1_2"
+
+[tool.cibuildwheel.linux]
+repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
+
+[tool.cibuildwheel.macos]
+repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
+
+[tool.cibuildwheel.windows]
+
+[tool.cibuildwheel.pyodide]
+
+
 [tool.isort]
 profile = "black"
--- a/requirements.txt
+++ b/requirements.txt
@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.10.0
+typer>=0.3.0,<1.0.0
 weasel>=0.1.0,<0.5.0
 # Third party dependencies
 numpy>=1.15.0; python_version < "3.9"
@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
--- a/setup.cfg
+++ b/setup.cfg
@ -22,6 +22,7 @@ classifiers =
    Programming Language :: Python :: 3.9
    Programming Language :: Python :: 3.10
    Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
    Topic :: Scientific/Engineering
 project_urls =
    Release notes = https://github.com/explosion/spaCy/releases
@ -55,7 +56,7 @@ install_requires =
    catalogue>=2.0.6,<2.1.0
    weasel>=0.1.0,<0.5.0
    # Third-party dependencies
-    typer>=0.3.0,<0.10.0
+    typer>=0.3.0,<1.0.0
    tqdm>=4.38.0,<5.0.0
    numpy>=1.15.0; python_version < "3.9"
    numpy>=1.19.0; python_version >= "3.9"
@ -65,7 +66,6 @@ install_requires =
    # Official Python utilities
    setuptools
    packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
    langcodes>=3.2.0,<4.0.0

 [options.entry_points]
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.4"
+__version__ = "3.8.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@ -39,7 +39,7 @@ def find_threshold_cli(
    # fmt: on
 ):
    """
-    Runs prediction trials for a trained model with varying tresholds to maximize
+    Runs prediction trials for a trained model with varying thresholds to maximize
    the specified metric. The search space for the threshold is traversed linearly
    from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
    (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
@ -81,7 +81,7 @@ def find_threshold(
    silent: bool = True,
 ) -> Tuple[float, float, Dict[float, float]]:
    """
-    Runs prediction trials for models with varying tresholds to maximize the specified metric.
+    Runs prediction trials for models with varying thresholds to maximize the specified metric.
    model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
    data_path (Path): Path to file with DocBin with docs to use for threshold search.
    pipe_name (str): Name of pipe to examine thresholds for.
--- a/spacy/lang/bo/init.py
+++ b/spacy/lang/bo/init.py
@ -0,0 +1,16 @@
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
+
+
+class TibetanDefaults(BaseDefaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+
+
+class Tibetan(Language):
+    lang = "bo"
+    Defaults = TibetanDefaults
+
+
+__all__ = ["Tibetan"]
--- a/spacy/lang/bo/examples.py
+++ b/spacy/lang/bo/examples.py
@ -0,0 +1,16 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.bo.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
+    "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
+    "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
+    "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
+    "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
+    "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
+]
--- a/spacy/lang/bo/lex_attrs.py
+++ b/spacy/lang/bo/lex_attrs.py
@ -0,0 +1,65 @@
+from ...attrs import LIKE_NUM
+
+# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals
+
+# Tibetan number words checked by like_num() with an exact string match,
+# so every entry must be free of stray whitespace.
+_num_words = [
+    "ཀླད་ཀོར་",
+    "གཅིག་",
+    "གཉིས་",
+    "གསུམ་",
+    "བཞི་",
+    "ལྔ་",
+    "དྲུག་",
+    "བདུན་",
+    "བརྒྱད་",
+    "དགུ་",
+    "བཅུ་",
+    "བཅུ་གཅིག་",
+    "བཅུ་གཉིས་",
+    "བཅུ་གསུམ་",
+    "བཅུ་བཞི་",
+    "བཅུ་ལྔ་",
+    "བཅུ་དྲུག་",
+    "བཅུ་བདུན་",
+    "བཅུ་བརྒྱད",
+    "བཅུ་དགུ་",
+    "ཉི་ཤུ་",
+    "སུམ་ཅུ",
+    "བཞི་བཅུ",
+    "ལྔ་བཅུ",
+    "དྲུག་ཅུ",
+    "བདུན་ཅུ",
+    "བརྒྱད་ཅུ",
+    "དགུ་བཅུ",
+    "བརྒྱ་",
+    "སྟོང་",
+    "ཁྲི་",
+    "ས་ཡ་",
+    "བྱེ་བ་",
+    "དུང་ཕྱུར་",
+    "ཐེར་འབུམ་",
+    "ཐེར་འབུམ་ཆེན་པོ་",
+    "ཁྲག་ཁྲིག་",
+    "ཁྲག་ཁྲིག་ཆེན་པོ་",
+]
+
+
+def like_num(text):
+    """
+    Report whether a token's text looks like a number.
+
+    One leading sign character ("+", "-", "±" or "~") is dropped and every
+    "," and "." is removed before checking. The text is number-like when
+    what remains is all digits, is a "digits/digits" fraction, or is one of
+    the entries in `_num_words`.
+    """
+    # Only the first character is ever treated as a sign.
+    stripped = text[1:] if text.startswith(("+", "-", "±", "~")) else text
+    # Separators are ignored so "1,000" and "3.5" count as digits.
+    stripped = stripped.replace(",", "").replace(".", "")
+    if stripped.isdigit():
+        return True
+    if stripped.count("/") == 1:
+        numerator, denominator = stripped.split("/")
+        if numerator.isdigit() and denominator.isdigit():
+            return True
+    return stripped in _num_words
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/bo/stop_words.py
+++ b/spacy/lang/bo/stop_words.py
@ -0,0 +1,198 @@
+# Source: https://zenodo.org/records/10148636
+
+STOP_WORDS = set(
+    """
+འི་
+།
+དུ་
+གིས་
+སོགས་
+ཏེ
+གི་
+རྣམས་
+ནི
+ཀུན་
+ཡི་
+འདི
+ཀྱི་
+སྙེད་
+པས་
+གཞན་
+ཀྱིས་
+ཡི
+ལ
+ནི་
+དང་
+སོགས
+ཅིང་
+ར
+དུ
+མི་
+སུ་
+བཅས་
+ཡོངས་
+ལས
+ཙམ་
+གྱིས་
+དེ་
+ཡང་
+མཐའ་དག་
+ཏུ་
+ཉིད་
+ས
+ཏེ་
+གྱི་
+སྤྱི
+དེ
+ཀ་
+ཡིན་
+ཞིང་
+འདི་
+རུང་
+རང་
+ཞིག་
+སྟེ
+སྟེ་
+ན་རེ
+ངམ
+ཤིང་
+དག་
+ཏོ
+རེ་
+འང་
+ཀྱང་
+ལགས་པ
+ཚུ
+དོ
+ཡིན་པ
+རེ
+ན་རེ་
+ཨེ་
+ཚང་མ
+ཐམས་ཅད་
+དམ་
+འོ་
+ཅིག་
+གྱིན་
+ཡིན
+ན
+ཁོ་ན་
+འམ་
+ཀྱིན་
+ལོ
+ཀྱིས
+བས་
+ལགས་
+ཤིག
+གིས
+ཀི་
+སྣ་ཚོགས་
+རྣམས
+སྙེད་པ
+ཡིས་
+གྱི
+གི
+བམ་
+ཤིག་
+རེ་རེ་
+ནམ
+མིན་
+ནམ་
+ངམ་
+རུ་
+འགའ་
+ཀུན
+ཤས་
+ཏུ
+ཡིས
+གིན་
+གམ་
+འོ
+ཡིན་པ་
+མིན
+ལགས
+གྱིས
+ཅང་
+འགའ
+སམ་
+ཞིག
+འང
+ལས་ཆེ་
+འཕྲལ་
+བར་
+རུ
+དང
+ཡ
+འག
+སམ
+ཀ
+ཅུང་ཟད་
+ཅིག
+ཉིད
+དུ་མ
+མ
+ཡིན་བ
+འམ
+མམ
+དམ
+དག
+ཁོ་ན
+ཀྱི
+ལམ
+ཕྱི་
+ནང་
+ཙམ
+ནོ་
+སོ་
+རམ་
+བོ་
+ཨང་
+ཕྱི
+ཏོ་
+ཚོ
+ལ་ལ་
+ཚོ་
+ཅིང
+མ་གི་
+གེ
+གོ
+ཡིན་ལུགས་
+རོ་
+བོ
+ལགས་པ་
+པས
+རབ་
+འི
+རམ
+བས
+གཞན
+སྙེད་པ་
+འབའ་
+མཾ་
+པོ
+ག་
+ག
+གམ
+སྤྱི་
+བམ
+མོ་
+ཙམ་པ་
+ཤ་སྟག་
+མམ་
+རེ་རེ
+སྙེད
+ཏམ་
+ངོ
+གྲང་
+ཏ་རེ
+ཏམ
+ཁ་
+ངེ་
+ཅོག་
+རིལ་
+ཉུང་ཤས་
+གིང་
+ཚ་
+ཀྱང
+""".split()
+)
--- a/spacy/lang/gd/init.py
+++ b/spacy/lang/gd/init.py
@ -0,0 +1,18 @@
+from typing import Optional
+
+from ...language import BaseDefaults, Language
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class ScottishDefaults(BaseDefaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    stop_words = STOP_WORDS
+
+
+class Scottish(Language):
+    lang = "gd"
+    Defaults = ScottishDefaults
+
+
+__all__ = ["Scottish"]
--- a/spacy/lang/gd/stop_words.py
+++ b/spacy/lang/gd/stop_words.py
@ -0,0 +1,388 @@
+# One entry per line: multi-word phrases such as "a chionn 's" are single
+# entries, so the text is split on newlines rather than on whitespace.
+# Anything from "#" to the end of a line is an annotation, not part of the
+# entry; it is stripped so that e.g. "'d # iad" is stored as "'d". Blank
+# lines (including the ones next to the triple quotes) are dropped.
+STOP_WORDS = {
+    entry
+    for entry in (
+        line.split("#", 1)[0].strip()
+        for line in """
+'ad
+'ar
+'d # iad
+'g # ag
+'ga
+'gam
+'gan
+'gar
+'gur
+'m # am
+'n # an
+'n seo
+'na
+'nad
+'nam
+'nan
+'nar
+'nuair
+'nur
+'s
+'sa
+'san
+'sann
+'se
+'sna
+a
+a'
+a'd # agad
+a'm # agam
+a-chèile
+a-seo
+a-sin
+a-siud
+a chionn
+a chionn 's
+a chèile
+a chéile
+a dh'
+a h-uile
+a seo
+ac' # aca
+aca
+aca-san
+acasan
+ach
+ag
+agad
+agad-sa
+agads'
+agadsa
+agaibh
+agaibhse
+againn
+againne
+agam
+agam-sa
+agams'
+agamsa
+agus
+aice
+aice-se
+aicese
+aig
+aig' # aige
+aige
+aige-san
+aigesan
+air
+air-san
+air neo
+airsan
+am
+an
+an seo
+an sin
+an siud
+an uair
+ann
+ann a
+ann a'
+ann a shin
+ann am
+ann an
+annad
+annam
+annam-s'
+annamsa
+anns
+anns an
+annta
+aon
+ar
+as
+asad
+asda
+asta
+b'
+bho
+bhon
+bhuaidhe # bhuaithe
+bhuainn
+bhuaipe
+bhuaithe
+bhuapa
+bhur
+brì
+bu
+c'à
+car son
+carson
+cha
+chan
+chionn
+choir
+chon
+chun
+chèile
+chéile
+chòir
+cia mheud
+ciamar
+co-dhiubh
+cuide
+cuin
+cuin'
+cuine
+cà
+cà'
+càil
+càit
+càit'
+càite
+cò
+cò mheud
+có
+d'
+da
+de
+dh'
+dha
+dhaibh
+dhaibh-san
+dhaibhsan
+dhan
+dhasan
+dhe
+dhen
+dheth
+dhi
+dhiom
+dhiot
+dhith
+dhiubh
+dhomh
+dhomh-s'
+dhomhsa
+dhu'sa # dhut-sa
+dhuibh
+dhuibhse
+dhuinn
+dhuinne
+dhuit
+dhut
+dhutsa
+dhut-sa
+dhà
+dhà-san
+dhàsan
+dhòmhsa
+diubh
+do
+docha
+don
+dà
+dè
+dè mar
+dé
+dé mar
+dòch'
+dòcha
+e
+eadar
+eatarra
+eatorra
+eile
+esan
+fa
+far
+feud
+fhad
+fheudar
+fhearr
+fhein
+fheudar
+fheàrr
+fhèin
+fhéin
+fhìn
+fo
+fodha
+fodhainn
+foipe
+fon
+fèin
+ga
+gach
+gam
+gan
+ge brith
+ged
+gu
+gu dè
+gu ruige
+gun
+gur
+gus
+i
+iad
+iadsan
+innte
+is
+ise
+le
+leam
+leam-sa
+leamsa
+leat
+leat-sa
+leatha
+leatsa
+leibh
+leis
+leis-san
+leoth'
+leotha
+leotha-san
+linn
+m'
+m'a
+ma
+mac
+man
+mar
+mas
+mathaid
+mi
+mis'
+mise
+mo
+mu
+mu 'n
+mun
+mur
+mura
+mus
+na
+na b'
+na bu
+na iad
+nach
+nad
+nam
+nan
+nar
+nas
+neo
+no
+nuair
+o
+o'n
+oir
+oirbh
+oirbh-se
+oirnn
+oirnne
+oirre
+on
+orm
+orm-sa
+ormsa
+orra
+orra-san
+orrasan
+ort
+os
+r'
+ri
+ribh
+rinn
+ris
+rithe
+rithe-se
+rium
+rium-sa
+riums'
+riumsa
+riut
+riuth'
+riutha
+riuthasan
+ro
+ro'n
+roimh
+roimhe
+romhainn
+romham
+romhpa
+ron
+ruibh
+ruinn
+ruinne
+sa
+san
+sann
+se
+seach
+seo
+seothach
+shin
+sibh
+sibh-se
+sibhse
+sin
+sineach
+sinn
+sinne
+siod
+siodach
+siud
+siudach
+sna # ann an
+sè
+t'
+tarsaing
+tarsainn
+tarsuinn
+thar
+thoigh
+thro
+thu
+thuc'
+thuca
+thugad
+thugaibh
+thugainn
+thugam
+thugamsa
+thuice
+thuige
+thus'
+thusa
+timcheall
+toigh
+toil
+tro
+tro' # troimh
+troimh
+troimhe
+tron
+tu
+tusa
+uair
+ud
+ugaibh
+ugam-s'
+ugam-sa
+uice
+uige
+uige-san
+umad
+unnta # ann an
+ur
+urrainn
+à
+às
+àsan
+á
+ás
+è
+ì
+ò
+ó
+""".split(
+            "\n"
+        )
+    )
+    if entry
+}
--- a/spacy/lang/gd/tokenizer_exceptions.py
+++ b/spacy/lang/gd/tokenizer_exceptions.py
--- a/spacy/lang/kmr/init.py
+++ b/spacy/lang/kmr/init.py
@ -0,0 +1,16 @@
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
+
+
+class KurmanjiDefaults(BaseDefaults):
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
+
+
+class Kurmanji(Language):
+    lang = "kmr"
+    Defaults = KurmanjiDefaults
+
+
+__all__ = ["Kurmanji"]
--- a/spacy/lang/kmr/examples.py
+++ b/spacy/lang/kmr/examples.py
@ -0,0 +1,17 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.kmr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+sentences = [
+    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
+    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
+    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
+    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
+    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
+    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
+    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
+    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
+]
--- a/spacy/lang/kmr/lex_attrs.py
+++ b/spacy/lang/kmr/lex_attrs.py
@ -0,0 +1,138 @@
+from ...attrs import LIKE_NUM
+
+_num_words = [
+    "sifir",
+    "yek",
+    "du",
+    "sê",
+    "çar",
+    "pênc",
+    "şeş",
+    "heft",
+    "heşt",
+    "neh",
+    "deh",
+    "yazde",
+    "dazde",
+    "sêzde",
+    "çarde",
+    "pazde",
+    "şazde",
+    "hevde",
+    "hejde",
+    "nozde",
+    "bîst",
+    "sî",
+    "çil",
+    "pêncî",
+    "şêst",
+    "heftê",
+    "heştê",
+    "nod",
+    "sed",
+    "hezar",
+    "milyon",
+    "milyar",
+]
+
+_ordinal_words = [
+    "yekem",
+    "yekemîn",
+    "duyem",
+    "duyemîn",
+    "sêyem",
+    "sêyemîn",
+    "çarem",
+    "çaremîn",
+    "pêncem",
+    "pêncemîn",
+    "şeşem",
+    "şeşemîn",
+    "heftem",
+    "heftemîn",
+    "heştem",
+    "heştemîn",
+    "nehem",
+    "nehemîn",
+    "dehem",
+    "dehemîn",
+    "yazdehem",
+    "yazdehemîn",
+    "dazdehem",
+    "dazdehemîn",
+    "sêzdehem",
+    "sêzdehemîn",
+    "çardehem",
+    "çardehemîn",
+    "pazdehem",
+    "pazdehemîn",
+    "şanzdehem",
+    "şanzdehemîn",
+    "hevdehem",
+    "hevdehemîn",
+    "hejdehem",
+    "hejdehemîn",
+    "nozdehem",
+    "nozdehemîn",
+    "bîstem",
+    "bîstemîn",
+    "sîyem",
+    "sîyemîn",
+    "çilem",
+    "çilemîn",
+    "pêncîyem",
+    "pênciyemîn",
+    "şêstem",
+    "şêstemîn",
+    "heftêyem",
+    "heftêyemîn",
+    "heştêyem",
+    "heştêyemîn",
+    "notem",
+    "notemîn",
+    "sedem",
+    "sedemîn",
+    "hezarem",
+    "hezaremîn",
+    "milyonem",
+    "milyonemîn",
+    "milyarem",
+    "milyaremîn",
+]
+
+
+def like_num(text):
+    """
+    Report whether a token's text looks like a number.
+
+    One leading sign character ("+", "-", "±" or "~") is dropped and every
+    "," and "." is removed. The text is number-like when what remains is
+    all digits or a "digits/digits" fraction, or when its lowercased form
+    is a cardinal in `_num_words`, an ordinal in `_ordinal_words`, or
+    digits followed by an ordinal ending (see `is_digit`).
+    """
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    cleaned = text.replace(",", "").replace(".", "")
+    if cleaned.isdigit():
+        return True
+    if cleaned.count("/") == 1:
+        left, right = cleaned.split("/")
+        if left.isdigit() and right.isdigit():
+            return True
+    # The word lists are lowercase, so compare case-insensitively.
+    lowered = cleaned.lower()
+    return (
+        lowered in _num_words
+        or lowered in _ordinal_words
+        or is_digit(lowered)
+    )
+
+
+def is_digit(text):
+    """
+    Report whether `text` is a run of digits followed by an ordinal ending.
+
+    The accepted endings are "em", "yem", "emîn" and "yemîn"; whatever
+    precedes the ending must be non-empty and consist only of digits.
+    """
+    suffixes = ("em", "yem", "emîn", "yemîn")
+    return any(
+        text.endswith(suffix) and text[: -len(suffix)].isdigit()
+        for suffix in suffixes
+    )
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/kmr/stop_words.py
+++ b/spacy/lang/kmr/stop_words.py
@ -0,0 +1,44 @@
+STOP_WORDS = set(
+    """
+û
+li
+bi
+di
+da
+de
+ji
+ku
+ew
+ez
+tu
+em
+hûn
+ew
+ev
+min
+te
+wî
+wê
+me
+we
+wan
+vê
+vî
+va
+çi
+kî
+kê
+çawa
+çima
+kengî
+li ku
+çend
+çiqas
+her
+hin
+gelek
+hemû
+kes
+tişt
+""".split()
+)
--- a/spacy/lang/mk/init.py
+++ b/spacy/lang/mk/init.py
@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return MacedonianLemmatizer(lookups)
-
-
 class Macedonian(Language):
    lang = "mk"
    Defaults = MacedonianDefaults
--- a/spacy/language.py
+++ b/spacy/language.py
@ -5,7 +5,7 @@ import multiprocessing as mp
 import random
 import traceback
 import warnings
-from contextlib import contextmanager
+from contextlib import ExitStack, contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
 from itertools import chain, cycle
@ -31,6 +31,7 @@ from typing import (
 )

 import srsly
+from cymem.cymem import Pool
 from thinc.api import Config, CupyOps, Optimizer, get_current_ops

 from . import about, ty, util
@ -2091,6 +2092,38 @@ class Language:
                util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
                tok2vec.remove_listener(listener, pipe_name)

+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+
+        mem (Optional[Pool]): Pool handed to the memory zones of the vocab,
+            the tokenizer and the pipeline components. A new Pool is created
+            if None.
+        YIELDS (Pool): The pool passed to the memory zones.
+
+        Example
+        -------
+        >>> with nlp.memory_zone():
+        ...     for doc in nlp.pipe(texts):
+        ...        process_my_doc(doc)
+        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks. The stack itself keeps every entered
+        # context open until the block exits, so nothing else needs to
+        # hold on to them.
+        with ExitStack() as stack:
+            stack.enter_context(self.vocab.memory_zone(mem))
+            # The tokenizer and the pipes only join in if they provide a
+            # memory_zone method.
+            if hasattr(self.tokenizer, "memory_zone"):
+                stack.enter_context(self.tokenizer.memory_zone(mem))
+            for _, pipe in self.pipeline:
+                if hasattr(pipe, "memory_zone"):
+                    stack.enter_context(pipe.memory_zone(mem))
+            yield mem
+
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -164,41 +164,44 @@ cdef class Lexeme:
        vector = self.vector
        return numpy.sqrt((vector**2).sum())

-    property vector:
+    @property
+    def vector(self):
        """A real-valued meaning representation.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the lexeme's semantics.
        """
-        def __get__(self):
        cdef int length = self.vocab.vectors_length
        if length == 0:
            raise ValueError(Errors.E010)
        return self.vocab.get_vector(self.c.orth)

-        def __set__(self, vector):
+    @vector.setter
+    def vector(self, vector):
        if len(vector) != self.vocab.vectors_length:
            raise ValueError(Errors.E073.format(new_length=len(vector),
                                                length=self.vocab.vectors_length))
        self.vocab.set_vector(self.c.orth, vector)

-    property rank:
+    @property
+    def rank(self):
        """RETURNS (str): Sequential ID of the lexeme's lexical type, used
            to index into tables, e.g. for word vectors."""
-        def __get__(self):
        return self.c.id

-        def __set__(self, value):
+    @rank.setter
+    def rank(self, value):
        self.c.id = value

-    property sentiment:
+    @property
+    def sentiment(self):
        """RETURNS (float): A scalar value indicating the positivity or
            negativity of the lexeme."""
-        def __get__(self):
        sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
        return sentiment_table.get(self.c.orth, 0.0)

-        def __set__(self, float x):
+    @sentiment.setter
+    def sentiment(self, float x):
        if "lexeme_sentiment" not in self.vocab.lookups:
            self.vocab.lookups.add_table("lexeme_sentiment")
        sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
@ -216,151 +219,166 @@ cdef class Lexeme:
        """RETURNS (str): The original verbatim text of the lexeme."""
        return self.orth_

-    property lower:
+    @property
+    def lower(self):
        """RETURNS (uint64): Lowercase form of the lexeme."""
-        def __get__(self):
        return self.c.lower

-        def __set__(self, attr_t x):
+    @lower.setter
+    def lower(self, attr_t x):
        self.c.lower = x

-    property norm:
+    @property
+    def norm(self):
        """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
            lexeme text.
        """
-        def __get__(self):
        return self.c.norm

-        def __set__(self, attr_t x):
+    @norm.setter
+    def norm(self, attr_t x):
        if "lexeme_norm" not in self.vocab.lookups:
            self.vocab.lookups.add_table("lexeme_norm")
        norm_table = self.vocab.lookups.get_table("lexeme_norm")
        norm_table[self.c.orth] = self.vocab.strings[x]
        self.c.norm = x

-    property shape:
+    @property
+    def shape(self):
        """RETURNS (uint64): Transform of the word's string, to show
            orthographic features.
        """
-        def __get__(self):
        return self.c.shape

-        def __set__(self, attr_t x):
+    @shape.setter
+    def shape(self, attr_t x):
        self.c.shape = x

-    property prefix:
+    @property
+    def prefix(self):
        """RETURNS (uint64): Length-N substring from the start of the word.
            Defaults to `N=1`.
        """
-        def __get__(self):
        return self.c.prefix

-        def __set__(self, attr_t x):
+    @prefix.setter
+    def prefix(self, attr_t x):
        self.c.prefix = x

-    property suffix:
+    @property
+    def suffix(self):
        """RETURNS (uint64): Length-N substring from the end of the word.
            Defaults to `N=3`.
        """
-        def __get__(self):
        return self.c.suffix

-        def __set__(self, attr_t x):
+    @suffix.setter
+    def suffix(self, attr_t x):
        self.c.suffix = x

-    property cluster:
+    @property
+    def cluster(self):
        """RETURNS (int): Brown cluster ID."""
-        def __get__(self):
        cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
        return cluster_table.get(self.c.orth, 0)

-        def __set__(self, int x):
+    @cluster.setter
+    def cluster(self, int x):
        cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
        cluster_table[self.c.orth] = x

-    property lang:
+    @property
+    def lang(self):
        """RETURNS (uint64): Language of the parent vocabulary."""
-        def __get__(self):
        return self.c.lang

-        def __set__(self, attr_t x):
+    @lang.setter
+    def lang(self, attr_t x):
        self.c.lang = x

-    property prob:
+    @property
+    def prob(self):
        """RETURNS (float): Smoothed log probability estimate of the lexeme's
            type."""
-        def __get__(self):
        prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
        settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
        default_oov_prob = settings_table.get("oov_prob", -20.0)
        return prob_table.get(self.c.orth, default_oov_prob)

-        def __set__(self, float x):
+    @prob.setter
+    def prob(self, float x):
        prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
        prob_table[self.c.orth] = x

-    property lower_:
+    @property
+    def lower_(self):
        """RETURNS (str): Lowercase form of the word."""
-        def __get__(self):
        return self.vocab.strings[self.c.lower]

-        def __set__(self, str x):
+    @lower_.setter
+    def lower_(self, str x):
        self.c.lower = self.vocab.strings.add(x)

-    property norm_:
+    @property
+    def norm_(self):
        """RETURNS (str): The lexeme's norm, i.e. a normalised form of the
            lexeme text.
        """
-        def __get__(self):
        return self.vocab.strings[self.c.norm]

-        def __set__(self, str x):
+    @norm_.setter
+    def norm_(self, str x):
        self.norm = self.vocab.strings.add(x)

-    property shape_:
+    @property
+    def shape_(self):
        """RETURNS (str): Transform of the word's string, to show
            orthographic features.
        """
-        def __get__(self):
        return self.vocab.strings[self.c.shape]

-        def __set__(self, str x):
+    @shape_.setter
+    def shape_(self, str x):
        self.c.shape = self.vocab.strings.add(x)

-    property prefix_:
+    @property
+    def prefix_(self):
        """RETURNS (str): Length-N substring from the start of the word.
            Defaults to `N=1`.
        """
-        def __get__(self):
        return self.vocab.strings[self.c.prefix]

-        def __set__(self, str x):
+    @prefix_.setter
+    def prefix_(self, str x):
        self.c.prefix = self.vocab.strings.add(x)

-    property suffix_:
+    @property
+    def suffix_(self):
        """RETURNS (str): Length-N substring from the end of the word.
            Defaults to `N=3`.
        """
-        def __get__(self):
        return self.vocab.strings[self.c.suffix]

-        def __set__(self, str x):
+    @suffix_.setter
+    def suffix_(self, str x):
        self.c.suffix = self.vocab.strings.add(x)

-    property lang_:
+    @property
+    def lang_(self):
        """RETURNS (str): Language of the parent vocabulary."""
-        def __get__(self):
        return self.vocab.strings[self.c.lang]

-        def __set__(self, str x):
+    @lang_.setter
+    def lang_(self, str x):
        self.c.lang = self.vocab.strings.add(x)

-    property flags:
+    @property
+    def flags(self):
        """RETURNS (uint64): Container of the lexeme's binary flags."""
-        def __get__(self):
        return self.c.flags

-        def __set__(self, flags_t x):
+    @flags.setter
+    def flags(self, flags_t x):
        self.c.flags = x

    @property
@ -368,154 +386,171 @@ cdef class Lexeme:
        """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
        return self.orth not in self.vocab.vectors

-    property is_stop:
+    @property
+    def is_stop(self):
        """RETURNS (bool): Whether the lexeme is a stop word."""
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_STOP)

-        def __set__(self, bint x):
+    @is_stop.setter
+    def is_stop(self, bint x):
        Lexeme.c_set_flag(self.c, IS_STOP, x)

-    property is_alpha:
+    @property
+    def is_alpha(self):
        """RETURNS (bool): Whether the lexeme consists of alphabetic
            characters. Equivalent to `lexeme.text.isalpha()`.
        """
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_ALPHA)

-        def __set__(self, bint x):
+    @is_alpha.setter
+    def is_alpha(self, bint x):
        Lexeme.c_set_flag(self.c, IS_ALPHA, x)

-    property is_ascii:
+    @property
+    def is_ascii(self):
        """RETURNS (bool): Whether the lexeme consists of ASCII characters.
            Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`.
        """
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_ASCII)

-        def __set__(self, bint x):
+    @is_ascii.setter
+    def is_ascii(self, bint x):
        Lexeme.c_set_flag(self.c, IS_ASCII, x)

-    property is_digit:
+    @property
+    def is_digit(self):
        """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
            to `lexeme.text.isdigit()`.
        """
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_DIGIT)

-        def __set__(self, bint x):
+    @is_digit.setter
+    def is_digit(self, bint x):
        Lexeme.c_set_flag(self.c, IS_DIGIT, x)

-    property is_lower:
+    @property
+    def is_lower(self):
        """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
            `lexeme.text.islower()`.
        """
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_LOWER)

-        def __set__(self, bint x):
+    @is_lower.setter
+    def is_lower(self, bint x):
        Lexeme.c_set_flag(self.c, IS_LOWER, x)

-    property is_upper:
+    @property
+    def is_upper(self):
        """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
            `lexeme.text.isupper()`.
        """
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_UPPER)

-        def __set__(self, bint x):
+    @is_upper.setter
+    def is_upper(self, bint x):
        Lexeme.c_set_flag(self.c, IS_UPPER, x)

-    property is_title:
+    @property
+    def is_title(self):
        """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
            `lexeme.text.istitle()`.
        """
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_TITLE)

-        def __set__(self, bint x):
+    @is_title.setter
+    def is_title(self, bint x):
        Lexeme.c_set_flag(self.c, IS_TITLE, x)

-    property is_punct:
+    @property
+    def is_punct(self):
        """RETURNS (bool): Whether the lexeme is punctuation."""
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_PUNCT)

-        def __set__(self, bint x):
+    @is_punct.setter
+    def is_punct(self, bint x):
        Lexeme.c_set_flag(self.c, IS_PUNCT, x)

-    property is_space:
+    @property
+    def is_space(self):
        """RETURNS (bool): Whether the lexeme consists of whitespace characters.
            Equivalent to `lexeme.text.isspace()`.
        """
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_SPACE)

-        def __set__(self, bint x):
+    @is_space.setter
+    def is_space(self, bint x):
        Lexeme.c_set_flag(self.c, IS_SPACE, x)

-    property is_bracket:
+    @property
+    def is_bracket(self):
        """RETURNS (bool): Whether the lexeme is a bracket."""
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_BRACKET)

-        def __set__(self, bint x):
+    @is_bracket.setter
+    def is_bracket(self, bint x):
        Lexeme.c_set_flag(self.c, IS_BRACKET, x)

-    property is_quote:
+    @property
+    def is_quote(self):
        """RETURNS (bool): Whether the lexeme is a quotation mark."""
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_QUOTE)

-        def __set__(self, bint x):
+    @is_quote.setter
+    def is_quote(self, bint x):
        Lexeme.c_set_flag(self.c, IS_QUOTE, x)

-    property is_left_punct:
+    @property
+    def is_left_punct(self):
        """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)

-        def __set__(self, bint x):
+    @is_left_punct.setter
+    def is_left_punct(self, bint x):
        Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

-    property is_right_punct:
+    @property
+    def is_right_punct(self):
        """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)

-        def __set__(self, bint x):
+    @is_right_punct.setter
+    def is_right_punct(self, bint x):
        Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)

-    property is_currency:
+    @property
+    def is_currency(self):
        """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_CURRENCY)

-        def __set__(self, bint x):
+    @is_currency.setter
+    def is_currency(self, bint x):
        Lexeme.c_set_flag(self.c, IS_CURRENCY, x)

-    property like_url:
+    @property
+    def like_url(self):
        """RETURNS (bool): Whether the lexeme resembles a URL."""
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, LIKE_URL)

-        def __set__(self, bint x):
+    @like_url.setter
+    def like_url(self, bint x):
        Lexeme.c_set_flag(self.c, LIKE_URL, x)

-    property like_num:
+    @property
+    def like_num(self):
        """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
            "10", "ten", etc.
        """
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, LIKE_NUM)

-        def __set__(self, bint x):
+    @like_num.setter
+    def like_num(self, bint x):
        Lexeme.c_set_flag(self.c, LIKE_NUM, x)

-    property like_email:
+    @property
+    def like_email(self):
        """RETURNS (bool): Whether the lexeme resembles an email address."""
-        def __get__(self):
        return Lexeme.c_check_flag(self.c, LIKE_EMAIL)

-        def __set__(self, bint x):
+    @like_email.setter
+    def like_email(self, bint x):
        Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@ -203,7 +203,7 @@ cdef class ArcEagerGold:
    def __init__(self, ArcEager moves, StateClass stcls, Example example):
        self.mem = Pool()
        heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
+        labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels]
        sent_starts = _get_aligned_sent_starts(example)
        assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
        self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
--- a/spacy/pipeline/_parser_internals/nonproj.pyx
+++ b/spacy/pipeline/_parser_internals/nonproj.pyx
@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc):
            new_label, head_label = label.split(DELIMITER)
            new_head = _find_new_head(doc[i], head_label)
            doc.c[i].head = new_head.i - i
-            doc.c[i].dep = doc.vocab.strings.add(new_label)
+            doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False)
    set_children_from_heads(doc.c, 0, doc.length)
    return doc

--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@ -11,7 +11,6 @@ from .. import util
 from ..errors import Errors
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
-from ..ml import empty_kb
 from ..scorer import Scorer
 from ..tokens import Doc, Span
 from ..training import Example, validate_examples, validate_get_examples
@ -105,7 +104,7 @@ def make_entity_linker(
        ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
    generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
    scorer (Optional[Callable]): The scoring method.
-    use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
+    use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
        component must provide entity annotations.
    candidates_batch_size (int): Size of batches for entity candidate generation.
    threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
@ -235,7 +234,6 @@ class EntityLinker(TrainablePipe):
        self.cfg: Dict[str, Any] = {"overwrite": overwrite}
        self.distance = CosineDistance(normalize=False)
        self.kb = generate_empty_kb(self.vocab, entity_vector_length)
-        self.scorer = scorer
        self.use_gold_ents = use_gold_ents
        self.candidates_batch_size = candidates_batch_size
        self.threshold = threshold
@ -243,6 +241,37 @@ class EntityLinker(TrainablePipe):
        if candidates_batch_size < 1:
            raise ValueError(Errors.E1044)

+        def _score_with_ents_set(examples: Iterable[Example], **kwargs):
+            # Because of how spaCy works, we can't just score immediately, because Language.evaluate
+            # calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline.
+            if not scorer:
+                return scorer
+            if not self.use_gold_ents:
+                return scorer(examples, **kwargs)
+            else:
+                examples = self._ensure_ents(examples)
+                docs = self.pipe(
+                    (eg.predicted for eg in examples),
+                )
+                for eg, doc in zip(examples, docs):
+                    eg.predicted = doc
+                return scorer(examples, **kwargs)
+
+        self.scorer = _score_with_ents_set
+
+    def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]:
+        """Return examples whose predicted docs carry the aligned gold entities.
+
+        If `use_gold_ents` is false, the input is returned as-is. Otherwise
+        every example is copied and the entities obtained from
+        `eg.get_aligned_ents_and_ner()` are assigned to the copy's
+        `predicted.ents`.
+
+        examples (Iterable[Example]): The examples to prepare.
+        RETURNS (Iterable[Example]): The input itself, or a list of copies.
+        """
+        if not self.use_gold_ents:
+            return examples
+
+        new_examples = []
+        for eg in examples:
+            ents, _ = eg.get_aligned_ents_and_ner()
+            new_eg = eg.copy()
+            new_eg.predicted.ents = ents
+            new_examples.append(new_eg)
+        return new_examples
+
    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
        """Define the KB of this pipe by providing a function that will
        create it using this object's vocab."""
@ -284,11 +313,9 @@ class EntityLinker(TrainablePipe):
        nO = self.kb.entity_vector_length
        doc_sample = []
        vector_sample = []
-        for eg in islice(get_examples(), 10):
+        examples = self._ensure_ents(islice(get_examples(), 10))
+        for eg in examples:
            doc = eg.x
-            if self.use_gold_ents:
-                ents, _ = eg.get_aligned_ents_and_ner()
-                doc.ents = ents
            doc_sample.append(doc)
            vector_sample.append(self.model.ops.alloc1f(nO))
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
@ -354,31 +381,17 @@ class EntityLinker(TrainablePipe):
        losses.setdefault(self.name, 0.0)
        if not examples:
            return losses
+        examples = self._ensure_ents(examples)
        validate_examples(examples, "EntityLinker.update")

-        set_dropout_rate(self.model, drop)
-        docs = [eg.predicted for eg in examples]
-        # save to restore later
-        old_ents = [doc.ents for doc in docs]
-
-        for doc, ex in zip(docs, examples):
-            if self.use_gold_ents:
-                ents, _ = ex.get_aligned_ents_and_ner()
-                doc.ents = ents
-            else:
-                # only keep matching ents
-                doc.ents = ex.get_matching_ents()
-
        # make sure we have something to learn from, if not, short-circuit
        if not self.batch_has_learnable_example(examples):
            return losses

+        set_dropout_rate(self.model, drop)
+        docs = [eg.predicted for eg in examples]
        sentence_encodings, bp_context = self.model.begin_update(docs)

-        # now restore the ents
-        for doc, old in zip(docs, old_ents):
-            doc.ents = old
-
        loss, d_scores = self.get_loss(
            sentence_encodings=sentence_encodings, examples=examples
        )
@ -386,11 +399,13 @@ class EntityLinker(TrainablePipe):
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += loss
+
        return losses

    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
        validate_examples(examples, "EntityLinker.get_loss")
        entity_encodings = []
+        # We assume that get_loss is called with gold ents set in the examples if need be
        eidx = 0  # indices in gold entities to keep
        keep_ents = []  # indices in sentence_encodings to keep

--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -25,5 +25,7 @@ cdef class StringStore:
    cdef vector[hash_t] keys
    cdef public PreshMap _map

-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient) 
+    cdef vector[hash_t] _transient_keys
+    cdef Pool _non_temp_mem
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -1,9 +1,14 @@
 # cython: infer_types=True
 # cython: profile=False
 cimport cython
+
+from contextlib import contextmanager
+from typing import Iterator, List, Optional
+
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash32, hash64
+from preshed.maps cimport map_clear

 import srsly

@ -119,10 +124,11 @@ cdef class StringStore:
        strings (iterable): A sequence of unicode strings to add to the store.
        """
        self.mem = Pool()
+        self._non_temp_mem = self.mem
        self._map = PreshMap()
        if strings is not None:
            for string in strings:
-                self.add(string)
+                self.add(string, allow_transient=False)

    def __getitem__(self, object string_or_id):
        """Retrieve a string from a given hash, or vice versa.
@ -152,10 +158,13 @@ cdef class StringStore:
                return SYMBOLS_BY_INT[str_hash]
            else:
                utf8str = <Utf8Str*>self._map.get(str_hash)
+                if utf8str is NULL:
+                    raise KeyError(Errors.E018.format(hash_value=string_or_id))
+                else:
+                    return decode_Utf8Str(utf8str)
        else:
            # TODO: Raise an error instead
            utf8str = <Utf8Str*>self._map.get(string_or_id)
-
            if utf8str is NULL:
                raise KeyError(Errors.E018.format(hash_value=string_or_id))
            else:
@ -175,12 +184,46 @@ cdef class StringStore:
        else:
            return self[key]

-    def add(self, string):
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self.keys.size() + self._transient_keys.size()
+
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+
+        mem (Optional[Pool]): Pool to allocate transient strings from. A new
+            pool is created if none is given.
+        YIELDS (Pool): The pool used for allocations inside the zone.
+        """
+        if mem is None:
+            mem = Pool()
+        self.mem = mem
+        try:
+            yield mem
+        finally:
+            # Run the cleanup even if the body of the `with` block raised.
+            # Otherwise `self.mem` would stay on the temporary pool, so every
+            # later `add()` would keep being treated as transient, and the
+            # transient keys would never be removed from the map.
+            for key in self._transient_keys:
+                map_clear(self._map.c_map, key)
+            self._transient_keys.clear()
+            self.mem = self._non_temp_mem
+
+    def add(self, string: str, allow_transient: Optional[bool] = None) -> int:
        """Add a string to the StringStore.

        string (str): The string to add.
+        allow_transient (bool): Allow the string to be stored in the 'transient'
+          map, which will be flushed at the end of the memory zone. Strings
+          encountered during arbitrary text processing should be added
+          with allow_transient=True, while labels and other strings used
+          internally should not.
        RETURNS (uint64): The string's hash value.
        """
+        if allow_transient is None:
+            allow_transient = self.mem is not self._non_temp_mem
        cdef hash_t str_hash
        if isinstance(string, str):
            if string in SYMBOLS_BY_STR:
@ -188,22 +231,26 @@ cdef class StringStore:

            string = string.encode("utf8")
            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
        elif isinstance(string, bytes):
            if string in SYMBOLS_BY_STR:
                return SYMBOLS_BY_STR[string]
            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
        else:
            raise TypeError(Errors.E017.format(value_type=type(string)))
        return str_hash

    def __len__(self):
        """The number of strings in the store.

        RETURNS (int): The number of strings in the store.
        """
-        return self.keys.size()
+        return self.keys.size() + self._transient_keys.size()

    def __contains__(self, string_or_id not None):
        """Check whether a string or ID is in the store.
@ -222,12 +269,17 @@ cdef class StringStore:
            pass
        else:
            # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
-
+            if self._map.get(string_or_id) is not NULL:
+                return True
+            else:
+                return False
        if str_hash < len(SYMBOLS_BY_INT):
            return True
        else:
-            return self._map.get(str_hash) is not NULL
+            if self._map.get(str_hash) is not NULL:
+                return True
+            else:
+                return False

    def __iter__(self):
        """Iterate over the strings in the store, in order.
@ -240,12 +292,29 @@ cdef class StringStore:
            key = self.keys[i]
            utf8str = <Utf8Str*>self._map.get(key)
            yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
+        for i in range(self._transient_keys.size()):
+            key = self._transient_keys[i]
+            utf8str = <Utf8Str*>self._map.get(key)
+            yield decode_Utf8Str(utf8str)

    def __reduce__(self):
        strings = list(self)
        return (StringStore, (strings,), None, None, None)

+    def values(self) -> List[int]:
+        """Return the hashes of the stored strings in insertion order:
+        first the permanent keys, then the transient ones.
+
+        RETURNS (List[int]): A list of string hashes.
+        """
+        cdef int i
+        # The permanent key vector is the `keys` attribute, as used by
+        # `__len__` and `__iter__`; the class has no `_keys` attribute.
+        hashes = [None] * self.keys.size()
+        for i in range(self.keys.size()):
+            hashes[i] = self.keys[i]
+        transient_hashes = [None] * self._transient_keys.size()
+        for i in range(self._transient_keys.size()):
+            transient_hashes[i] = self._transient_keys[i]
+        return hashes + transient_hashes
+
    def to_disk(self, path):
        """Save the current state to a directory.

@ -269,7 +338,7 @@ cdef class StringStore:
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
        return self

    def to_bytes(self, **kwargs):
@ -289,23 +358,25 @@ cdef class StringStore:
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
        return self

    def _reset_and_load(self, strings):
        self.mem = Pool()
+        self._non_temp_mem = self.mem
        self._map = PreshMap()
        self.keys.clear()
+        self._transient_keys.clear()
        for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)

-    cdef const Utf8Str* intern_unicode(self, str py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
        # 0 means missing, but we don't bother offsetting the index.
        cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+        return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)

    @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
        # TODO: This function's API/behaviour is an unholy mess...
        # 0 means missing, but we don't bother offsetting the index.
        cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
@ -314,5 +385,8 @@ cdef class StringStore:
            return value
        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
        self._map.set(key, value)
+        if allow_transient and self.mem is not self._non_temp_mem:
+            self._transient_keys.push_back(key)
+        else:
            self.keys.push_back(key)
        return value
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -81,6 +81,11 @@ def bn_tokenizer():
    return get_lang_class("bn")().tokenizer


+@pytest.fixture(scope="session")
+def bo_tokenizer():
+    return get_lang_class("bo")().tokenizer
+
+
@pytest.fixture(scope="session")
 def ca_tokenizer():
    return get_lang_class("ca")().tokenizer
--- a/spacy/tests/lang/bo/__init__.py
+++ b/spacy/tests/lang/bo/__init__.py
--- a/spacy/tests/lang/bo/test_text.py
+++ b/spacy/tests/lang/bo/test_text.py
@ -0,0 +1,21 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("999.0", True),
+        ("གཅིག་", True),
+        ("གཉིས་", True),
+        ("ཀླད་ཀོར་", True),
+        ("བཅུ་གཅིག་", True),
+        ("ཁྱི་", False),
+        (",", False),
+    ],
+)
+def test_lex_attrs_like_number(bo_tokenizer, text, match):
+    tokens = bo_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
--- a/spacy/tests/lang/kmr/__init__.py
+++ b/spacy/tests/lang/kmr/__init__.py
--- a/spacy/tests/lang/kmr/test_text.py
+++ b/spacy/tests/lang/kmr/test_text.py
@ -0,0 +1,27 @@
+import pytest
+
+from spacy.lang.kmr.lex_attrs import like_num
+
+
+@pytest.mark.parametrize(
+    "word",
+    [
+        "yekem",
+        "duyemîn",
+        "100em",
+        "dehem",
+        "sedemîn",
+        "34em",
+        "30yem",
+        "20emîn",
+        "50yemîn",
+    ],
+)
+def test_kmr_lex_attrs_like_number_for_ordinal(word):
+    assert like_num(word)
+
+
+@pytest.mark.parametrize("word", ["deh"])
+def test_kmr_lex_attrs_capitals(word):
+    assert like_num(word)
+    assert like_num(word.upper())
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
 # fmt: on


--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -717,7 +717,7 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on


-def test_overfitting_IO():
+def test_overfitting_IO_gold_entities():
    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
    nlp = English()
    vector_length = 3
@ -744,7 +744,9 @@ def test_overfitting_IO():
        return mykb

    # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker = nlp.add_pipe(
+        "entity_linker", last=True, config={"use_gold_ents": True}
+    )
    assert isinstance(entity_linker, EntityLinker)
    entity_linker.set_kb(create_kb)
    assert "Q2146908" in entity_linker.vocab.strings
@ -807,6 +809,107 @@ def test_overfitting_IO():
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

+    eval = nlp.evaluate(train_examples)
+    assert "nel_macro_p" in eval
+    assert "nel_macro_r" in eval
+    assert "nel_macro_f" in eval
+    assert "nel_micro_p" in eval
+    assert "nel_micro_r" in eval
+    assert "nel_micro_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+
+
+def test_overfitting_IO_with_ner():
+    # Simple test to try and overfit the NER and NEL component in combination - ensuring the ML models work correctly
+    nlp = English()
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    # NOTE(review): `train_examples` is rebuilt from scratch further down, once
+    # the pipes have been added, so the list built here looks unused - confirm
+    # and remove if so.
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the NER and EL components and add them to the pipeline
+    ner = nlp.add_pipe("ner", first=True)
+    entity_linker = nlp.add_pipe(
+        "entity_linker", last=True, config={"use_gold_ents": False}
+    )
+    entity_linker.set_kb(create_kb)
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            ner.add_label(ent[2])
+    optimizer = nlp.initialize()
+
+    # train the NER and NEL pipes
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.001
+    assert losses["entity_linker"] < 0.001
+
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+
+    # test the trained model
+    test_text = "Russ Cochran captured his first major title with his son as caddie."
+    doc = nlp(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "Russ Cochran"
+    assert ents[0].label_ == "PERSON"
+    assert ents[0].kb_id_ != "NIL"
+
+    # TODO: below assert is still flaky - EL doesn't properly overfit quite yet
+    # assert ents[0].kb_id_ == "Q2146908"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        assert nlp2.pipe_names == nlp.pipe_names
+        doc2 = nlp2(test_text)
+        ents2 = doc2.ents
+        assert len(ents2) == 1
+        assert ents2[0].text == "Russ Cochran"
+        assert ents2[0].label_ == "PERSON"
+        assert ents2[0].kb_id_ != "NIL"
+
+    # Evaluate on the training data and check that both NER and NEL scores are reported
+    eval = nlp.evaluate(train_examples)
+    assert "nel_macro_f" in eval
+    assert "nel_micro_f" in eval
+    assert "ents_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "ents_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+    assert "PERSON" in eval["ents_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+    assert eval["ents_f"] > 0
+

 def test_kb_serialization():
    # Test that the KB can be used in a pipeline with a different vocab
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@ -329,7 +329,7 @@ def test_language_pipe_error_handler(n_process):
        nlp.set_error_handler(raise_error)
        with pytest.raises(ValueError):
            list(nlp.pipe(texts, n_process=n_process))
-        # set explicitely to ignoring
+        # set explicitly to ignoring
        nlp.set_error_handler(ignore_error)
        docs = list(nlp.pipe(texts, n_process=n_process))
        assert len(docs) == 0
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@ -18,6 +18,7 @@ LANGUAGES = [
    pytest.param("ar", marks=pytest.mark.slow()),
    pytest.param("bg", marks=pytest.mark.slow()),
    "bn",
+    pytest.param("bo", marks=pytest.mark.slow()),
    pytest.param("ca", marks=pytest.mark.slow()),
    pytest.param("cs", marks=pytest.mark.slow()),
    pytest.param("da", marks=pytest.mark.slow()),
@ -57,6 +58,7 @@ LANGUAGES = [
    pytest.param("tr", marks=pytest.mark.slow()),
    pytest.param("tt", marks=pytest.mark.slow()),
    pytest.param("ur", marks=pytest.mark.slow()),
+    pytest.param("kmr", marks=pytest.mark.slow()),
 ]


--- a/spacy/tests/vocab_vectors/test_memory_zone.py
+++ b/spacy/tests/vocab_vectors/test_memory_zone.py
@ -0,0 +1,36 @@
+from spacy.vocab import Vocab
+
+
+def test_memory_zone_no_insertion():
+    """Test that the vocab can still create a lexeme after a memory zone
+    in which nothing was inserted has been exited."""
+    vocab = Vocab()
+    with vocab.memory_zone():
+        pass
+    lex = vocab["horse"]
+    assert lex.text == "horse"
+
+
+def test_memory_zone_insertion():
+    """Test that a word first looked up inside the memory zone is no longer
+    in the vocab once the zone exits, while a word added beforehand stays."""
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+    assert "dog" in vocab
+    assert "horse" not in vocab
+
+
+def test_memory_zone_redundant_insertion():
+    """Test that if we insert an already-existing word while
+    in the memory zone, it stays persistent"""
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+        _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@ -25,9 +25,7 @@ cdef class Tokenizer:
    cdef PhraseMatcher _special_matcher
    # TODO convert to bool in v4
    cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef public int max_cache_size

    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
    cdef int _apply_special_cases(self, Doc doc) except -1
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -30,7 +30,7 @@ cdef class Tokenizer:
    """
    def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                 suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.

        vocab (Vocab): A storage container for lexical types.
@ -50,6 +50,7 @@ cdef class Tokenizer:
        faster_heuristics (bool): Whether to restrict the final
            Matcher-based pass for rules to those containing affixes or space.
            Defaults to True.
+        max_cache_size (int): Maximum number of tokenization chunks to cache.

        EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
@ -69,52 +70,59 @@ cdef class Tokenizer:
        self._rules = {}
        self._special_matcher = PhraseMatcher(self.vocab)
        self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size

-    property token_match:
-        def __get__(self):
+    @property
+    def token_match(self):
        return self._token_match

-        def __set__(self, token_match):
+    @token_match.setter
+    def token_match(self, token_match):
        self._token_match = token_match
        self._reload_special_cases()

-    property url_match:
-        def __get__(self):
+    @property
+    def url_match(self):
        return self._url_match

-        def __set__(self, url_match):
+    @url_match.setter
+    def url_match(self, url_match):
        self._url_match = url_match
        self._reload_special_cases()

-    property prefix_search:
-        def __get__(self):
+    @property
+    def prefix_search(self):
        return self._prefix_search

-        def __set__(self, prefix_search):
+    @prefix_search.setter
+    def prefix_search(self, prefix_search):
        self._prefix_search = prefix_search
        self._reload_special_cases()

-    property suffix_search:
-        def __get__(self):
+    @property
+    def suffix_search(self):
        return self._suffix_search

-        def __set__(self, suffix_search):
+    @suffix_search.setter
+    def suffix_search(self, suffix_search):
        self._suffix_search = suffix_search
        self._reload_special_cases()

-    property infix_finditer:
-        def __get__(self):
+    @property
+    def infix_finditer(self):
        return self._infix_finditer

-        def __set__(self, infix_finditer):
+    @infix_finditer.setter
+    def infix_finditer(self, infix_finditer):
        self._infix_finditer = infix_finditer
        self._reload_special_cases()

-    property rules:
-        def __get__(self):
+    @property
+    def rules(self):
        return self._rules

-        def __set__(self, rules):
+    @rules.setter
+    def rules(self, rules):
        self._rules = {}
        self._flush_cache()
        self._flush_specials()
@ -122,11 +130,12 @@ cdef class Tokenizer:
        self._specials = PreshMap()
        self._load_special_cases(rules)

-    property faster_heuristics:
-        def __get__(self):
+    @property
+    def faster_heuristics(self):
        return bool(self._faster_heuristics)

-        def __set__(self, faster_heuristics):
+    @faster_heuristics.setter
+    def faster_heuristics(self, faster_heuristics):
        self._faster_heuristics = bool(faster_heuristics)
        self._reload_special_cases()

@ -390,6 +399,7 @@ cdef class Tokenizer:
                                   has_special, with_special_cases)
        self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                            with_special_cases)
+        if len(self._cache) < self.max_cache_size:
            self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                              tokens.length - orig_size)

@ -507,8 +517,7 @@ cdef class Tokenizer:
        if n <= 0:
            # avoid mem alloc of zero length
            return 0
-        for i in range(n):
-            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
+        if self.vocab.in_memory_zone:
            return 0
        # See #1250
        if has_special[0]:
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -667,7 +667,8 @@ cdef class Doc:
        else:
            return False

-    property vector:
+    @property
+    def vector(self):
        """A real-valued meaning representation. Defaults to an average of the
        token vectors.

@ -676,7 +677,6 @@ cdef class Doc:

        DOCS: https://spacy.io/api/doc#vector
        """
-        def __get__(self):
        if "vector" in self.user_hooks:
            return self.user_hooks["vector"](self)
        if self._vector is not None:
@ -694,17 +694,18 @@ cdef class Doc:
        else:
            return xp.zeros((self.vocab.vectors_length,), dtype="float32")

-        def __set__(self, value):
+    @vector.setter
+    def vector(self, value):
        self._vector = value

-    property vector_norm:
+    @property
+    def vector_norm(self):
        """The L2 norm of the document's vector representation.

        RETURNS (float): The L2 norm of the vector representation.

        DOCS: https://spacy.io/api/doc#vector_norm
        """
-        def __get__(self):
        if "vector_norm" in self.user_hooks:
            return self.user_hooks["vector_norm"](self)
        cdef float value
@ -716,7 +717,8 @@ cdef class Doc:
            self._vector_norm = sqrt(norm) if norm != 0 else 0
        return self._vector_norm

-        def __set__(self, value):
+    @vector_norm.setter
+    def vector_norm(self, value):
        self._vector_norm = value

    @property
@ -736,7 +738,8 @@ cdef class Doc:
        """
        return self.text

-    property ents:
+    @property
+    def ents(self):
        """The named entities in the document. Returns a tuple of named entity
        `Span` objects, if the entity recognizer has been applied.

@ -744,7 +747,6 @@ cdef class Doc:

        DOCS: https://spacy.io/api/doc#ents
        """
-        def __get__(self):
        cdef int i
        cdef const TokenC* token
        cdef int start = -1
@ -779,7 +781,8 @@ cdef class Doc:
        output = [o for o in output if o.label_ != ""]
        return tuple(output)

-        def __set__(self, ents):
+    @ents.setter
+    def ents(self, ents):
        # TODO:
        # 1. Test basic data-driven ORTH gazetteer
        # 2. Test more nuanced date and currency regex
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -757,77 +757,86 @@ cdef class Span:
        for word in self.rights:
            yield from word.subtree

-    property start:
-        def __get__(self):
+    @property
+    def start(self):
        return self.c.start

-        def __set__(self, int start):
+    @start.setter
+    def start(self, int start):
        if start < 0:
            raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
        self.c.start = start

-    property end:
-        def __get__(self):
+    @property
+    def end(self):
        return self.c.end

-        def __set__(self, int end):
+    @end.setter
+    def end(self, int end):
        if end < 0:
            raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
        self.c.end = end

-    property start_char:
-        def __get__(self):
+    @property
+    def start_char(self):
        return self.c.start_char

-        def __set__(self, int start_char):
+    @start_char.setter
+    def start_char(self, int start_char):
        if start_char < 0:
            raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
        self.c.start_char = start_char

-    property end_char:
-        def __get__(self):
+    @property
+    def end_char(self):
        return self.c.end_char

-        def __set__(self, int end_char):
+    @end_char.setter
+    def end_char(self, int end_char):
        if end_char < 0:
            raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
        self.c.end_char = end_char

-    property label:
-        def __get__(self):
+    @property
+    def label(self):
        return self.c.label

-        def __set__(self, attr_t label):
+    @label.setter
+    def label(self, attr_t label):
        self.c.label = label

-    property kb_id:
-        def __get__(self):
+    @property
+    def kb_id(self):
        return self.c.kb_id

-        def __set__(self, attr_t kb_id):
+    @kb_id.setter
+    def kb_id(self, attr_t kb_id):
        self.c.kb_id = kb_id

-    property id:
-        def __get__(self):
+    @property
+    def id(self):
        return self.c.id

-        def __set__(self, attr_t id):
+    @id.setter
+    def id(self, attr_t id):
        self.c.id = id

-    property ent_id:
+    @property
+    def ent_id(self):
        """RETURNS (uint64): The entity ID."""
-        def __get__(self):
        return self.root.ent_id

-        def __set__(self, hash_t key):
+    @ent_id.setter
+    def ent_id(self, hash_t key):
        raise NotImplementedError(Errors.E200.format(attr="ent_id"))

-    property ent_id_:
+    @property
+    def ent_id_(self):
        """RETURNS (str): The (string) entity ID."""
-        def __get__(self):
        return self.root.ent_id_

-        def __set__(self, str key):
+    @ent_id_.setter
+    def ent_id_(self, str key):
        raise NotImplementedError(Errors.E200.format(attr="ent_id_"))

    @property
@ -843,28 +852,31 @@ cdef class Span:
        """RETURNS (str): The span's lemma."""
        return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()

-    property label_:
+    @property
+    def label_(self):
        """RETURNS (str): The span's label."""
-        def __get__(self):
        return self.doc.vocab.strings[self.label]

-        def __set__(self, str label_):
+    @label_.setter
+    def label_(self, str label_):
        self.label = self.doc.vocab.strings.add(label_)

-    property kb_id_:
+    @property
+    def kb_id_(self):
        """RETURNS (str): The span's KB ID."""
-        def __get__(self):
        return self.doc.vocab.strings[self.kb_id]

-        def __set__(self, str kb_id_):
+    @kb_id_.setter
+    def kb_id_(self, str kb_id_):
        self.kb_id = self.doc.vocab.strings.add(kb_id_)

-    property id_:
+    @property
+    def id_(self):
        """RETURNS (str): The span's ID."""
-        def __get__(self):
        return self.doc.vocab.strings[self.id]

-        def __set__(self, str id_):
+    @id_.setter
+    def id_(self, str id_):
        self.id = self.doc.vocab.strings.add(id_)


--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -249,11 +249,12 @@ cdef class Token:
        """
        return not self.c.morph == 0

-    property morph:
-        def __get__(self):
+    @property
+    def morph(self):
        return MorphAnalysis.from_id(self.vocab, self.c.morph)

-        def __set__(self, MorphAnalysis morph):
+    @morph.setter
+    def morph(self, MorphAnalysis morph):
        # Check that the morph has the same vocab
        if self.vocab != morph.vocab:
            raise ValueError(Errors.E1013)
@ -377,38 +378,42 @@ cdef class Token:
        """
        return self.c.lex.suffix

-    property lemma:
+    @property
+    def lemma(self):
        """RETURNS (uint64): ID of the base form of the word, with no
            inflectional suffixes.
        """
-        def __get__(self):
        return self.c.lemma

-        def __set__(self, attr_t lemma):
+    @lemma.setter
+    def lemma(self, attr_t lemma):
        self.c.lemma = lemma

-    property pos:
+    @property
+    def pos(self):
        """RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
-        def __get__(self):
        return self.c.pos

-        def __set__(self, pos):
+    @pos.setter
+    def pos(self, pos):
        self.c.pos = pos

-    property tag:
+    @property
+    def tag(self):
        """RETURNS (uint64): ID of fine-grained part-of-speech tag."""
-        def __get__(self):
        return self.c.tag

-        def __set__(self, attr_t tag):
+    @tag.setter
+    def tag(self, attr_t tag):
        self.c.tag = tag

-    property dep:
+    @property
+    def dep(self):
        """RETURNS (uint64): ID of syntactic dependency label."""
-        def __get__(self):
        return self.c.dep

-        def __set__(self, attr_t label):
+    @dep.setter
+    def dep(self, attr_t label):
        self.c.dep = label

    @property
@ -494,8 +499,8 @@ cdef class Token:
            return self.doc.user_token_hooks["sent"](self)
        return self.doc[self.i : self.i+1].sent

-    property sent_start:
-        def __get__(self):
+    @property
+    def sent_start(self):
        """Deprecated: use Token.is_sent_start instead."""
        # Raising a deprecation warning here causes errors for autocomplete
        # Handle broken backwards compatibility case: doc[0].sent_start
@ -505,17 +510,18 @@ cdef class Token:
        else:
            return self.c.sent_start

-        def __set__(self, value):
+    @sent_start.setter
+    def sent_start(self, value):
        self.is_sent_start = value

-    property is_sent_start:
+    @property
+    def is_sent_start(self):
        """A boolean value indicating whether the token starts a sentence.
        `None` if unknown. Defaults to `True` for the first token in the `Doc`.

        RETURNS (bool / None): Whether the token starts a sentence.
            None if unknown.
        """
-        def __get__(self):
        if self.c.sent_start == 0:
            return None
        elif self.c.sent_start < 0:
@ -523,7 +529,8 @@ cdef class Token:
        else:
            return True

-        def __set__(self, value):
+    @is_sent_start.setter
+    def is_sent_start(self, value):
        if self.doc.has_annotation("DEP"):
            raise ValueError(Errors.E043)
        if value is None:
@ -535,7 +542,8 @@ cdef class Token:
        else:
            raise ValueError(Errors.E044.format(value=value))

-    property is_sent_end:
+    @property
+    def is_sent_end(self):
        """A boolean value indicating whether the token ends a sentence.
        `None` if unknown. Defaults to `True` for the last token in the `Doc`.

@ -544,7 +552,6 @@ cdef class Token:

        DOCS: https://spacy.io/api/token#is_sent_end
        """
-        def __get__(self):
        if self.i + 1 == len(self.doc):
            return True
        elif self.doc[self.i+1].is_sent_start is None:
@ -554,7 +561,8 @@ cdef class Token:
        else:
            return False

-        def __set__(self, value):
+    @is_sent_end.setter
+    def is_sent_end(self, value):
        raise ValueError(Errors.E196)

    @property
@ -682,20 +690,21 @@ cdef class Token:
        """
        return not Token.missing_head(self.c)

-    property head:
+    @property
+    def head(self):
        """The syntactic parent, or "governor", of this token.
        If token.has_head() is `False`, this method will return itself.

        RETURNS (Token): The token predicted by the parser to be the head of
            the current token.
        """
-        def __get__(self):
        if not self.has_head():
            return self
        else:
            return self.doc[self.i + self.c.head]

-        def __set__(self, Token new_head):
+    @head.setter
+    def head(self, Token new_head):
        # This function sets the head of self to new_head and updates the
        # counters for left/right dependents and left/right corner for the
        # new and the old head
@ -744,20 +753,22 @@ cdef class Token:
                    queue.append(child)
        return tuple([w for w in output if w.i != self.i])

-    property ent_type:
+    @property
+    def ent_type(self):
        """RETURNS (uint64): Named entity type."""
-        def __get__(self):
        return self.c.ent_type

-        def __set__(self, ent_type):
+    @ent_type.setter
+    def ent_type(self, ent_type):
        self.c.ent_type = ent_type

-    property ent_type_:
+    @property
+    def ent_type_(self):
        """RETURNS (str): Named entity type."""
-        def __get__(self):
        return self.vocab.strings[self.c.ent_type]

-        def __set__(self, ent_type):
+    @ent_type_.setter
+    def ent_type_(self, ent_type):
        self.c.ent_type = self.vocab.strings.add(ent_type)

    @property
@ -784,40 +795,44 @@ cdef class Token:
        """
        return self.iob_strings()[self.c.ent_iob]

-    property ent_id:
+    @property
+    def ent_id(self):
        """RETURNS (uint64): ID of the entity the token is an instance of,
            if any.
        """
-        def __get__(self):
        return self.c.ent_id

-        def __set__(self, hash_t key):
+    @ent_id.setter
+    def ent_id(self, hash_t key):
        self.c.ent_id = key

-    property ent_id_:
+    @property
+    def ent_id_(self):
        """RETURNS (str): ID of the entity the token is an instance of,
            if any.
        """
-        def __get__(self):
        return self.vocab.strings[self.c.ent_id]

-        def __set__(self, name):
+    @ent_id_.setter
+    def ent_id_(self, name):
        self.c.ent_id = self.vocab.strings.add(name)

-    property ent_kb_id:
+    @property
+    def ent_kb_id(self):
        """RETURNS (uint64): Named entity KB ID."""
-        def __get__(self):
        return self.c.ent_kb_id

-        def __set__(self, attr_t ent_kb_id):
+    @ent_kb_id.setter
+    def ent_kb_id(self, attr_t ent_kb_id):
        self.c.ent_kb_id = ent_kb_id

-    property ent_kb_id_:
+    @property
+    def ent_kb_id_(self):
        """RETURNS (str): Named entity KB ID."""
-        def __get__(self):
        return self.vocab.strings[self.c.ent_kb_id]

-        def __set__(self, ent_kb_id):
+    @ent_kb_id_.setter
+    def ent_kb_id_(self, ent_kb_id):
        self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)

    @property
@ -840,15 +855,16 @@ cdef class Token:
        """
        return self.vocab.strings[self.c.lex.lower]

-    property norm_:
+    @property
+    def norm_(self):
        """RETURNS (str): The token's norm, i.e. a normalised form of the
            token text. Usually set in the language's tokenizer exceptions or
            norm exceptions.
        """
-        def __get__(self):
        return self.vocab.strings[self.norm]

-        def __set__(self, str norm_):
+    @norm_.setter
+    def norm_(self, str norm_):
        self.c.norm = self.vocab.strings.add(norm_)

    @property
@ -879,32 +895,35 @@ cdef class Token:
        """
        return self.vocab.strings[self.c.lex.lang]

-    property lemma_:
+    @property
+    def lemma_(self):
        """RETURNS (str): The token lemma, i.e. the base form of the word,
            with no inflectional suffixes.
        """
-        def __get__(self):
        return self.vocab.strings[self.c.lemma]

-        def __set__(self, str lemma_):
+    @lemma_.setter
+    def lemma_(self, str lemma_):
        self.c.lemma = self.vocab.strings.add(lemma_)

-    property pos_:
+    @property
+    def pos_(self):
        """RETURNS (str): Coarse-grained part-of-speech tag."""
-        def __get__(self):
        return parts_of_speech.NAMES[self.c.pos]

-        def __set__(self, pos_name):
+    @pos_.setter
+    def pos_(self, pos_name):
        if pos_name not in parts_of_speech.IDS:
            raise ValueError(Errors.E1021.format(pp=pos_name))
        self.c.pos = parts_of_speech.IDS[pos_name]

-    property tag_:
+    @property
+    def tag_(self):
        """RETURNS (str): Fine-grained part-of-speech tag."""
-        def __get__(self):
        return self.vocab.strings[self.c.tag]

-        def __set__(self, tag):
+    @tag_.setter
+    def tag_(self, tag):
        self.tag = self.vocab.strings.add(tag)

    def has_dep(self):
@ -915,12 +934,13 @@ cdef class Token:
        """
        return not Token.missing_dep(self.c)

-    property dep_:
+    @property
+    def dep_(self):
        """RETURNS (str): The syntactic dependency label."""
-        def __get__(self):
        return self.vocab.strings[self.c.dep]

-        def __set__(self, str label):
+    @dep_.setter
+    def dep_(self, str label):
        self.c.dep = self.vocab.strings.add(label)

    @property
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -88,20 +88,22 @@ cdef class Example:
    def __len__(self):
        return len(self.predicted)

-    property predicted:
-        def __get__(self):
+    @property
+    def predicted(self):
        return self.x

-        def __set__(self, doc):
+    @predicted.setter
+    def predicted(self, doc):
        self.x = doc
        self._cached_alignment = None
        self._cached_words_x = [t.text for t in doc]

-    property reference:
-        def __get__(self):
+    @property
+    def reference(self):
        return self.y

-        def __set__(self, doc):
+    @reference.setter
+    def reference(self, doc):
        self.y = doc
        self._cached_alignment = None
        self._cached_words_y = [t.text for t in doc]
@ -420,8 +422,8 @@ cdef class Example:
                seen_indices.update(indices)
        return output

-    property text:
-        def __get__(self):
+    @property
+    def text(self):
        return self.x.text

    def __str__(self):
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -41,7 +41,9 @@ cdef class Vocab:
    cdef const TokenC* make_fused_token(self, substrings) except NULL

    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL

    cdef PreshMap _by_orth
+    cdef Pool _non_temp_mem
+    cdef vector[attr_t] _transient_orths
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@ -1,6 +1,8 @@
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union

+from cymem.cymem import Pool
 from thinc.types import Floats1d, FloatsXd

 from . import Language
@ -67,6 +69,8 @@ class Vocab:
    def from_bytes(
        self, bytes_data: bytes, *, exclude: Iterable[str] = ...
    ) -> Vocab: ...
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...

 def pickle_vocab(vocab: Vocab) -> Any: ...
 def unpickle_vocab(
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -1,8 +1,11 @@
 import functools
+from contextlib import ExitStack, contextmanager
+from typing import Iterator, Optional

 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
+from preshed.maps cimport map_clear

 from .attrs cimport LANG, ORTH
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
@ -87,15 +90,22 @@ cdef class Vocab:
        self.lookups = lookups
        self.writing_system = writing_system
        self.get_noun_chunks = get_noun_chunks
+        # During a memory_zone we replace our mem object with one
+        # that's passed to us. We keep a reference to our non-temporary
+        # memory here, in case we need to make an allocation we want to
+        # guarantee is not temporary. This is also how we check whether
+        # we're in a memory zone: we check whether self.mem is self._non_temp_mem
+        self._non_temp_mem = self.mem

-    property vectors:
-        def __get__(self):
+    @property
+    def vectors(self):
        return self._vectors

-        def __set__(self, vectors):
+    @vectors.setter
+    def vectors(self, vectors):
        if hasattr(vectors, "strings"):
            for s in vectors.strings:
-                    self.strings.add(s)
+                self.strings.add(s, allow_transient=False)
        self._vectors = vectors
        self._vectors.strings = self.strings

@ -106,6 +116,10 @@ cdef class Vocab:
            langfunc = self.lex_attr_getters.get(LANG, None)
        return langfunc("_") if langfunc else ""

+    @property
+    def in_memory_zone(self) -> bool:
+        return self.mem is not self._non_temp_mem
+
    def __len__(self):
        """The current number of lexemes stored.

@ -113,6 +127,33 @@ cdef class Vocab:
        """
        return self.length

+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.strings.memory_zone(mem))]
+            if hasattr(self.morphology, "memory_zone"):
+                contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
+            if hasattr(self._vectors, "memory_zone"):
+                contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
+            self.mem = mem
+            yield mem
+        self._clear_transient_orths()
+        self.mem = self._non_temp_mem
+
    def add_flag(self, flag_getter, int flag_id=-1):
        """Set a new boolean flag to words in the vocabulary.

@ -147,8 +188,7 @@ cdef class Vocab:

    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
        """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary.
        """
        if string == "":
            return &EMPTY_LEXEME
@ -179,19 +219,11 @@ cdef class Vocab:
            return self._new_lexeme(mem, self.strings[orth])

    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
+        # The mem argument is deprecated, replaced by memory zones, as is the
+        # former size heuristic: allocations now always come from self.mem.
        mem = self.mem
-        # if len(string) < 3 or self.length < 10000:
-        #    mem = self.mem
-        cdef bint is_oov = mem is not self.mem
        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
-        lex.orth = self.strings.add(string)
+        lex.orth = self.strings.add(string, allow_transient=True)
        lex.length = len(string)
        if self.vectors is not None and hasattr(self.vectors, "key2row"):
            lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
@ -201,18 +233,25 @@ cdef class Vocab:
            for attr, func in self.lex_attr_getters.items():
                value = func(string)
                if isinstance(value, str):
-                    value = self.strings.add(value)
+                    value = self.strings.add(value, allow_transient=True)
                if value is not None:
                    Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
        if lex == NULL:
            raise ValueError(Errors.E085.format(string=string))
        return lex

-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
        self._by_orth.set(lex.orth, <void*>lex)
        self.length += 1
+        if is_transient and self.in_memory_zone:
+            self._transient_orths.push_back(lex.orth)
+
+    def _clear_transient_orths(self):
+        """Remove transient lexemes from the index (generally at the end of the memory zone)"""
+        for orth in self._transient_orths:
+            map_clear(self._by_orth.c_map, orth)
+        self._transient_orths.clear()

    def __contains__(self, key):
        """Check whether the string or int key has an entry in the vocabulary.
@ -264,7 +303,7 @@ cdef class Vocab:
        """
        cdef attr_t orth
        if isinstance(id_or_string, str):
-            orth = self.strings.add(id_or_string)
+            orth = self.strings.add(id_or_string, allow_transient=True)
        else:
            orth = id_or_string
        return Lexeme(self, orth)
@ -416,7 +455,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#get_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        if self.has_vector(key):
@ -435,7 +474,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#set_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=False)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        if self.vectors.is_full and key not in self.vectors:
@ -459,16 +498,17 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#has_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        return key in self.vectors

-    property lookups:
-        def __get__(self):
+    @property
+    def lookups(self):
        return self._lookups

-        def __set__(self, lookups):
+    @lookups.setter
+    def lookups(self, lookups):
        self._lookups = lookups
        if lookups.has_table("lexeme_norm"):
            self.lex_attr_getters[NORM] = util.add_lookups(
--- a/website/docs/api/attributes.mdx
+++ b/website/docs/api/attributes.mdx
@ -46,10 +46,10 @@ as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by
 appending `_` as in `token.dep_`.

 | Attribute    | Description                                                                                                                                                    |
-| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `DEP`        | The token's dependency label. ~~str~~                                                                                                                          |
 | `ENT_ID`     | The token's entity ID (`ent_id`). ~~str~~                                                                                                                      |
-| `ENT_IOB`    | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
+| `ENT_IOB`    | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
 | `ENT_KB_ID`  | The token's entity knowledge base ID. ~~str~~                                                                                                                  |
 | `ENT_TYPE`   | The token's entity label. ~~str~~                                                                                                                              |
 | `IS_ALPHA`   | Token text consists of alphabetic characters. ~~bool~~                                                                                                         |
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -567,7 +567,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
 (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
 ✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
+✔ Examples without occurrences available for all labels
 ✔ No entities consisting of or starting/ending with whitespace

 =========================== Part-of-speech Tagging ===========================
@ -1320,7 +1320,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]

 ## find-threshold {id="find-threshold",version="3.5",tag="command"}

-Runs prediction trials for a trained model with varying tresholds to maximize
+Runs prediction trials for a trained model with varying thresholds to maximize
 the specified metric. The search space for the threshold is traversed linearly
 from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
 (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@ -61,13 +61,13 @@ architectures and their arguments and hyperparameters.
 | `incl_context`                                      | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                           |
 | `model`                                             | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                                           |
 | `entity_vector_length`                              | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                                    |
-| `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                                             |
+| `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                                            |
 | `get_candidates`                                    | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                         |
 | `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
 | `generate_empty_kb` <Tag variant="new">3.5.1</Tag>  | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~                                                                           |
 | `overwrite` <Tag variant="new">3.2</Tag>            | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                         |
 | `scorer` <Tag variant="new">3.2</Tag>               | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                                          |
-| `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                      |
+| `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                     |

 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@ -101,7 +101,7 @@ custom knowledge base, you should either call
 [`initialize`](/api/entitylinker#initialize) call.

 | Name                                     | Description                                                                                                                                                                                                                                                                                  |
-| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`                                  | The shared vocabulary. ~~Vocab~~                                                                                                                                                                                                                                                             |
 | `model`                                  | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~                                                                                                                                                                                                    |
 | `name`                                   | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                                                                                                                                                                                          |
@ -114,7 +114,7 @@ custom knowledge base, you should either call
 | `incl_context`                           | Whether or not to include the local context in the model. ~~bool~~                                                                                                                                                                                                                           |
 | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                     |
 | `scorer` <Tag variant="new">3.2</Tag>    | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                      |
-| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |

 ## EntityLinker.\_\_call\_\_ {id="call",tag="method"}

--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@ -173,7 +173,7 @@ happens automatically after the component has been added to the pipeline using
 [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
 with `overwrite_ents=True`, existing entities will be replaced if they overlap
 with the matches. When matches overlap in a Doc, the entity ruler prioritizes
-longer patterns over shorter, and if equal the match occuring first in the Doc
+longer patterns over shorter, and if equal the match occurring first in the Doc
 is chosen.

 > #### Example
--- a/website/docs/api/morphology.mdx
+++ b/website/docs/api/morphology.mdx
@ -148,8 +148,9 @@ Whether a feature/value pair is in the analysis.
 > ```

 | Name         | Description                                                           |
-| ----------- | --------------------------------------------- |
-| **RETURNS** | A feature/value pair in the analysis. ~~str~~ |
+| ------------ | --------------------------------------------------------------------- |
+| `feature`    | A feature/value pair. ~~str~~                                         |
+| **RETURNS**  | Whether the feature/value pair is contained in the analysis. ~~bool~~ |

 ### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}

--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@ -288,7 +288,7 @@ it – so no NP-level coordination, no prepositional phrases, and no relative
 clauses.

 If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
-has not been implemeted for the given language, a `NotImplementedError` is
+has not been implemented for the given language, a `NotImplementedError` is
 raised.

 > #### Example
--- a/website/docs/api/transformer.mdx
+++ b/website/docs/api/transformer.mdx
@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the
 | `align`        | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~                                                                                 |
 | `width`        | The width of the last hidden layer. ~~int~~                                                                                                                                                                                                                                                                                          |

-### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"}
+### TransformerData.empty {id="transformerdata-empty",tag="classmethod"}

 Create an empty `TransformerData` container.

--- a/website/docs/usage/layers-architectures.mdx
+++ b/website/docs/usage/layers-architectures.mdx
@ -832,7 +832,7 @@ retrieve and add to them.

 After creation, the component needs to be
 [initialized](/usage/training#initialization). This method can define the
-relevant labels in two ways: explicitely by setting the `labels` argument in the
+relevant labels in two ways: explicitly by setting the `labels` argument in the
 [`initialize` block](/api/data-formats#config-initialize) of the config, or
 implicitly by deducing them from the `get_examples` callback that generates the
 full **training data set**, or a representative sample.
--- a/website/docs/usage/linguistic-features.mdx
+++ b/website/docs/usage/linguistic-features.mdx
@ -1899,7 +1899,7 @@ the two words.
    "Shore": ("coast", 0.732257),
    "Precautionary": ("caution", 0.490973),
    "hopelessness": ("sadness", 0.742366),
-    "Continous": ("continuous", 0.732549),
+    "Continuous": ("continuous", 0.732549),
    "Disemboweled": ("corpse", 0.499432),
    "biostatistician": ("scientist", 0.339724),
    "somewheres": ("somewheres", 0.402736),
--- a/website/docs/usage/projects.mdx
+++ b/website/docs/usage/projects.mdx
@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the
 dependency check, set `check_requirements: false` in your project's
 `project.yml`.

-### 4. Run a workflow {id="run-workfow"}
+### 4. Run a workflow {id="run-workflow"}

 > #### project.yml
 >
@ -286,7 +286,7 @@ pipelines.
 | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `title`                                             | An optional project title used in `--help` message and [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                  |
 | `description`                                       | An optional project description used in [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-| `vars`                                              | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`.                                                                                                                       |
+| `vars`                                              | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`.                                                                                                                      |
 | `env`                                               | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`.                                                                                                                                                                                                                                                                                          |
 | `directories`                                       | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist.                                                                                                                                                                                                                                                                                                                 |
 | `assets`                                            | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo.                                                                        |
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@ -306,7 +306,9 @@ installed in the same environment – that's it.

 ### Loading probability tables into existing models

-You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`.
+You can load a probability table from
+[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an
+existing spaCy model like `en_core_web_sm`.

 ```python
 # Requirements: pip install spacy-lookups-data
@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"])
 nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
 ```

-When training a model from scratch you can also specify probability tables in the `config.cfg`.
+When training a model from scratch you can also specify probability tables in
+the `config.cfg`.

 ```ini {title="config.cfg (excerpt)"}
 [initialize.lookups]
@ -346,8 +349,8 @@ them**!
 To stick with the theme of
 [this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
 consider the following custom spaCy
-[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a
-snake when it's called:
+[pipeline component](/usage/processing-pipelines#custom-components) that prints
+a snake when it's called:

 > #### Package directory structure
 >
--- a/website/docs/usage/v2-2.mdx
+++ b/website/docs/usage/v2-2.mdx
@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
 (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
 ✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
+✔ Examples without occurrences available for all labels
 ✔ No entities consisting of or starting/ending with whitespace

 =========================== Part-of-speech Tagging ===========================
--- a/website/docs/usage/v3-2.mdx
+++ b/website/docs/usage/v3-2.mdx
@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under
 `TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
 in the [transformer API docs](/api/architectures#TransformerModel).

-`spacy-transfomers` v1.1 also adds support for `transformer_config` settings
+`spacy-transformers` v1.1 also adds support for `transformer_config` settings
 such as `output_attentions`. Additional output is stored under
 `TransformerData.model_output`. More details are in the
 [TransformerModel docs](/api/architectures#TransformerModel). The training speed
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@ -31,6 +31,12 @@
            "name": "Bengali",
            "has_examples": true
        },
+        {
+            "code": "bo",
+            "name": "Tibetan",
+            "example": "འདི་ཚིག་གྲུབ་རེད།",
+            "has_examples": true
+        },
        {
            "code": "ca",
            "name": "Catalan",
@ -480,6 +486,12 @@
            ],
            "example": "这是一个用于示例的句子。",
            "has_examples": true
+        },
+        {
+            "code": "kmr",
+            "name": "Kurdish Kurmanji",
+            "example": "Ev hevokek e",
+            "has_examples": true
        }
    ],
    "licenses": [
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
--- a/website/src/templates/index.js
+++ b/website/src/templates/index.js
@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
 }

 const navAlert = (
-    <Link to="https://form.typeform.com/to/WlflqP1b" noLinkLayout>
-        💥 Interested in <strong>Premium spaCy Models</strong>?
+    <Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
+        💥 <strong>New:</strong> Case study with S&P Global
    </Link>
 )