Mirror of https://github.com/explosion/spaCy.git, synced 2025-07-11 00:32:40 +03:00

Merge branch 'master' into pr/13418

This commit is contained in:
commit 37dd13a96b
92  .github/workflows/cibuildwheel.yml  (vendored, new file)
@@ -0,0 +1,92 @@
name: Build

on:
  push:
    tags:
      # ytf did they invent their own syntax that's almost regex?
      # ** matches 'zero or more of any character'
      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # macos-13 is an intel runner, macos-14 is apple silicon
        os: [ubuntu-latest, windows-latest, macos-13]

    steps:
      - uses: actions/checkout@v4
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.19.1
        env:
          CIBW_SOME_OPTION: value
        with:
          package-dir: .
          output-dir: wheelhouse
          config-file: "{package}/pyproject.toml"
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl

  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build sdist
        run: pipx run build --sdist
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-sdist
          path: dist/*.tar.gz

  create_release:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      checks: write
      actions: read
      issues: read
      packages: write
      pull-requests: read
      repository-projects: read
      statuses: read
    steps:
      - name: Get the tag name and determine if it's a prerelease
        id: get_tag_info
        run: |
          FULL_TAG=${GITHUB_REF#refs/tags/}
          if [[ $FULL_TAG == release-* ]]; then
            TAG_NAME=${FULL_TAG#release-}
            IS_PRERELEASE=false
          elif [[ $FULL_TAG == prerelease-* ]]; then
            TAG_NAME=${FULL_TAG#prerelease-}
            IS_PRERELEASE=true
          else
            echo "Tag does not match expected patterns" >&2
            exit 1
          fi
          echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
          echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
          echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
      - uses: actions/download-artifact@v4
        with:
          # unpacks all CIBW artifacts into dist/
          pattern: cibw-*
          path: dist
          merge-multiple: true
      - name: Create Draft Release
        id: create_release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: ${{ env.TAG_NAME }}
          draft: true
          prerelease: ${{ env.IS_PRERELEASE }}
          files: "./dist/*"
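For reference, a rough Python approximation of the two tag filters above. GitHub's filter syntax is its own glob dialect, not regex, so this translation is only illustrative and slightly over-permissive:

import re

# '[0-9]+' = one or more digits, '**' = zero or more of any character.
RELEASE = re.compile(r"^release-v[0-9]+\.[0-9]+\.[0-9]+.*$")
PRERELEASE = re.compile(r"^prerelease-v[0-9]+\.[0-9]+\.[0-9]+.*$")

for tag in ["release-v3.8.0", "prerelease-v3.8.0.dev0", "v3.8.0"]:
    kind = "release" if RELEASE.match(tag) else "prerelease" if PRERELEASE.match(tag) else "ignored"
    print(tag, "->", kind)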
2  .github/workflows/explosionbot.yml  (vendored)
@@ -15,7 +15,7 @@ jobs:
         env:
           GITHUB_CONTEXT: ${{ toJson(github) }}
         run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
       - name: Install and run explosion-bot
         run: |
2  .github/workflows/lock.yml  (vendored)
@@ -16,7 +16,7 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v4
+      - uses: dessant/lock-threads@v5
         with:
          process-only: 'issues'
          issue-inactive-days: '30'
29  .github/workflows/publish_pypi.yml  (vendored, new file)
@@ -0,0 +1,29 @@
# The cibuildwheel action triggers on creation of a release, this
# triggers on publication.
# The expected workflow is to create a draft release and let the wheels
# upload, and then hit 'publish', which uploads to PyPi.

on:
  release:
    types:
      - published

jobs:
  upload_pypi:
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/spacy
    permissions:
      id-token: write
      contents: read
    if: github.event_name == 'release' && github.event.action == 'published'
    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: robinraju/release-downloader@v1
        with:
          tag: ${{ github.event.release.tag_name }}
          fileName: '*'
          out-file-path: 'dist'
      - uses: pypa/gh-action-pypi-publish@release/v1
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           ref: ${{ matrix.branch }}
       - name: Get commits from past 24 hours
2  .github/workflows/spacy_universe_alert.yml  (vendored)
@@ -18,7 +18,7 @@ jobs:
         run: |
           echo "$GITHUB_CONTEXT"

-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
           python-version: '3.10'
6  .github/workflows/tests.yml  (vendored)
@@ -25,13 +25,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: "3.7"
-          architecture: x64

       - name: black
         run: |
@@ -75,13 +74,12 @@ jobs:

     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python_version }}
-          architecture: x64

       - name: Install dependencies
         run: |
3  .github/workflows/universe_validation.yml  (vendored)
@@ -20,13 +20,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"
-          architecture: x64

      - name: Validate website/meta/universe.json
        run: |
2  LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -11,5 +11,58 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"

+[tool.cibuildwheel]
+build = "*"
+skip = "pp* cp36* cp37* cp38* *-win32"
+test-skip = ""
+free-threaded-support = false
+
+archs = ["native"]
+
+build-frontend = "default"
+config-settings = {}
+dependency-versions = "pinned"
+environment = { PIP_CONSTRAINT = "build-constraints.txt" }
+
+environment-pass = []
+build-verbosity = 0
+
+before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
+before-build = "pip install -r requirements.txt && python setup.py clean"
+repair-wheel-command = ""
+
+test-command = ""
+before-test = ""
+test-requires = []
+test-extras = []
+
+container-engine = "docker"
+
+manylinux-x86_64-image = "manylinux2014"
+manylinux-i686-image = "manylinux2014"
+manylinux-aarch64-image = "manylinux2014"
+manylinux-ppc64le-image = "manylinux2014"
+manylinux-s390x-image = "manylinux2014"
+manylinux-pypy_x86_64-image = "manylinux2014"
+manylinux-pypy_i686-image = "manylinux2014"
+manylinux-pypy_aarch64-image = "manylinux2014"
+
+musllinux-x86_64-image = "musllinux_1_2"
+musllinux-i686-image = "musllinux_1_2"
+musllinux-aarch64-image = "musllinux_1_2"
+musllinux-ppc64le-image = "musllinux_1_2"
+musllinux-s390x-image = "musllinux_1_2"
+
+[tool.cibuildwheel.linux]
+repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
+
+[tool.cibuildwheel.macos]
+repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
+
+[tool.cibuildwheel.windows]
+
+[tool.cibuildwheel.pyodide]
+
+
 [tool.isort]
 profile = "black"
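To sanity-check this configuration locally, the table can be read back with the standard library. A minimal sketch, assuming Python 3.11+ and that the file above is saved as pyproject.toml in the current directory:

import tomllib

# Load the [tool.cibuildwheel] table and print a couple of the settings.
with open("pyproject.toml", "rb") as f:
    cfg = tomllib.load(f)["tool"]["cibuildwheel"]
print(cfg["skip"])
print(cfg["manylinux-x86_64-image"])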
@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.10.0
+typer>=0.3.0,<1.0.0
 weasel>=0.1.0,<0.5.0
 # Third party dependencies
 numpy>=1.15.0; python_version < "3.9"
@@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
@@ -22,6 +22,7 @@ classifiers =
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases
@@ -55,7 +56,7 @@ install_requires =
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
     # Third-party dependencies
-    typer>=0.3.0,<0.10.0
+    typer>=0.3.0,<1.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0; python_version < "3.9"
     numpy>=1.19.0; python_version >= "3.9"
@@ -65,7 +66,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0

 [options.entry_points]
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.4"
+__version__ = "3.8.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -39,7 +39,7 @@ def find_threshold_cli(
     # fmt: on
 ):
     """
-    Runs prediction trials for a trained model with varying tresholds to maximize
+    Runs prediction trials for a trained model with varying thresholds to maximize
     the specified metric. The search space for the threshold is traversed linearly
     from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
     (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
@@ -81,7 +81,7 @@ def find_threshold(
     silent: bool = True,
 ) -> Tuple[float, float, Dict[float, float]]:
     """
-    Runs prediction trials for models with varying tresholds to maximize the specified metric.
+    Runs prediction trials for models with varying thresholds to maximize the specified metric.
     model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
     data_path (Path): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.
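The linear threshold search described in the docstring can be sketched in plain Python. This is only an illustrative stand-alone loop with the same return shape, not the actual spaCy implementation; score_for_threshold is a hypothetical scoring callback:

def linear_threshold_search(score_for_threshold, n_trials=11):
    # Evaluate the metric at evenly spaced thresholds in [0, 1] and keep the best.
    scores = {}
    for i in range(n_trials):
        threshold = i / (n_trials - 1)
        scores[threshold] = score_for_threshold(threshold)
    best_threshold = max(scores, key=scores.get)
    return best_threshold, scores[best_threshold], scores

# Toy metric that peaks around 0.6:
best, best_score, all_scores = linear_threshold_search(lambda t: 1 - abs(t - 0.6))
print(best, round(best_score, 2))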
16  spacy/lang/bo/__init__.py  (new file)
@@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class TibetanDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Tibetan(Language):
    lang = "bo"
    Defaults = TibetanDefaults


__all__ = ["Tibetan"]
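Once a language subclass like this is registered, a blank pipeline can be created the usual way. A minimal sketch, assuming a spaCy build that includes this commit:

import spacy

# Create a blank Tibetan pipeline and tokenize a short text.
nlp = spacy.blank("bo")
doc = nlp("ཏཱ་ལའི་བླ་མ་")
print([token.text for token in doc])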
16  spacy/lang/bo/examples.py  (new file)
@@ -0,0 +1,16 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.bo.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
    "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
    "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
    "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
    "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
    "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
]
65  spacy/lang/bo/lex_attrs.py  (new file)
@@ -0,0 +1,65 @@
from ...attrs import LIKE_NUM

# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals

_num_words = [
    "ཀླད་ཀོར་", "གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་",
    "བརྒྱད་", "དགུ་", "བཅུ་", "བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་",
    "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་",
    "སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ",
    "བརྒྱ་", "སྟོང་", "ཁྲི་", "ས་ཡ་", " བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་",
    "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་",
]


def like_num(text):
    """
    Check if text resembles a number
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
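A quick illustration of what like_num above accepts; a hypothetical REPL-style check, assuming the module is importable from a build containing this commit:

from spacy.lang.bo.lex_attrs import like_num

print(like_num("༢༠༢༤"))   # True: Tibetan digits satisfy str.isdigit()
print(like_num("གཉིས་"))  # True: listed in _num_words
print(like_num("3/4"))     # True: simple fraction of digits
print(like_num("བོད་"))   # False: not a number word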
198  spacy/lang/bo/stop_words.py  (new file)
@@ -0,0 +1,198 @@
# Source: https://zenodo.org/records/10148636

STOP_WORDS = set(
    """
འི་ ། དུ་ གིས་ སོགས་ ཏེ གི་ རྣམས་ ནི ཀུན་ ཡི་ འདི ཀྱི་ སྙེད་ པས་ གཞན་
ཀྱིས་ ཡི ལ ནི་ དང་ སོགས ཅིང་ ར དུ མི་ སུ་ བཅས་ ཡོངས་ ལས ཙམ་ གྱིས་
དེ་ ཡང་ མཐའ་དག་ ཏུ་ ཉིད་ ས ཏེ་ གྱི་ སྤྱི དེ ཀ་ ཡིན་ ཞིང་ འདི་ རུང་ རང་
ཞིག་ སྟེ སྟེ་ ན་རེ ངམ ཤིང་ དག་ ཏོ རེ་ འང་ ཀྱང་ ལགས་པ ཚུ དོ ཡིན་པ རེ
ན་རེ་ ཨེ་ ཚང་མ ཐམས་ཅད་ དམ་ འོ་ ཅིག་ གྱིན་ ཡིན ན ཁོ་ན་ འམ་ ཀྱིན་ ལོ
ཀྱིས བས་ ལགས་ ཤིག གིས ཀི་ སྣ་ཚོགས་ རྣམས སྙེད་པ ཡིས་ གྱི གི བམ་ ཤིག་
རེ་རེ་ ནམ མིན་ ནམ་ ངམ་ རུ་ འགའ་ ཀུན ཤས་ ཏུ ཡིས གིན་ གམ་ འོ ཡིན་པ་
མིན ལགས གྱིས ཅང་ འགའ སམ་ ཞིག འང ལས་ཆེ་ འཕྲལ་ བར་ རུ དང ཡ འག
སམ ཀ ཅུང་ཟད་ ཅིག ཉིད དུ་མ མ ཡིན་བ འམ མམ དམ དག ཁོ་ན ཀྱི ལམ ཕྱི་
ནང་ ཙམ ནོ་ སོ་ རམ་ བོ་ ཨང་ ཕྱི ཏོ་ ཚོ ལ་ལ་ ཚོ་ ཅིང མ་གི་ གེ གོ
ཡིན་ལུགས་ རོ་ བོ ལགས་པ་ པས རབ་ འི རམ བས གཞན སྙེད་པ་ འབའ་ མཾ་ པོ
ག་ ག གམ སྤྱི་ བམ མོ་ ཙམ་པ་ ཤ་སྟག་ མམ་ རེ་རེ སྙེད ཏམ་ ངོ གྲང་ ཏ་རེ ཏམ
ཁ་ ངེ་ ཅོག་ རིལ་ ཉུང་ཤས་ གིང་ ཚ་ ཀྱང
""".split()
)
18  spacy/lang/gd/__init__.py  (new file)
@@ -0,0 +1,18 @@
from typing import Optional

from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


class ScottishDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS


class Scottish(Language):
    lang = "gd"
    Defaults = ScottishDefaults


__all__ = ["Scottish"]
388  spacy/lang/gd/stop_words.py  (new file)
@@ -0,0 +1,388 @@
STOP_WORDS = set(
    """
'ad
'ar
'd # iad
'g # ag
'ga
'gam
'gan
'gar
'gur
'm # am
'n # an
'n seo
'na
'nad
'nam
'nan
'nar
'nuair
'nur
's
'sa
'san
'sann
'se
'sna
a
a'
a'd # agad
a'm # agam
a-chèile
a-seo
a-sin
a-siud
a chionn
a chionn 's
a chèile
a chéile
a dh'
a h-uile
a seo
ac' # aca
aca
aca-san
acasan
ach
ag
agad
agad-sa
agads'
agadsa
agaibh
agaibhse
againn
againne
agam
agam-sa
agams'
agamsa
agus
aice
aice-se
aicese
aig
aig' # aige
aige
aige-san
aigesan
air
air-san
air neo
airsan
am
an
an seo
an sin
an siud
an uair
ann
ann a
ann a'
ann a shin
ann am
ann an
annad
annam
annam-s'
annamsa
anns
anns an
annta
aon
ar
as
asad
asda
asta
b'
bho
bhon
bhuaidhe # bhuaithe
bhuainn
bhuaipe
bhuaithe
bhuapa
bhur
brì
bu
c'à
car son
carson
cha
chan
chionn
choir
chon
chun
chèile
chéile
chòir
cia mheud
ciamar
co-dhiubh
cuide
cuin
cuin'
cuine
cà
cà'
càil
càit
càit'
càite
cò
cò mheud
có
d'
da
de
dh'
dha
dhaibh
dhaibh-san
dhaibhsan
dhan
dhasan
dhe
dhen
dheth
dhi
dhiom
dhiot
dhith
dhiubh
dhomh
dhomh-s'
dhomhsa
dhu'sa # dhut-sa
dhuibh
dhuibhse
dhuinn
dhuinne
dhuit
dhut
dhutsa
dhut-sa
dhà
dhà-san
dhàsan
dhòmhsa
diubh
do
docha
don
dà
dè
dè mar
dé
dé mar
dòch'
dòcha
e
eadar
eatarra
eatorra
eile
esan
fa
far
feud
fhad
fheudar
fhearr
fhein
fheudar
fheàrr
fhèin
fhéin
fhìn
fo
fodha
fodhainn
foipe
fon
fèin
ga
gach
gam
gan
ge brith
ged
gu
gu dè
gu ruige
gun
gur
gus
i
iad
iadsan
innte
is
ise
le
leam
leam-sa
leamsa
leat
leat-sa
leatha
leatsa
leibh
leis
leis-san
leoth'
leotha
leotha-san
linn
m'
m'a
ma
mac
man
mar
mas
mathaid
mi
mis'
mise
mo
mu
mu 'n
mun
mur
mura
mus
na
na b'
na bu
na iad
nach
nad
nam
nan
nar
nas
neo
no
nuair
o
o'n
oir
oirbh
oirbh-se
oirnn
oirnne
oirre
on
orm
orm-sa
ormsa
orra
orra-san
orrasan
ort
os
r'
ri
ribh
rinn
ris
rithe
rithe-se
rium
rium-sa
riums'
riumsa
riut
riuth'
riutha
riuthasan
ro
ro'n
roimh
roimhe
romhainn
romham
romhpa
ron
ruibh
ruinn
ruinne
sa
san
sann
se
seach
seo
seothach
shin
sibh
sibh-se
sibhse
sin
sineach
sinn
sinne
siod
siodach
siud
siudach
sna # ann an
sè
t'
tarsaing
tarsainn
tarsuinn
thar
thoigh
thro
thu
thuc'
thuca
thugad
thugaibh
thugainn
thugam
thugamsa
thuice
thuige
thus'
thusa
timcheall
toigh
toil
tro
tro' # troimh
troimh
troimhe
tron
tu
tusa
uair
ud
ugaibh
ugam-s'
ugam-sa
uice
uige
uige-san
umad
unnta # ann an
ur
urrainn
à
às
àsan
á
ás
è
ì
ò
ó
""".split(
        "\n"
    )
)
1983  spacy/lang/gd/tokenizer_exceptions.py  (new file)
File diff suppressed because it is too large.
16  spacy/lang/kmr/__init__.py  (new file)
@@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class KurmanjiDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS


class Kurmanji(Language):
    lang = "kmr"
    Defaults = KurmanjiDefaults


__all__ = ["Kurmanji"]
17  spacy/lang/kmr/examples.py  (new file)
@@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.kmr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""

sentences = [
    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
]
138  spacy/lang/kmr/lex_attrs.py  (new file)
@@ -0,0 +1,138 @@
from ...attrs import LIKE_NUM

_num_words = [
    "sifir", "yek", "du", "sê", "çar", "pênc", "şeş", "heft", "heşt", "neh",
    "deh", "yazde", "dazde", "sêzde", "çarde", "pazde", "şazde", "hevde",
    "hejde", "nozde", "bîst", "sî", "çil", "pêncî", "şêst", "heftê", "heştê",
    "nod", "sed", "hezar", "milyon", "milyar",
]

_ordinal_words = [
    "yekem", "yekemîn", "duyem", "duyemîn", "sêyem", "sêyemîn", "çarem",
    "çaremîn", "pêncem", "pêncemîn", "şeşem", "şeşemîn", "heftem", "heftemîn",
    "heştem", "heştemîn", "nehem", "nehemîn", "dehem", "dehemîn",
    "yazdehem", "yazdehemîn", "dazdehem", "dazdehemîn", "sêzdehem",
    "sêzdehemîn", "çardehem", "çardehemîn", "pazdehem", "pazdehemîn",
    "şanzdehem", "şanzdehemîn", "hevdehem", "hevdehemîn", "hejdehem",
    "hejdehemîn", "nozdehem", "nozdehemîn", "bîstem", "bîstemîn", "sîyem",
    "sîyemîn", "çilem", "çilemîn", "pêncîyem", "pênciyemîn", "şêstem",
    "şêstemîn", "heftêyem", "heftêyemîn", "heştêyem", "heştêyemîn", "notem",
    "notemîn", "sedem", "sedemîn", "hezarem", "hezaremîn", "milyonem",
    "milyonemîn", "milyarem", "milyaremîn",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True

    # Check ordinal number
    if text_lower in _ordinal_words:
        return True

    if is_digit(text_lower):
        return True

    return False


def is_digit(text):
    endings = ("em", "yem", "emîn", "yemîn")
    for ending in endings:
        to = len(ending)
        if text.endswith(ending) and text[:-to].isdigit():
            return True

    return False


LEX_ATTRS = {LIKE_NUM: like_num}
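A quick illustration of the Kurmanji like_num logic above; a hypothetical check, assuming the module is importable from a build containing this commit:

from spacy.lang.kmr.lex_attrs import like_num

print(like_num("heştemîn"))  # True: listed ordinal word
print(like_num("21em"))      # True: digits plus an ordinal ending, handled by is_digit()
print(like_num("pirtûk"))    # False: not number-like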
44  spacy/lang/kmr/stop_words.py  (new file)
@@ -0,0 +1,44 @@
STOP_WORDS = set(
    """
û li bi di da de ji ku ew ez
tu em hûn ew ev min te wî wê me
we wan vê vî va çi kî kê çawa çima
kengî li ku çend çiqas her hin gelek hemû kes tişt
""".split()
)
@@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return MacedonianLemmatizer(lookups)
-

 class Macedonian(Language):
     lang = "mk"
     Defaults = MacedonianDefaults
@@ -5,7 +5,7 @@ import multiprocessing as mp
 import random
 import traceback
 import warnings
-from contextlib import contextmanager
+from contextlib import ExitStack, contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
 from itertools import chain, cycle
@@ -31,6 +31,7 @@ from typing import (
 )

 import srsly
+from cymem.cymem import Pool
 from thinc.api import Config, CupyOps, Optimizer, get_current_ops

 from . import about, ty, util
@@ -2091,6 +2092,38 @@ class Language:
                 util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
             tok2vec.remove_listener(listener, pipe_name)

+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resources was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+
+        Example
+        -------
+        >>> with nlp.memory_zone():
+        ...     for doc in nlp.pipe(texts):
+        ...         process_my_doc(doc)
+        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
+            if hasattr(self.tokenizer, "memory_zone"):
+                contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
+            for _, pipe in self.pipeline:
+                if hasattr(pipe, "memory_zone"):
+                    contexts.append(stack.enter_context(pipe.memory_zone(mem)))
+            yield mem
+
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
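A minimal usage sketch for the new memory_zone context manager, assuming a spaCy build that includes this commit (docs must not be used after the block exits):

import spacy
from cymem.cymem import Pool

nlp = spacy.blank("en")
texts = ["One document.", "Another document."]

# Allocations made while processing inside the zone are freed when it exits.
with nlp.memory_zone(Pool()) as mem:
    n_tokens = sum(len(doc) for doc in nlp.pipe(texts))
print(n_tokens)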
245  spacy/lexeme.pyx
@@ -164,41 +164,44 @@ cdef class Lexeme:
         vector = self.vector
         return numpy.sqrt((vector**2).sum())

-    property vector:
+    @property
+    def vector(self):
         """A real-valued meaning representation.

         RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
             representing the lexeme's semantics.
         """
-        def __get__(self):
         cdef int length = self.vocab.vectors_length
         if length == 0:
             raise ValueError(Errors.E010)
         return self.vocab.get_vector(self.c.orth)

-        def __set__(self, vector):
+    @vector.setter
+    def vector(self, vector):
         if len(vector) != self.vocab.vectors_length:
             raise ValueError(Errors.E073.format(new_length=len(vector),
                                                 length=self.vocab.vectors_length))
         self.vocab.set_vector(self.c.orth, vector)

-    property rank:
+    @property
+    def rank(self):
         """RETURNS (str): Sequential ID of the lexeme's lexical type, used
             to index into tables, e.g. for word vectors."""
-        def __get__(self):
         return self.c.id

-        def __set__(self, value):
+    @rank.setter
+    def rank(self, value):
         self.c.id = value

-    property sentiment:
+    @property
+    def sentiment(self):
         """RETURNS (float): A scalar value indicating the positivity or
             negativity of the lexeme."""
-        def __get__(self):
         sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
         return sentiment_table.get(self.c.orth, 0.0)

-        def __set__(self, float x):
+    @sentiment.setter
+    def sentiment(self, float x):
         if "lexeme_sentiment" not in self.vocab.lookups:
             self.vocab.lookups.add_table("lexeme_sentiment")
         sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
@@ -216,151 +219,166 @@ cdef class Lexeme:
         """RETURNS (str): The original verbatim text of the lexeme."""
         return self.orth_

-    property lower:
+    @property
+    def lower(self):
         """RETURNS (uint64): Lowercase form of the lexeme."""
-        def __get__(self):
         return self.c.lower

-        def __set__(self, attr_t x):
+    @lower.setter
+    def lower(self, attr_t x):
         self.c.lower = x

-    property norm:
+    @property
+    def norm(self):
         """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
             lexeme text.
         """
-        def __get__(self):
         return self.c.norm

-        def __set__(self, attr_t x):
+    @norm.setter
+    def norm(self, attr_t x):
         if "lexeme_norm" not in self.vocab.lookups:
             self.vocab.lookups.add_table("lexeme_norm")
         norm_table = self.vocab.lookups.get_table("lexeme_norm")
         norm_table[self.c.orth] = self.vocab.strings[x]
         self.c.norm = x

-    property shape:
+    @property
+    def shape(self):
         """RETURNS (uint64): Transform of the word's string, to show
             orthographic features.
         """
-        def __get__(self):
         return self.c.shape

-        def __set__(self, attr_t x):
+    @shape.setter
+    def shape(self, attr_t x):
         self.c.shape = x

-    property prefix:
+    @property
+    def prefix(self):
         """RETURNS (uint64): Length-N substring from the start of the word.
             Defaults to `N=1`.
         """
-        def __get__(self):
         return self.c.prefix

-        def __set__(self, attr_t x):
+    @prefix.setter
+    def prefix(self, attr_t x):
         self.c.prefix = x

-    property suffix:
+    @property
+    def suffix(self):
         """RETURNS (uint64): Length-N substring from the end of the word.
             Defaults to `N=3`.
         """
-        def __get__(self):
         return self.c.suffix

-        def __set__(self, attr_t x):
+    @suffix.setter
+    def suffix(self, attr_t x):
         self.c.suffix = x

-    property cluster:
+    @property
+    def cluster(self):
         """RETURNS (int): Brown cluster ID."""
-        def __get__(self):
         cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
         return cluster_table.get(self.c.orth, 0)

-        def __set__(self, int x):
+    @cluster.setter
+    def cluster(self, int x):
         cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
         cluster_table[self.c.orth] = x

-    property lang:
+    @property
+    def lang(self):
         """RETURNS (uint64): Language of the parent vocabulary."""
-        def __get__(self):
         return self.c.lang

-        def __set__(self, attr_t x):
+    @lang.setter
+    def lang(self, attr_t x):
         self.c.lang = x

-    property prob:
+    @property
+    def prob(self):
         """RETURNS (float): Smoothed log probability estimate of the lexeme's
             type."""
-        def __get__(self):
         prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
         settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
         default_oov_prob = settings_table.get("oov_prob", -20.0)
         return prob_table.get(self.c.orth, default_oov_prob)

-        def __set__(self, float x):
+    @prob.setter
+    def prob(self, float x):
         prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
         prob_table[self.c.orth] = x

-    property lower_:
+    @property
+    def lower_(self):
         """RETURNS (str): Lowercase form of the word."""
-        def __get__(self):
         return self.vocab.strings[self.c.lower]

-        def __set__(self, str x):
+    @lower_.setter
+    def lower_(self, str x):
         self.c.lower = self.vocab.strings.add(x)

-    property norm_:
+    @property
+    def norm_(self):
         """RETURNS (str): The lexeme's norm, i.e. a normalised form of the
             lexeme text.
         """
-        def __get__(self):
         return self.vocab.strings[self.c.norm]

-        def __set__(self, str x):
+    @norm_.setter
+    def norm_(self, str x):
         self.norm = self.vocab.strings.add(x)

-    property shape_:
+    @property
+    def shape_(self):
         """RETURNS (str): Transform of the word's string, to show
             orthographic features.
         """
-        def __get__(self):
         return self.vocab.strings[self.c.shape]

-        def __set__(self, str x):
+    @shape_.setter
+    def shape_(self, str x):
         self.c.shape = self.vocab.strings.add(x)

-    property prefix_:
+    @property
+    def prefix_(self):
         """RETURNS (str): Length-N substring from the start of the word.
             Defaults to `N=1`.
         """
-        def __get__(self):
         return self.vocab.strings[self.c.prefix]

-        def __set__(self, str x):
+    @prefix_.setter
+    def prefix_(self, str x):
         self.c.prefix = self.vocab.strings.add(x)

-    property suffix_:
+    @property
+    def suffix_(self):
         """RETURNS (str): Length-N substring from the end of the word.
             Defaults to `N=3`.
         """
-        def __get__(self):
         return self.vocab.strings[self.c.suffix]

-        def __set__(self, str x):
+    @suffix_.setter
+    def suffix_(self, str x):
         self.c.suffix = self.vocab.strings.add(x)

-    property lang_:
+    @property
+    def lang_(self):
         """RETURNS (str): Language of the parent vocabulary."""
-        def __get__(self):
         return self.vocab.strings[self.c.lang]

-        def __set__(self, str x):
+    @lang_.setter
+    def lang_(self, str x):
         self.c.lang = self.vocab.strings.add(x)

-    property flags:
+    @property
+    def flags(self):
         """RETURNS (uint64): Container of the lexeme's binary flags."""
-        def __get__(self):
         return self.c.flags

-        def __set__(self, flags_t x):
+    @flags.setter
+    def flags(self, flags_t x):
         self.c.flags = x

     @property
@@ -368,154 +386,171 @@ cdef class Lexeme:
         """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
         return self.orth not in self.vocab.vectors

-    property is_stop:
+    @property
+    def is_stop(self):
         """RETURNS (bool): Whether the lexeme is a stop word."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_STOP)

-        def __set__(self, bint x):
+    @is_stop.setter
+    def is_stop(self, bint x):
         Lexeme.c_set_flag(self.c, IS_STOP, x)

-    property is_alpha:
+    @property
+    def is_alpha(self):
         """RETURNS (bool): Whether the lexeme consists of alphabetic
             characters. Equivalent to `lexeme.text.isalpha()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_ALPHA)

-        def __set__(self, bint x):
+    @is_alpha.setter
+    def is_alpha(self, bint x):
         Lexeme.c_set_flag(self.c, IS_ALPHA, x)

-    property is_ascii:
+    @property
+    def is_ascii(self):
         """RETURNS (bool): Whether the lexeme consists of ASCII characters.
             Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_ASCII)

-        def __set__(self, bint x):
+    @is_ascii.setter
+    def is_ascii(self, bint x):
         Lexeme.c_set_flag(self.c, IS_ASCII, x)

-    property is_digit:
+    @property
+    def is_digit(self):
         """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
             to `lexeme.text.isdigit()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_DIGIT)

-        def __set__(self, bint x):
+    @is_digit.setter
+    def is_digit(self, bint x):
         Lexeme.c_set_flag(self.c, IS_DIGIT, x)

-    property is_lower:
+    @property
+    def is_lower(self):
         """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
             `lexeme.text.islower()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_LOWER)

-        def __set__(self, bint x):
+    @is_lower.setter
+    def is_lower(self, bint x):
         Lexeme.c_set_flag(self.c, IS_LOWER, x)

-    property is_upper:
+    @property
+    def is_upper(self):
         """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
             `lexeme.text.isupper()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_UPPER)

-        def __set__(self, bint x):
+    @is_upper.setter
+    def is_upper(self, bint x):
         Lexeme.c_set_flag(self.c, IS_UPPER, x)

-    property is_title:
+    @property
+    def is_title(self):
         """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
             `lexeme.text.istitle()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_TITLE)

-        def __set__(self, bint x):
+    @is_title.setter
+    def is_title(self, bint x):
         Lexeme.c_set_flag(self.c, IS_TITLE, x)

-    property is_punct:
+    @property
+    def is_punct(self):
         """RETURNS (bool): Whether the lexeme is punctuation."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_PUNCT)

-        def __set__(self, bint x):
+    @is_punct.setter
+    def is_punct(self, bint x):
         Lexeme.c_set_flag(self.c, IS_PUNCT, x)

-    property is_space:
+    @property
+    def is_space(self):
         """RETURNS (bool): Whether the lexeme consist of whitespace characters.
             Equivalent to `lexeme.text.isspace()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_SPACE)

-        def __set__(self, bint x):
+    @is_space.setter
+    def is_space(self, bint x):
         Lexeme.c_set_flag(self.c, IS_SPACE, x)

-    property is_bracket:
+    @property
+    def is_bracket(self):
         """RETURNS (bool): Whether the lexeme is a bracket."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_BRACKET)

-        def __set__(self, bint x):
+    @is_bracket.setter
+    def is_bracket(self, bint x):
         Lexeme.c_set_flag(self.c, IS_BRACKET, x)

-    property is_quote:
+    @property
+    def is_quote(self):
         """RETURNS (bool): Whether the lexeme is a quotation mark."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_QUOTE)

-        def __set__(self, bint x):
+    @is_quote.setter
+    def is_quote(self, bint x):
         Lexeme.c_set_flag(self.c, IS_QUOTE, x)

-    property is_left_punct:
+    @property
+    def is_left_punct(self):
         """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)

-        def __set__(self, bint x):
+    @is_left_punct.setter
+    def is_left_punct(self, bint x):
         Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

-    property is_right_punct:
+    @property
+    def is_right_punct(self):
         """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_right_punct.setter
|
||||||
|
def is_right_punct(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||||
|
|
||||||
property is_currency:
|
@property
|
||||||
|
def is_currency(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
|
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
|
||||||
def __get__(self):
|
|
||||||
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
|
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_currency.setter
|
||||||
|
def is_currency(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
||||||
|
|
||||||
property like_url:
|
@property
|
||||||
|
def like_url(self):
|
||||||
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
||||||
def __get__(self):
|
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@like_url.setter
|
||||||
|
def like_url(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||||
|
|
||||||
property like_num:
|
@property
|
||||||
|
def like_num(self):
|
||||||
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
||||||
"10", "ten", etc.
|
"10", "ten", etc.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@like_num.setter
|
||||||
|
def like_num(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
||||||
|
|
||||||
property like_email:
|
@property
|
||||||
|
def like_email(self):
|
||||||
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
||||||
def __get__(self):
|
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@like_email.setter
|
||||||
|
def like_email(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
||||||
|
|
|
@@ -203,7 +203,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
+        labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels]
         sent_starts = _get_aligned_sent_starts(example)
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)

@@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc):
             new_label, head_label = label.split(DELIMITER)
             new_head = _find_new_head(doc[i], head_label)
             doc.c[i].head = new_head.i - i
-            doc.c[i].dep = doc.vocab.strings.add(new_label)
+            doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False)
     set_children_from_heads(doc.c, 0, doc.length)
     return doc
@@ -11,7 +11,6 @@ from .. import util
 from ..errors import Errors
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
-from ..ml import empty_kb
 from ..scorer import Scorer
 from ..tokens import Doc, Span
 from ..training import Example, validate_examples, validate_get_examples

@@ -105,7 +104,7 @@ def make_entity_linker(
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
-    use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
+    use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
         component must provide entity annotations.
     candidates_batch_size (int): Size of batches for entity candidate generation.
     threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,

@@ -235,7 +234,6 @@ class EntityLinker(TrainablePipe):
         self.cfg: Dict[str, Any] = {"overwrite": overwrite}
         self.distance = CosineDistance(normalize=False)
         self.kb = generate_empty_kb(self.vocab, entity_vector_length)
-        self.scorer = scorer
         self.use_gold_ents = use_gold_ents
         self.candidates_batch_size = candidates_batch_size
         self.threshold = threshold

@@ -243,6 +241,37 @@ class EntityLinker(TrainablePipe):
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)

+        def _score_with_ents_set(examples: Iterable[Example], **kwargs):
+            # Because of how spaCy works, we can't just score immediately, because Language.evaluate
+            # calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline.
+            if not scorer:
+                return scorer
+            if not self.use_gold_ents:
+                return scorer(examples, **kwargs)
+            else:
+                examples = self._ensure_ents(examples)
+                docs = self.pipe(
+                    (eg.predicted for eg in examples),
+                )
+                for eg, doc in zip(examples, docs):
+                    eg.predicted = doc
+                return scorer(examples, **kwargs)
+
+        self.scorer = _score_with_ents_set
+
+    def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]:
+        """If use_gold_ents is true, set the gold entities to (a copy of) eg.predicted."""
+        if not self.use_gold_ents:
+            return examples
+
+        new_examples = []
+        for eg in examples:
+            ents, _ = eg.get_aligned_ents_and_ner()
+            new_eg = eg.copy()
+            new_eg.predicted.ents = ents
+            new_examples.append(new_eg)
+        return new_examples
+
     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
        create it using this object's vocab."""

@@ -284,11 +313,9 @@ class EntityLinker(TrainablePipe):
         nO = self.kb.entity_vector_length
         doc_sample = []
         vector_sample = []
-        for eg in islice(get_examples(), 10):
+        examples = self._ensure_ents(islice(get_examples(), 10))
+        for eg in examples:
             doc = eg.x
-            if self.use_gold_ents:
-                ents, _ = eg.get_aligned_ents_and_ner()
-                doc.ents = ents
             doc_sample.append(doc)
             vector_sample.append(self.model.ops.alloc1f(nO))
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)

@@ -354,31 +381,17 @@ class EntityLinker(TrainablePipe):
         losses.setdefault(self.name, 0.0)
         if not examples:
             return losses
+        examples = self._ensure_ents(examples)
         validate_examples(examples, "EntityLinker.update")

-        set_dropout_rate(self.model, drop)
-        docs = [eg.predicted for eg in examples]
-        # save to restore later
-        old_ents = [doc.ents for doc in docs]
-
-        for doc, ex in zip(docs, examples):
-            if self.use_gold_ents:
-                ents, _ = ex.get_aligned_ents_and_ner()
-                doc.ents = ents
-            else:
-                # only keep matching ents
-                doc.ents = ex.get_matching_ents()
-
         # make sure we have something to learn from, if not, short-circuit
         if not self.batch_has_learnable_example(examples):
             return losses

+        set_dropout_rate(self.model, drop)
+        docs = [eg.predicted for eg in examples]
         sentence_encodings, bp_context = self.model.begin_update(docs)

-        # now restore the ents
-        for doc, old in zip(docs, old_ents):
-            doc.ents = old
-
         loss, d_scores = self.get_loss(
             sentence_encodings=sentence_encodings, examples=examples
         )

@@ -386,11 +399,13 @@ class EntityLinker(TrainablePipe):
         if sgd is not None:
             self.finish_update(sgd)
         losses[self.name] += loss

         return losses

     def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         validate_examples(examples, "EntityLinker.get_loss")
         entity_encodings = []
+        # We assume that get_loss is called with gold ents set in the examples if need be
         eidx = 0  # indices in gold entities to keep
         keep_ents = []  # indices in sentence_encodings to keep
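For orientation, a minimal sketch (illustrative only, not part of the diff) of how the `use_gold_ents` option is wired up in a pipeline; the config key and the need to attach a knowledge base mirror the tests changed later in this commit:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
# train and evaluate the linker directly from gold entity annotations,
# so no upstream NER component is required
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": True})
# a knowledge base still has to be attached before nlp.initialize(), e.g.:
# entity_linker.set_kb(create_kb)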
@@ -25,5 +25,7 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map

-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
+    cdef vector[hash_t] _transient_keys
+    cdef Pool _non_temp_mem
@@ -1,9 +1,14 @@
 # cython: infer_types=True
 # cython: profile=False
 cimport cython

+from contextlib import contextmanager
+from typing import Iterator, List, Optional
+
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash32, hash64

+from preshed.maps cimport map_clear
+
 import srsly

@@ -119,10 +124,11 @@ cdef class StringStore:
         strings (iterable): A sequence of unicode strings to add to the store.
         """
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
         if strings is not None:
             for string in strings:
-                self.add(string)
+                self.add(string, allow_transient=False)

     def __getitem__(self, object string_or_id):
         """Retrieve a string from a given hash, or vice versa.

@@ -152,10 +158,13 @@ cdef class StringStore:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
+                if utf8str is NULL:
+                    raise KeyError(Errors.E018.format(hash_value=string_or_id))
+                else:
+                    return decode_Utf8Str(utf8str)
         else:
             # TODO: Raise an error instead
             utf8str = <Utf8Str*>self._map.get(string_or_id)
-
         if utf8str is NULL:
             raise KeyError(Errors.E018.format(hash_value=string_or_id))
         else:

@@ -175,12 +184,46 @@ cdef class StringStore:
         else:
             return self[key]

-    def add(self, string):
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self.keys.size() + self._transient_keys.size()
+
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resources was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        self.mem = mem
+        yield mem
+        for key in self._transient_keys:
+            map_clear(self._map.c_map, key)
+        self._transient_keys.clear()
+        self.mem = self._non_temp_mem
+
+    def add(self, string: str, allow_transient: Optional[bool] = None) -> int:
         """Add a string to the StringStore.

         string (str): The string to add.
+        allow_transient (bool): Allow the string to be stored in the 'transient'
+            map, which will be flushed at the end of the memory zone. Strings
+            encountered during arbitrary text processing should be added
+            with allow_transient=True, while labels and other strings used
+            internally should not.
         RETURNS (uint64): The string's hash value.
         """
+        if allow_transient is None:
+            allow_transient = self.mem is not self._non_temp_mem
         cdef hash_t str_hash
         if isinstance(string, str):
             if string in SYMBOLS_BY_STR:
@@ -188,22 +231,26 @@ cdef class StringStore:

             string = string.encode("utf8")
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         elif isinstance(string, bytes):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         else:
             raise TypeError(Errors.E017.format(value_type=type(string)))
         return str_hash

     def __len__(self):
         """The number of strings in the store.
+            if string in SYMBOLS_BY_STR:
+                return SYMBOLS_BY_STR[string]
+            else:
+                return self._intern_str(string, allow_transient)

         RETURNS (int): The number of strings in the store.
         """
-        return self.keys.size()
+        return self.keys.size() + self._transient_keys.size()

@@ -222,12 +269,17 @@ cdef class StringStore:
                 pass
             else:
                 # TODO: Raise an error instead
-                return self._map.get(string_or_id) is not NULL
+                if self._map.get(string_or_id) is not NULL:
+                    return True
+                else:
+                    return False
         if str_hash < len(SYMBOLS_BY_INT):
             return True
         else:
-            return self._map.get(str_hash) is not NULL
+            if self._map.get(str_hash) is not NULL:
+                return True
+            else:
+                return False

     def __iter__(self):
         """Iterate over the strings in the store, in order.

@@ -240,12 +292,29 @@ cdef class StringStore:
             key = self.keys[i]
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
+        for i in range(self._transient_keys.size()):
+            key = self._transient_keys[i]
+            utf8str = <Utf8Str*>self._map.get(key)
+            yield decode_Utf8Str(utf8str)

     def __reduce__(self):
         strings = list(self)
         return (StringStore, (strings,), None, None, None)

+    def values(self) -> List[int]:
+        """Iterate over the stored strings hashes in insertion order.
+
+        RETURNS: A list of string hashs.
+        """
+        cdef int i
+        hashes = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            hashes[i] = self._keys[i]
+        transient_hashes = [None] * self._transient_keys.size()
+        for i in range(self._transient_keys.size()):
+            transient_hashes[i] = self._transient_keys[i]
+        return hashes + transient_hashes
+
     def to_disk(self, path):
         """Save the current state to a directory.

@@ -269,7 +338,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self

     def to_bytes(self, **kwargs):

@@ -289,23 +358,25 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self

     def _reset_and_load(self, strings):
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
         self.keys.clear()
+        self._transient_keys.clear()
         for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)

-    cdef const Utf8Str* intern_unicode(self, str py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+        return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)

     @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
         cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)

@@ -314,5 +385,8 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
+        if allow_transient and self.mem is not self._non_temp_mem:
+            self._transient_keys.push_back(key)
+        else:
             self.keys.push_back(key)
         return value
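A brief usage sketch of the transient-string behaviour introduced above (illustrative only; it assumes the `memory_zone` and `allow_transient` semantics exactly as added in this diff — strings interned inside the zone default to transient and are flushed when the block exits):

from spacy.strings import StringStore

store = StringStore(["persistent-label"])      # interned permanently (allow_transient=False)
with store.memory_zone():
    store.add("throwaway-token")               # transient by default inside the zone
    assert "throwaway-token" in store
assert "throwaway-token" not in store          # flushed together with the zone
assert "persistent-label" in store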
@@ -81,6 +81,11 @@ def bn_tokenizer():
     return get_lang_class("bn")().tokenizer


+@pytest.fixture(scope="session")
+def bo_tokenizer():
+    return get_lang_class("bo")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def ca_tokenizer():
     return get_lang_class("ca")().tokenizer
0
spacy/tests/lang/bo/__init__.py
Normal file
21
spacy/tests/lang/bo/test_text.py
Normal file

@@ -0,0 +1,21 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("999.0", True),
+        ("གཅིག་", True),
+        ("གཉིས་", True),
+        ("ཀླད་ཀོར་", True),
+        ("བཅུ་གཅིག་", True),
+        ("ཁྱི་", False),
+        (",", False),
+    ],
+)
+def test_lex_attrs_like_number(bo_tokenizer, text, match):
+    tokens = bo_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
0
spacy/tests/lang/kmr/__init__.py
Normal file
27
spacy/tests/lang/kmr/test_text.py
Normal file

@@ -0,0 +1,27 @@
+import pytest
+
+from spacy.lang.kmr.lex_attrs import like_num
+
+
+@pytest.mark.parametrize(
+    "word",
+    [
+        "yekem",
+        "duyemîn",
+        "100em",
+        "dehem",
+        "sedemîn",
+        "34em",
+        "30yem",
+        "20emîn",
+        "50yemîn",
+    ],
+)
+def test_kmr_lex_attrs_like_number_for_ordinal(word):
+    assert like_num(word)
+
+
+@pytest.mark.parametrize("word", ["deh"])
+def test_kmr_lex_attrs_capitals(word):
+    assert like_num(word)
+    assert like_num(word.upper())
@@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
              "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
 # fmt: on
@@ -717,7 +717,7 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on


-def test_overfitting_IO():
+def test_overfitting_IO_gold_entities():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
     vector_length = 3

@@ -744,7 +744,9 @@ def test_overfitting_IO():
         return mykb

     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker = nlp.add_pipe(
+        "entity_linker", last=True, config={"use_gold_ents": True}
+    )
     assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings

@@ -807,6 +809,107 @@ def test_overfitting_IO():
         assert_equal(batch_deps_1, batch_deps_2)
         assert_equal(batch_deps_1, no_batch_deps)

+    eval = nlp.evaluate(train_examples)
+    assert "nel_macro_p" in eval
+    assert "nel_macro_r" in eval
+    assert "nel_macro_f" in eval
+    assert "nel_micro_p" in eval
+    assert "nel_micro_r" in eval
+    assert "nel_micro_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+
+
+def test_overfitting_IO_with_ner():
+    # Simple test to try and overfit the NER and NEL component in combination - ensuring the ML models work correctly
+    nlp = English()
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the NER and EL components and add them to the pipeline
+    ner = nlp.add_pipe("ner", first=True)
+    entity_linker = nlp.add_pipe(
+        "entity_linker", last=True, config={"use_gold_ents": False}
+    )
+    entity_linker.set_kb(create_kb)
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            ner.add_label(ent[2])
+    optimizer = nlp.initialize()
+
+    # train the NER and NEL pipes
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.001
+    assert losses["entity_linker"] < 0.001
+
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+
+    # test the trained model
+    test_text = "Russ Cochran captured his first major title with his son as caddie."
+    doc = nlp(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "Russ Cochran"
+    assert ents[0].label_ == "PERSON"
+    assert ents[0].kb_id_ != "NIL"
+
+    # TODO: below assert is still flaky - EL doesn't properly overfit quite yet
+    # assert ents[0].kb_id_ == "Q2146908"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        assert nlp2.pipe_names == nlp.pipe_names
+        doc2 = nlp2(test_text)
+        ents2 = doc2.ents
+        assert len(ents2) == 1
+        assert ents2[0].text == "Russ Cochran"
+        assert ents2[0].label_ == "PERSON"
+        assert ents2[0].kb_id_ != "NIL"
+
+    eval = nlp.evaluate(train_examples)
+    assert "nel_macro_f" in eval
+    assert "nel_micro_f" in eval
+    assert "ents_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "ents_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+    assert "PERSON" in eval["ents_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+    assert eval["ents_f"] > 0
+
+
 def test_kb_serialization():
     # Test that the KB can be used in a pipeline with a different vocab
@@ -329,7 +329,7 @@ def test_language_pipe_error_handler(n_process):
         nlp.set_error_handler(raise_error)
         with pytest.raises(ValueError):
             list(nlp.pipe(texts, n_process=n_process))
-        # set explicitely to ignoring
+        # set explicitly to ignoring
         nlp.set_error_handler(ignore_error)
         docs = list(nlp.pipe(texts, n_process=n_process))
         assert len(docs) == 0
@@ -18,6 +18,7 @@ LANGUAGES = [
     pytest.param("ar", marks=pytest.mark.slow()),
     pytest.param("bg", marks=pytest.mark.slow()),
     "bn",
+    pytest.param("bo", marks=pytest.mark.slow()),
     pytest.param("ca", marks=pytest.mark.slow()),
     pytest.param("cs", marks=pytest.mark.slow()),
     pytest.param("da", marks=pytest.mark.slow()),

@@ -57,6 +58,7 @@ LANGUAGES = [
     pytest.param("tr", marks=pytest.mark.slow()),
     pytest.param("tt", marks=pytest.mark.slow()),
     pytest.param("ur", marks=pytest.mark.slow()),
+    pytest.param("kmr", marks=pytest.mark.slow()),
 ]
36
spacy/tests/vocab_vectors/test_memory_zone.py
Normal file

@@ -0,0 +1,36 @@
+from spacy.vocab import Vocab
+
+
+def test_memory_zone_no_insertion():
+    vocab = Vocab()
+    with vocab.memory_zone():
+        pass
+    lex = vocab["horse"]
+    assert lex.text == "horse"
+
+
+def test_memory_zone_insertion():
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+    assert "dog" in vocab
+    assert "horse" not in vocab
+
+
+def test_memory_zone_redundant_insertion():
+    """Test that if we insert an already-existing word while
+    in the memory zone, it stays persistent"""
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+        _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
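These tests exercise the memory-zone behaviour at the `Vocab` level. A rough sketch of the intended usage pattern for long-running services (illustrative only, assuming a spaCy build that contains this change):

from spacy.vocab import Vocab

vocab = Vocab()
with vocab.memory_zone():
    # lexemes created here are transient and released when the block exits
    for word in ["one-off", "tokens", "from", "streaming", "text"]:
        _ = vocab[word]
assert "one-off" not in vocab  # flushed together with the zone's memory pool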
@@ -25,9 +25,7 @@ cdef class Tokenizer:
     cdef PhraseMatcher _special_matcher
     # TODO convert to bool in v4
     cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef public int max_cache_size

     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
@@ -30,7 +30,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.

        vocab (Vocab): A storage container for lexical types.

@@ -50,6 +50,7 @@ cdef class Tokenizer:
        faster_heuristics (bool): Whether to restrict the final
            Matcher-based pass for rules to those containing affixes or space.
            Defaults to True.
+       max_cache_size (int): Maximum number of tokenization chunks to cache.

        EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
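The new `max_cache_size` argument caps the tokenizer's internal chunk cache, which matters when tokenizing unbounded streams of text. A small construction sketch (illustrative only; the keyword is taken from the signature added above, the chosen value of 1000 is arbitrary):

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
# keep at most ~1000 cached tokenization chunks instead of the default 10000
tokenizer = Tokenizer(nlp.vocab, max_cache_size=1000)
doc = tokenizer("This text is tokenized with a smaller cache.")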
@@ -69,52 +70,59 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size

-    property token_match:
-        def __get__(self):
+    @property
+    def token_match(self):
         return self._token_match

-        def __set__(self, token_match):
+    @token_match.setter
+    def token_match(self, token_match):
         self._token_match = token_match
         self._reload_special_cases()

-    property url_match:
-        def __get__(self):
+    @property
+    def url_match(self):
         return self._url_match

-        def __set__(self, url_match):
+    @url_match.setter
+    def url_match(self, url_match):
         self._url_match = url_match
         self._reload_special_cases()

-    property prefix_search:
-        def __get__(self):
+    @property
+    def prefix_search(self):
         return self._prefix_search

-        def __set__(self, prefix_search):
+    @prefix_search.setter
+    def prefix_search(self, prefix_search):
         self._prefix_search = prefix_search
         self._reload_special_cases()

-    property suffix_search:
-        def __get__(self):
+    @property
+    def suffix_search(self):
         return self._suffix_search

-        def __set__(self, suffix_search):
+    @suffix_search.setter
+    def suffix_search(self, suffix_search):
         self._suffix_search = suffix_search
         self._reload_special_cases()

-    property infix_finditer:
-        def __get__(self):
+    @property
+    def infix_finditer(self):
         return self._infix_finditer

-        def __set__(self, infix_finditer):
+    @infix_finditer.setter
+    def infix_finditer(self, infix_finditer):
         self._infix_finditer = infix_finditer
         self._reload_special_cases()

-    property rules:
-        def __get__(self):
+    @property
+    def rules(self):
         return self._rules

-        def __set__(self, rules):
+    @rules.setter
+    def rules(self, rules):
         self._rules = {}
         self._flush_cache()
         self._flush_specials()

@@ -122,11 +130,12 @@ cdef class Tokenizer:
         self._specials = PreshMap()
         self._load_special_cases(rules)

-    property faster_heuristics:
-        def __get__(self):
+    @property
+    def faster_heuristics(self):
         return bool(self._faster_heuristics)

-        def __set__(self, faster_heuristics):
+    @faster_heuristics.setter
+    def faster_heuristics(self, faster_heuristics):
         self._faster_heuristics = bool(faster_heuristics)
         self._reload_special_cases()

@@ -390,6 +399,7 @@ cdef class Tokenizer:
                                       has_special, with_special_cases)
             self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                                 with_special_cases)
+            if len(self._cache) < self.max_cache_size:
                 self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                                   tokens.length - orig_size)

@@ -507,8 +517,7 @@ cdef class Tokenizer:
         if n <= 0:
             # avoid mem alloc of zero length
             return 0
-        for i in range(n):
-            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
+        if self.vocab.in_memory_zone:
             return 0
         # See #1250
         if has_special[0]:
|
@ -667,7 +667,8 @@ cdef class Doc:
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
property vector:
|
@property
|
||||||
|
def vector(self):
|
||||||
"""A real-valued meaning representation. Defaults to an average of the
|
"""A real-valued meaning representation. Defaults to an average of the
|
||||||
token vectors.
|
token vectors.
|
||||||
|
|
||||||
|
@ -676,7 +677,6 @@ cdef class Doc:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#vector
|
DOCS: https://spacy.io/api/doc#vector
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if "vector" in self.user_hooks:
|
if "vector" in self.user_hooks:
|
||||||
return self.user_hooks["vector"](self)
|
return self.user_hooks["vector"](self)
|
||||||
if self._vector is not None:
|
if self._vector is not None:
|
||||||
|
@ -694,17 +694,18 @@ cdef class Doc:
|
||||||
else:
|
else:
|
||||||
return xp.zeros((self.vocab.vectors_length,), dtype="float32")
|
return xp.zeros((self.vocab.vectors_length,), dtype="float32")
|
||||||
|
|
||||||
def __set__(self, value):
|
@vector.setter
|
||||||
|
def vector(self, value):
|
||||||
self._vector = value
|
self._vector = value
|
||||||
|
|
||||||
property vector_norm:
|
@property
|
||||||
|
def vector_norm(self):
|
||||||
"""The L2 norm of the document's vector representation.
|
"""The L2 norm of the document's vector representation.
|
||||||
|
|
||||||
RETURNS (float): The L2 norm of the vector representation.
|
RETURNS (float): The L2 norm of the vector representation.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#vector_norm
|
DOCS: https://spacy.io/api/doc#vector_norm
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if "vector_norm" in self.user_hooks:
|
if "vector_norm" in self.user_hooks:
|
||||||
return self.user_hooks["vector_norm"](self)
|
return self.user_hooks["vector_norm"](self)
|
||||||
cdef float value
|
cdef float value
|
||||||
|
@ -716,7 +717,8 @@ cdef class Doc:
|
||||||
self._vector_norm = sqrt(norm) if norm != 0 else 0
|
self._vector_norm = sqrt(norm) if norm != 0 else 0
|
||||||
return self._vector_norm
|
return self._vector_norm
|
||||||
|
|
||||||
def __set__(self, value):
|
@vector_norm.setter
|
||||||
|
def vector_norm(self, value):
|
||||||
self._vector_norm = value
|
self._vector_norm = value
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -736,7 +738,8 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
property ents:
|
@property
|
||||||
|
def ents(self):
|
||||||
"""The named entities in the document. Returns a tuple of named entity
|
"""The named entities in the document. Returns a tuple of named entity
|
||||||
`Span` objects, if the entity recognizer has been applied.
|
`Span` objects, if the entity recognizer has been applied.
|
||||||
|
|
||||||
|
@ -744,7 +747,6 @@ cdef class Doc:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#ents
|
DOCS: https://spacy.io/api/doc#ents
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef const TokenC* token
|
cdef const TokenC* token
|
||||||
cdef int start = -1
|
cdef int start = -1
|
||||||
|
@ -779,7 +781,8 @@ cdef class Doc:
|
||||||
output = [o for o in output if o.label_ != ""]
|
output = [o for o in output if o.label_ != ""]
|
||||||
return tuple(output)
|
return tuple(output)
|
||||||
|
|
||||||
def __set__(self, ents):
|
@ents.setter
|
||||||
|
def ents(self, ents):
|
||||||
# TODO:
|
# TODO:
|
||||||
# 1. Test basic data-driven ORTH gazetteer
|
# 1. Test basic data-driven ORTH gazetteer
|
||||||
# 2. Test more nuanced date and currency regex
|
# 2. Test more nuanced date and currency regex
|
||||||
|
|
|
@ -757,77 +757,86 @@ cdef class Span:
|
||||||
for word in self.rights:
|
for word in self.rights:
|
||||||
yield from word.subtree
|
yield from word.subtree
|
||||||
|
|
||||||
property start:
|
@property
|
||||||
def __get__(self):
|
def start(self):
|
||||||
return self.c.start
|
return self.c.start
|
||||||
|
|
||||||
def __set__(self, int start):
|
@start.setter
|
||||||
|
def start(self, int start):
|
||||||
if start < 0:
|
if start < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
|
raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
|
||||||
self.c.start = start
|
self.c.start = start
|
||||||
|
|
||||||
property end:
|
@property
|
||||||
def __get__(self):
|
def end(self):
|
||||||
return self.c.end
|
return self.c.end
|
||||||
|
|
||||||
def __set__(self, int end):
|
@end.setter
|
||||||
|
def end(self, int end):
|
||||||
if end < 0:
|
if end < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
|
raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
|
||||||
self.c.end = end
|
self.c.end = end
|
||||||
|
|
||||||
property start_char:
|
@property
|
||||||
def __get__(self):
|
def start_char(self):
|
||||||
return self.c.start_char
|
return self.c.start_char
|
||||||
|
|
||||||
def __set__(self, int start_char):
|
@start_char.setter
|
||||||
|
def start_char(self, int start_char):
|
||||||
if start_char < 0:
|
if start_char < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
|
raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
|
||||||
self.c.start_char = start_char
|
self.c.start_char = start_char
|
||||||
|
|
||||||
property end_char:
|
@property
|
||||||
def __get__(self):
|
def end_char(self):
|
||||||
return self.c.end_char
|
return self.c.end_char
|
||||||
|
|
||||||
def __set__(self, int end_char):
|
@end_char.setter
|
||||||
|
def end_char(self, int end_char):
|
||||||
if end_char < 0:
|
if end_char < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
|
raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
|
||||||
self.c.end_char = end_char
|
self.c.end_char = end_char
|
||||||
|
|
||||||
property label:
|
@property
|
||||||
def __get__(self):
|
def label(self):
|
||||||
return self.c.label
|
return self.c.label
|
||||||
|
|
||||||
def __set__(self, attr_t label):
|
@label.setter
|
||||||
|
def label(self, attr_t label):
|
||||||
self.c.label = label
|
self.c.label = label
|
||||||
|
|
||||||
property kb_id:
|
@property
|
||||||
def __get__(self):
|
def kb_id(self):
|
||||||
return self.c.kb_id
|
return self.c.kb_id
|
||||||
|
|
||||||
def __set__(self, attr_t kb_id):
|
@kb_id.setter
|
||||||
|
def kb_id(self, attr_t kb_id):
|
||||||
self.c.kb_id = kb_id
|
self.c.kb_id = kb_id
|
||||||
|
|
||||||
property id:
|
@property
|
||||||
def __get__(self):
|
def id(self):
|
||||||
return self.c.id
|
return self.c.id
|
||||||
|
|
||||||
def __set__(self, attr_t id):
|
@id.setter
|
||||||
|
def id(self, attr_t id):
|
||||||
self.c.id = id
|
self.c.id = id
|
||||||
|
|
||||||
property ent_id:
|
@property
|
||||||
|
def ent_id(self):
|
||||||
"""RETURNS (uint64): The entity ID."""
|
"""RETURNS (uint64): The entity ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.root.ent_id
|
return self.root.ent_id
|
||||||
|
|
||||||
def __set__(self, hash_t key):
|
@ent_id.setter
|
||||||
|
def ent_id(self, hash_t key):
|
||||||
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
|
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
|
||||||
|
|
||||||
property ent_id_:
|
@property
|
||||||
|
def ent_id_(self):
|
||||||
"""RETURNS (str): The (string) entity ID."""
|
"""RETURNS (str): The (string) entity ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.root.ent_id_
|
return self.root.ent_id_
|
||||||
|
|
||||||
def __set__(self, str key):
|
@ent_id_.setter
|
||||||
|
def ent_id_(self, str key):
|
||||||
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
|
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -843,28 +852,31 @@ cdef class Span:
|
||||||
"""RETURNS (str): The span's lemma."""
|
"""RETURNS (str): The span's lemma."""
|
||||||
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
|
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
|
||||||
|
|
||||||
property label_:
|
@property
|
||||||
|
def label_(self):
|
||||||
"""RETURNS (str): The span's label."""
|
"""RETURNS (str): The span's label."""
|
||||||
def __get__(self):
|
|
||||||
return self.doc.vocab.strings[self.label]
|
return self.doc.vocab.strings[self.label]
|
||||||
|
|
||||||
def __set__(self, str label_):
|
@label_.setter
|
||||||
|
def label_(self, str label_):
|
||||||
self.label = self.doc.vocab.strings.add(label_)
|
self.label = self.doc.vocab.strings.add(label_)
|
||||||
|
|
||||||
property kb_id_:
|
@property
|
||||||
|
def kb_id_(self):
|
||||||
"""RETURNS (str): The span's KB ID."""
|
"""RETURNS (str): The span's KB ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.doc.vocab.strings[self.kb_id]
|
return self.doc.vocab.strings[self.kb_id]
|
||||||
|
|
||||||
def __set__(self, str kb_id_):
|
@kb_id_.setter
|
||||||
|
def kb_id_(self, str kb_id_):
|
||||||
self.kb_id = self.doc.vocab.strings.add(kb_id_)
|
self.kb_id = self.doc.vocab.strings.add(kb_id_)
|
||||||
|
|
||||||
property id_:
|
@property
|
||||||
|
def id_(self):
|
||||||
"""RETURNS (str): The span's ID."""
|
"""RETURNS (str): The span's ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.doc.vocab.strings[self.id]
|
return self.doc.vocab.strings[self.id]
|
||||||
|
|
||||||
def __set__(self, str id_):
|
@id_.setter
|
||||||
|
def id_(self, str id_):
|
||||||
self.id = self.doc.vocab.strings.add(id_)
|
self.id = self.doc.vocab.strings.add(id_)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -249,11 +249,12 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return not self.c.morph == 0
|
return not self.c.morph == 0
|
||||||
|
|
||||||
property morph:
|
@property
|
||||||
def __get__(self):
|
def morph(self):
|
||||||
return MorphAnalysis.from_id(self.vocab, self.c.morph)
|
return MorphAnalysis.from_id(self.vocab, self.c.morph)
|
||||||
|
|
||||||
def __set__(self, MorphAnalysis morph):
|
@morph.setter
|
||||||
|
def morph(self, MorphAnalysis morph):
|
||||||
# Check that the morph has the same vocab
|
# Check that the morph has the same vocab
|
||||||
if self.vocab != morph.vocab:
|
if self.vocab != morph.vocab:
|
||||||
raise ValueError(Errors.E1013)
|
raise ValueError(Errors.E1013)
|
||||||
|
@ -377,38 +378,42 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.c.lex.suffix
|
return self.c.lex.suffix
|
||||||
|
|
||||||
property lemma:
|
@property
|
||||||
|
def lemma(self):
|
||||||
"""RETURNS (uint64): ID of the base form of the word, with no
|
"""RETURNS (uint64): ID of the base form of the word, with no
|
||||||
inflectional suffixes.
|
inflectional suffixes.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.c.lemma
|
return self.c.lemma
|
||||||
|
|
||||||
def __set__(self, attr_t lemma):
|
@lemma.setter
|
||||||
|
def lemma(self, attr_t lemma):
|
||||||
self.c.lemma = lemma
|
self.c.lemma = lemma
|
||||||
|
|
||||||
property pos:
|
@property
|
||||||
|
def pos(self):
|
||||||
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
|
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.pos
|
return self.c.pos
|
||||||
|
|
||||||
def __set__(self, pos):
|
@pos.setter
|
||||||
|
def pos(self, pos):
|
||||||
self.c.pos = pos
|
self.c.pos = pos
|
||||||
|
|
||||||
property tag:
|
@property
|
||||||
|
def tag(self):
|
||||||
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
|
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.tag
|
return self.c.tag
|
||||||
|
|
||||||
def __set__(self, attr_t tag):
|
@tag.setter
|
||||||
|
def tag(self, attr_t tag):
|
||||||
self.c.tag = tag
|
self.c.tag = tag
|
||||||
|
|
||||||
property dep:
|
@property
|
||||||
|
def dep(self):
|
||||||
"""RETURNS (uint64): ID of syntactic dependency label."""
|
"""RETURNS (uint64): ID of syntactic dependency label."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.dep
|
return self.c.dep
|
||||||
|
|
||||||
def __set__(self, attr_t label):
|
@dep.setter
|
||||||
|
def dep(self, attr_t label):
|
||||||
self.c.dep = label
|
self.c.dep = label
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -494,8 +499,8 @@ cdef class Token:
|
||||||
return self.doc.user_token_hooks["sent"](self)
|
return self.doc.user_token_hooks["sent"](self)
|
||||||
return self.doc[self.i : self.i+1].sent
|
return self.doc[self.i : self.i+1].sent
|
||||||
|
|
||||||
property sent_start:
|
@property
|
||||||
def __get__(self):
|
def sent_start(self):
|
||||||
"""Deprecated: use Token.is_sent_start instead."""
|
"""Deprecated: use Token.is_sent_start instead."""
|
||||||
# Raising a deprecation warning here causes errors for autocomplete
|
# Raising a deprecation warning here causes errors for autocomplete
|
||||||
# Handle broken backwards compatibility case: doc[0].sent_start
|
# Handle broken backwards compatibility case: doc[0].sent_start
|
||||||
|
@ -505,17 +510,18 @@ cdef class Token:
|
||||||
else:
|
else:
|
||||||
return self.c.sent_start
|
return self.c.sent_start
|
||||||
|
|
||||||
def __set__(self, value):
|
@sent_start.setter
|
||||||
|
def sent_start(self, value):
|
||||||
self.is_sent_start = value
|
self.is_sent_start = value
|
||||||
|
|
||||||
property is_sent_start:
|
@property
|
||||||
|
def is_sent_start(self):
|
||||||
"""A boolean value indicating whether the token starts a sentence.
|
"""A boolean value indicating whether the token starts a sentence.
|
||||||
`None` if unknown. Defaults to `True` for the first token in the `Doc`.
|
`None` if unknown. Defaults to `True` for the first token in the `Doc`.
|
||||||
|
|
||||||
RETURNS (bool / None): Whether the token starts a sentence.
|
RETURNS (bool / None): Whether the token starts a sentence.
|
||||||
None if unknown.
|
None if unknown.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if self.c.sent_start == 0:
|
if self.c.sent_start == 0:
|
||||||
return None
|
return None
|
||||||
elif self.c.sent_start < 0:
|
elif self.c.sent_start < 0:
|
||||||
|
@ -523,7 +529,8 @@ cdef class Token:
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def __set__(self, value):
|
@is_sent_start.setter
|
||||||
|
def is_sent_start(self, value):
|
||||||
if self.doc.has_annotation("DEP"):
|
if self.doc.has_annotation("DEP"):
|
||||||
raise ValueError(Errors.E043)
|
raise ValueError(Errors.E043)
|
||||||
if value is None:
|
if value is None:
|
||||||
|
@ -535,7 +542,8 @@ cdef class Token:
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E044.format(value=value))
|
raise ValueError(Errors.E044.format(value=value))
|
||||||
|
|
||||||
property is_sent_end:
|
@property
|
||||||
|
def is_sent_end(self):
|
||||||
"""A boolean value indicating whether the token ends a sentence.
|
"""A boolean value indicating whether the token ends a sentence.
|
||||||
`None` if unknown. Defaults to `True` for the last token in the `Doc`.
|
`None` if unknown. Defaults to `True` for the last token in the `Doc`.
|
||||||
|
|
||||||
|
@ -544,7 +552,6 @@ cdef class Token:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/token#is_sent_end
|
DOCS: https://spacy.io/api/token#is_sent_end
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if self.i + 1 == len(self.doc):
|
if self.i + 1 == len(self.doc):
|
||||||
return True
|
return True
|
||||||
elif self.doc[self.i+1].is_sent_start is None:
|
elif self.doc[self.i+1].is_sent_start is None:
|
||||||
|
@ -554,7 +561,8 @@ cdef class Token:
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def __set__(self, value):
|
@is_sent_end.setter
|
||||||
|
def is_sent_end(self, value):
|
||||||
raise ValueError(Errors.E196)
|
raise ValueError(Errors.E196)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -682,20 +690,21 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return not Token.missing_head(self.c)
|
return not Token.missing_head(self.c)
|
||||||
|
|
||||||
property head:
|
@property
|
||||||
|
def head(self):
|
||||||
"""The syntactic parent, or "governor", of this token.
|
"""The syntactic parent, or "governor", of this token.
|
||||||
If token.has_head() is `False`, this method will return itself.
|
If token.has_head() is `False`, this method will return itself.
|
||||||
|
|
||||||
RETURNS (Token): The token predicted by the parser to be the head of
|
RETURNS (Token): The token predicted by the parser to be the head of
|
||||||
the current token.
|
the current token.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if not self.has_head():
|
if not self.has_head():
|
||||||
return self
|
return self
|
||||||
else:
|
else:
|
||||||
return self.doc[self.i + self.c.head]
|
return self.doc[self.i + self.c.head]
|
||||||
|
|
||||||
def __set__(self, Token new_head):
|
@head.setter
|
||||||
|
def head(self, Token new_head):
|
||||||
# This function sets the head of self to new_head and updates the
|
# This function sets the head of self to new_head and updates the
|
||||||
# counters for left/right dependents and left/right corner for the
|
# counters for left/right dependents and left/right corner for the
|
||||||
# new and the old head
|
# new and the old head
|
||||||
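A hedged reading example for the `head` property documented above (it assumes the small English model is installed via `python -m spacy download en_core_web_sm`; the sentence is illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability")
for token in doc:
    # Each token and its syntactic governor; the root token is its own head.
    print(token.text, "->", token.head.text)
```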
|
@ -744,20 +753,22 @@ cdef class Token:
|
||||||
queue.append(child)
|
queue.append(child)
|
||||||
return tuple([w for w in output if w.i != self.i])
|
return tuple([w for w in output if w.i != self.i])
|
||||||
|
|
||||||
property ent_type:
|
@property
|
||||||
|
def ent_type(self):
|
||||||
"""RETURNS (uint64): Named entity type."""
|
"""RETURNS (uint64): Named entity type."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.ent_type
|
return self.c.ent_type
|
||||||
|
|
||||||
def __set__(self, ent_type):
|
@ent_type.setter
|
||||||
|
def ent_type(self, ent_type):
|
||||||
self.c.ent_type = ent_type
|
self.c.ent_type = ent_type
|
||||||
|
|
||||||
property ent_type_:
|
@property
|
||||||
|
def ent_type_(self):
|
||||||
"""RETURNS (str): Named entity type."""
|
"""RETURNS (str): Named entity type."""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.ent_type]
|
return self.vocab.strings[self.c.ent_type]
|
||||||
|
|
||||||
def __set__(self, ent_type):
|
@ent_type_.setter
|
||||||
|
def ent_type_(self, ent_type):
|
||||||
self.c.ent_type = self.vocab.strings.add(ent_type)
|
self.c.ent_type = self.vocab.strings.add(ent_type)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -784,40 +795,44 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.iob_strings()[self.c.ent_iob]
|
return self.iob_strings()[self.c.ent_iob]
|
||||||
|
|
||||||
property ent_id:
|
@property
|
||||||
|
def ent_id(self):
|
||||||
"""RETURNS (uint64): ID of the entity the token is an instance of,
|
"""RETURNS (uint64): ID of the entity the token is an instance of,
|
||||||
if any.
|
if any.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.c.ent_id
|
return self.c.ent_id
|
||||||
|
|
||||||
def __set__(self, hash_t key):
|
@ent_id.setter
|
||||||
|
def ent_id(self, hash_t key):
|
||||||
self.c.ent_id = key
|
self.c.ent_id = key
|
||||||
|
|
||||||
property ent_id_:
|
@property
|
||||||
|
def ent_id_(self):
|
||||||
"""RETURNS (str): ID of the entity the token is an instance of,
|
"""RETURNS (str): ID of the entity the token is an instance of,
|
||||||
if any.
|
if any.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.ent_id]
|
return self.vocab.strings[self.c.ent_id]
|
||||||
|
|
||||||
def __set__(self, name):
|
@ent_id_.setter
|
||||||
|
def ent_id_(self, name):
|
||||||
self.c.ent_id = self.vocab.strings.add(name)
|
self.c.ent_id = self.vocab.strings.add(name)
|
||||||
|
|
||||||
property ent_kb_id:
|
@property
|
||||||
|
def ent_kb_id(self):
|
||||||
"""RETURNS (uint64): Named entity KB ID."""
|
"""RETURNS (uint64): Named entity KB ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.ent_kb_id
|
return self.c.ent_kb_id
|
||||||
|
|
||||||
def __set__(self, attr_t ent_kb_id):
|
@ent_kb_id.setter
|
||||||
|
def ent_kb_id(self, attr_t ent_kb_id):
|
||||||
self.c.ent_kb_id = ent_kb_id
|
self.c.ent_kb_id = ent_kb_id
|
||||||
|
|
||||||
property ent_kb_id_:
|
@property
|
||||||
|
def ent_kb_id_(self):
|
||||||
"""RETURNS (str): Named entity KB ID."""
|
"""RETURNS (str): Named entity KB ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.ent_kb_id]
|
return self.vocab.strings[self.c.ent_kb_id]
|
||||||
|
|
||||||
def __set__(self, ent_kb_id):
|
@ent_kb_id_.setter
|
||||||
|
def ent_kb_id_(self, ent_kb_id):
|
||||||
self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
|
self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -840,15 +855,16 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.lower]
|
return self.vocab.strings[self.c.lex.lower]
|
||||||
|
|
||||||
property norm_:
|
@property
|
||||||
|
def norm_(self):
|
||||||
"""RETURNS (str): The token's norm, i.e. a normalised form of the
|
"""RETURNS (str): The token's norm, i.e. a normalised form of the
|
||||||
token text. Usually set in the language's tokenizer exceptions or
|
token text. Usually set in the language's tokenizer exceptions or
|
||||||
norm exceptions.
|
norm exceptions.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.norm]
|
return self.vocab.strings[self.norm]
|
||||||
|
|
||||||
def __set__(self, str norm_):
|
@norm_.setter
|
||||||
|
def norm_(self, str norm_):
|
||||||
self.c.norm = self.vocab.strings.add(norm_)
|
self.c.norm = self.vocab.strings.add(norm_)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -879,32 +895,35 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.lang]
|
return self.vocab.strings[self.c.lex.lang]
|
||||||
|
|
||||||
property lemma_:
|
@property
|
||||||
|
def lemma_(self):
|
||||||
"""RETURNS (str): The token lemma, i.e. the base form of the word,
|
"""RETURNS (str): The token lemma, i.e. the base form of the word,
|
||||||
with no inflectional suffixes.
|
with no inflectional suffixes.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.lemma]
|
return self.vocab.strings[self.c.lemma]
|
||||||
|
|
||||||
def __set__(self, str lemma_):
|
@lemma_.setter
|
||||||
|
def lemma_(self, str lemma_):
|
||||||
self.c.lemma = self.vocab.strings.add(lemma_)
|
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||||
|
|
||||||
property pos_:
|
@property
|
||||||
|
def pos_(self):
|
||||||
"""RETURNS (str): Coarse-grained part-of-speech tag."""
|
"""RETURNS (str): Coarse-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
|
||||||
return parts_of_speech.NAMES[self.c.pos]
|
return parts_of_speech.NAMES[self.c.pos]
|
||||||
|
|
||||||
def __set__(self, pos_name):
|
@pos_.setter
|
||||||
|
def pos_(self, pos_name):
|
||||||
if pos_name not in parts_of_speech.IDS:
|
if pos_name not in parts_of_speech.IDS:
|
||||||
raise ValueError(Errors.E1021.format(pp=pos_name))
|
raise ValueError(Errors.E1021.format(pp=pos_name))
|
||||||
self.c.pos = parts_of_speech.IDS[pos_name]
|
self.c.pos = parts_of_speech.IDS[pos_name]
|
||||||
|
|
||||||
property tag_:
|
@property
|
||||||
|
def tag_(self):
|
||||||
"""RETURNS (str): Fine-grained part-of-speech tag."""
|
"""RETURNS (str): Fine-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.tag]
|
return self.vocab.strings[self.c.tag]
|
||||||
|
|
||||||
def __set__(self, tag):
|
@tag_.setter
|
||||||
|
def tag_(self, tag):
|
||||||
self.tag = self.vocab.strings.add(tag)
|
self.tag = self.vocab.strings.add(tag)
|
||||||
|
|
||||||
def has_dep(self):
|
def has_dep(self):
|
||||||
|
@ -915,12 +934,13 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return not Token.missing_dep(self.c)
|
return not Token.missing_dep(self.c)
|
||||||
|
|
||||||
property dep_:
|
@property
|
||||||
|
def dep_(self):
|
||||||
"""RETURNS (str): The syntactic dependency label."""
|
"""RETURNS (str): The syntactic dependency label."""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.dep]
|
return self.vocab.strings[self.c.dep]
|
||||||
|
|
||||||
def __set__(self, str label):
|
@dep_.setter
|
||||||
|
def dep_(self, str label):
|
||||||
self.c.dep = self.vocab.strings.add(label)
|
self.c.dep = self.vocab.strings.add(label)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -88,20 +88,22 @@ cdef class Example:
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.predicted)
|
return len(self.predicted)
|
||||||
|
|
||||||
property predicted:
|
@property
|
||||||
def __get__(self):
|
def predicted(self):
|
||||||
return self.x
|
return self.x
|
||||||
|
|
||||||
def __set__(self, doc):
|
@predicted.setter
|
||||||
|
def predicted(self, doc):
|
||||||
self.x = doc
|
self.x = doc
|
||||||
self._cached_alignment = None
|
self._cached_alignment = None
|
||||||
self._cached_words_x = [t.text for t in doc]
|
self._cached_words_x = [t.text for t in doc]
|
||||||
|
|
||||||
property reference:
|
@property
|
||||||
def __get__(self):
|
def reference(self):
|
||||||
return self.y
|
return self.y
|
||||||
|
|
||||||
def __set__(self, doc):
|
@reference.setter
|
||||||
|
def reference(self, doc):
|
||||||
self.y = doc
|
self.y = doc
|
||||||
self._cached_alignment = None
|
self._cached_alignment = None
|
||||||
self._cached_words_y = [t.text for t in doc]
|
self._cached_words_y = [t.text for t in doc]
|
||||||
|
@ -420,8 +422,8 @@ cdef class Example:
|
||||||
seen_indices.update(indices)
|
seen_indices.update(indices)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
property text:
|
@property
|
||||||
def __get__(self):
|
def text(self):
|
||||||
return self.x.text
|
return self.x.text
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
|
|
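The `predicted` and `reference` properties above expose the two `Doc` objects an `Example` aligns. A hedged construction sketch (the token texts and tags are illustrative):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
predicted = nlp("I like cats")                      # Doc produced by the pipeline
example = Example.from_dict(
    predicted,
    {"words": ["I", "like", "cats"], "tags": ["PRON", "VERB", "NOUN"]},
)
print(example.predicted.text)      # the pipeline's Doc (the `x` side)
print(example.reference[2].tag_)   # gold-standard tag for "cats"
print(example.text)                # convenience property, same as example.predicted.text
```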
@ -41,7 +41,9 @@ cdef class Vocab:
|
||||||
cdef const TokenC* make_fused_token(self, substrings) except NULL
|
cdef const TokenC* make_fused_token(self, substrings) except NULL
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
||||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
||||||
|
|
||||||
cdef PreshMap _by_orth
|
cdef PreshMap _by_orth
|
||||||
|
cdef Pool _non_temp_mem
|
||||||
|
cdef vector[attr_t] _transient_orths
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
from contextlib import contextmanager
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
|
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
|
||||||
|
|
||||||
|
from cymem.cymem import Pool
|
||||||
from thinc.types import Floats1d, FloatsXd
|
from thinc.types import Floats1d, FloatsXd
|
||||||
|
|
||||||
from . import Language
|
from . import Language
|
||||||
|
@ -67,6 +69,8 @@ class Vocab:
|
||||||
def from_bytes(
|
def from_bytes(
|
||||||
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
|
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
|
||||||
) -> Vocab: ...
|
) -> Vocab: ...
|
||||||
|
@contextmanager
|
||||||
|
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
|
||||||
|
|
||||||
def pickle_vocab(vocab: Vocab) -> Any: ...
|
def pickle_vocab(vocab: Vocab) -> Any: ...
|
||||||
def unpickle_vocab(
|
def unpickle_vocab(
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
import functools
|
import functools
|
||||||
|
from contextlib import ExitStack, contextmanager
|
||||||
|
from typing import Iterator, Optional
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import get_array_module, get_current_ops
|
from thinc.api import get_array_module, get_current_ops
|
||||||
|
from preshed.maps cimport map_clear
|
||||||
|
|
||||||
from .attrs cimport LANG, ORTH
|
from .attrs cimport LANG, ORTH
|
||||||
from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
|
from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
|
||||||
|
@ -87,15 +90,22 @@ cdef class Vocab:
|
||||||
self.lookups = lookups
|
self.lookups = lookups
|
||||||
self.writing_system = writing_system
|
self.writing_system = writing_system
|
||||||
self.get_noun_chunks = get_noun_chunks
|
self.get_noun_chunks = get_noun_chunks
|
||||||
|
# During a memory_zone we replace our mem object with one
|
||||||
|
# that's passed to us. We keep a reference to our non-temporary
|
||||||
|
# memory here, in case we need to make an allocation we want to
|
||||||
|
# guarantee is not temporary. This is also how we check whether
|
||||||
|
# we're in a memory zone: we check whether self.mem is self._non_temp_mem
|
||||||
|
self._non_temp_mem = self.mem
|
||||||
|
|
||||||
property vectors:
|
@property
|
||||||
def __get__(self):
|
def vectors(self):
|
||||||
return self._vectors
|
return self._vectors
|
||||||
|
|
||||||
def __set__(self, vectors):
|
@vectors.setter
|
||||||
|
def vectors(self, vectors):
|
||||||
if hasattr(vectors, "strings"):
|
if hasattr(vectors, "strings"):
|
||||||
for s in vectors.strings:
|
for s in vectors.strings:
|
||||||
self.strings.add(s)
|
self.strings.add(s, allow_transient=False)
|
||||||
self._vectors = vectors
|
self._vectors = vectors
|
||||||
self._vectors.strings = self.strings
|
self._vectors.strings = self.strings
|
||||||
|
|
||||||
|
@ -106,6 +116,10 @@ cdef class Vocab:
|
||||||
langfunc = self.lex_attr_getters.get(LANG, None)
|
langfunc = self.lex_attr_getters.get(LANG, None)
|
||||||
return langfunc("_") if langfunc else ""
|
return langfunc("_") if langfunc else ""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def in_memory_zone(self) -> bool:
|
||||||
|
return self.mem is not self._non_temp_mem
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The current number of lexemes stored.
|
"""The current number of lexemes stored.
|
||||||
|
|
||||||
|
@ -113,6 +127,33 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
|
||||||
|
"""Begin a block where resources allocated during the block will
|
||||||
|
be freed at the end of it. If a resource was created within the
|
||||||
|
memory zone block, accessing it outside the block is invalid.
|
||||||
|
Behaviour of this invalid access is undefined. Memory zones should
|
||||||
|
not be nested.
|
||||||
|
|
||||||
|
The memory zone is helpful for services that need to process large
|
||||||
|
volumes of text with a defined memory budget.
|
||||||
|
"""
|
||||||
|
if mem is None:
|
||||||
|
mem = Pool()
|
||||||
|
# The ExitStack allows programmatic nested context managers.
|
||||||
|
# We don't know how many we need, so it would be awkward to have
|
||||||
|
# them as nested blocks.
|
||||||
|
with ExitStack() as stack:
|
||||||
|
contexts = [stack.enter_context(self.strings.memory_zone(mem))]
|
||||||
|
if hasattr(self.morphology, "memory_zone"):
|
||||||
|
contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
|
||||||
|
if hasattr(self._vectors, "memory_zone"):
|
||||||
|
contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
|
||||||
|
self.mem = mem
|
||||||
|
yield mem
|
||||||
|
self._clear_transient_orths()
|
||||||
|
self.mem = self._non_temp_mem
|
||||||
|
|
||||||
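The `memory_zone` context manager added above is the user-facing entry point for transient allocations: strings and lexemes created inside the block are released when it exits. A hedged usage sketch against the API shape introduced in this diff (the texts are illustrative):

```python
import spacy

nlp = spacy.blank("en")
texts = ["some transient batch of text", "another short document"]

# Vocab and string-store entries created while processing these texts are
# freed when the zone closes, which keeps long-running services inside a
# fixed memory budget. Docs created inside the zone must not be used after
# the block exits.
with nlp.vocab.memory_zone():
    for doc in nlp.pipe(texts):
        print(doc[0].text)
```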
def add_flag(self, flag_getter, int flag_id=-1):
|
def add_flag(self, flag_getter, int flag_id=-1):
|
||||||
"""Set a new boolean flag to words in the vocabulary.
|
"""Set a new boolean flag to words in the vocabulary.
|
||||||
|
|
||||||
|
@ -147,8 +188,7 @@ cdef class Vocab:
|
||||||
|
|
||||||
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
|
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
|
||||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
||||||
`Lexeme` if necessary using memory acquired from the given pool. If the
|
`Lexeme` if necessary.
|
||||||
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
|
|
||||||
"""
|
"""
|
||||||
if string == "":
|
if string == "":
|
||||||
return &EMPTY_LEXEME
|
return &EMPTY_LEXEME
|
||||||
|
@ -179,19 +219,11 @@ cdef class Vocab:
|
||||||
return self._new_lexeme(mem, self.strings[orth])
|
return self._new_lexeme(mem, self.strings[orth])
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
|
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
|
||||||
# I think this heuristic is bad, and the Vocab should always
|
# The mem argument is deprecated, replaced by memory zones. Same with
|
||||||
# own the lexemes. It avoids weird bugs this way, as it's how the thing
|
# this size heuristic.
|
||||||
# was originally supposed to work. The best solution to the growing
|
|
||||||
# memory use is to periodically reset the vocab, which is an action
|
|
||||||
# that should be up to the user to do (so we don't need to keep track
|
|
||||||
# of the doc ownership).
|
|
||||||
# TODO: Change the C API so that the mem isn't passed in here.
|
|
||||||
mem = self.mem
|
mem = self.mem
|
||||||
# if len(string) < 3 or self.length < 10000:
|
|
||||||
# mem = self.mem
|
|
||||||
cdef bint is_oov = mem is not self.mem
|
|
||||||
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
||||||
lex.orth = self.strings.add(string)
|
lex.orth = self.strings.add(string, allow_transient=True)
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
if self.vectors is not None and hasattr(self.vectors, "key2row"):
|
if self.vectors is not None and hasattr(self.vectors, "key2row"):
|
||||||
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
|
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
|
||||||
|
@ -201,18 +233,25 @@ cdef class Vocab:
|
||||||
for attr, func in self.lex_attr_getters.items():
|
for attr, func in self.lex_attr_getters.items():
|
||||||
value = func(string)
|
value = func(string)
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
value = self.strings.add(value)
|
value = self.strings.add(value, allow_transient=True)
|
||||||
if value is not None:
|
if value is not None:
|
||||||
Lexeme.set_struct_attr(lex, attr, value)
|
Lexeme.set_struct_attr(lex, attr, value)
|
||||||
if not is_oov:
|
self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
|
||||||
self._add_lex_to_vocab(lex.orth, lex)
|
|
||||||
if lex == NULL:
|
if lex == NULL:
|
||||||
raise ValueError(Errors.E085.format(string=string))
|
raise ValueError(Errors.E085.format(string=string))
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
|
||||||
self._by_orth.set(lex.orth, <void*>lex)
|
self._by_orth.set(lex.orth, <void*>lex)
|
||||||
self.length += 1
|
self.length += 1
|
||||||
|
if is_transient and self.in_memory_zone:
|
||||||
|
self._transient_orths.push_back(lex.orth)
|
||||||
|
|
||||||
|
def _clear_transient_orths(self):
|
||||||
|
"""Remove transient lexemes from the index (generally at the end of the memory zone)"""
|
||||||
|
for orth in self._transient_orths:
|
||||||
|
map_clear(self._by_orth.c_map, orth)
|
||||||
|
self._transient_orths.clear()
|
||||||
|
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
"""Check whether the string or int key has an entry in the vocabulary.
|
"""Check whether the string or int key has an entry in the vocabulary.
|
||||||
|
@ -264,7 +303,7 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
if isinstance(id_or_string, str):
|
if isinstance(id_or_string, str):
|
||||||
orth = self.strings.add(id_or_string)
|
orth = self.strings.add(id_or_string, allow_transient=True)
|
||||||
else:
|
else:
|
||||||
orth = id_or_string
|
orth = id_or_string
|
||||||
return Lexeme(self, orth)
|
return Lexeme(self, orth)
|
||||||
|
@ -416,7 +455,7 @@ cdef class Vocab:
|
||||||
DOCS: https://spacy.io/api/vocab#get_vector
|
DOCS: https://spacy.io/api/vocab#get_vector
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, str):
|
if isinstance(orth, str):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth, allow_transient=True)
|
||||||
cdef Lexeme lex = self[orth]
|
cdef Lexeme lex = self[orth]
|
||||||
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
if self.has_vector(key):
|
if self.has_vector(key):
|
||||||
|
@ -435,7 +474,7 @@ cdef class Vocab:
|
||||||
DOCS: https://spacy.io/api/vocab#set_vector
|
DOCS: https://spacy.io/api/vocab#set_vector
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, str):
|
if isinstance(orth, str):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth, allow_transient=False)
|
||||||
cdef Lexeme lex = self[orth]
|
cdef Lexeme lex = self[orth]
|
||||||
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
if self.vectors.is_full and key not in self.vectors:
|
if self.vectors.is_full and key not in self.vectors:
|
||||||
|
@ -459,16 +498,17 @@ cdef class Vocab:
|
||||||
DOCS: https://spacy.io/api/vocab#has_vector
|
DOCS: https://spacy.io/api/vocab#has_vector
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, str):
|
if isinstance(orth, str):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth, allow_transient=True)
|
||||||
cdef Lexeme lex = self[orth]
|
cdef Lexeme lex = self[orth]
|
||||||
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
return key in self.vectors
|
return key in self.vectors
|
||||||
|
|
||||||
property lookups:
|
@property
|
||||||
def __get__(self):
|
def lookups(self):
|
||||||
return self._lookups
|
return self._lookups
|
||||||
|
|
||||||
def __set__(self, lookups):
|
@lookups.setter
|
||||||
|
def lookups(self, lookups):
|
||||||
self._lookups = lookups
|
self._lookups = lookups
|
||||||
if lookups.has_table("lexeme_norm"):
|
if lookups.has_table("lexeme_norm"):
|
||||||
self.lex_attr_getters[NORM] = util.add_lookups(
|
self.lex_attr_getters[NORM] = util.add_lookups(
|
||||||
|
|
|
@ -46,10 +46,10 @@ as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by
|
||||||
appending `_` as in `token.dep_`.
|
appending `_` as in `token.dep_`.
|
||||||
|
|
||||||
| Attribute | Description |
|
| Attribute | Description |
|
||||||
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `DEP` | The token's dependency label. ~~str~~ |
|
| `DEP` | The token's dependency label. ~~str~~ |
|
||||||
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
|
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
|
||||||
| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
|
| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
|
||||||
| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ |
|
| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ |
|
||||||
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
||||||
| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ |
|
| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ |
|
||||||
|
|
|
@ -567,7 +567,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
|
||||||
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
||||||
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
||||||
✔ Good amount of examples for all labels
|
✔ Good amount of examples for all labels
|
||||||
✔ Examples without occurences available for all labels
|
✔ Examples without occurrences available for all labels
|
||||||
✔ No entities consisting of or starting/ending with whitespace
|
✔ No entities consisting of or starting/ending with whitespace
|
||||||
|
|
||||||
=========================== Part-of-speech Tagging ===========================
|
=========================== Part-of-speech Tagging ===========================
|
||||||
|
@ -1320,7 +1320,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
|
||||||
|
|
||||||
## find-threshold {id="find-threshold",version="3.5",tag="command"}
|
## find-threshold {id="find-threshold",version="3.5",tag="command"}
|
||||||
|
|
||||||
Runs prediction trials for a trained model with varying tresholds to maximize
|
Runs prediction trials for a trained model with varying thresholds to maximize
|
||||||
the specified metric. The search space for the threshold is traversed linearly
|
the specified metric. The search space for the threshold is traversed linearly
|
||||||
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
||||||
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
||||||
|
|
|
@ -61,13 +61,13 @@ architectures and their arguments and hyperparameters.
|
||||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
|
||||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||||
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
||||||
| `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
| `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
||||||
|
@ -101,7 +101,7 @@ custom knowledge base, you should either call
|
||||||
[`initialize`](/api/entitylinker#initialize) call.
|
[`initialize`](/api/entitylinker#initialize) call.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
|
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
|
||||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||||
|
@ -114,7 +114,7 @@ custom knowledge base, you should either call
|
||||||
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
|
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
|
||||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||||
|
|
||||||
## EntityLinker.\_\_call\_\_ {id="call",tag="method"}
|
## EntityLinker.\_\_call\_\_ {id="call",tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -173,7 +173,7 @@ happens automatically after the component has been added to the pipeline using
|
||||||
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
|
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
|
||||||
with `overwrite_ents=True`, existing entities will be replaced if they overlap
|
with `overwrite_ents=True`, existing entities will be replaced if they overlap
|
||||||
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
|
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
|
||||||
longer patterns over shorter, and if equal the match occuring first in the Doc
|
longer patterns over shorter, and if equal the match occurring first in the Doc
|
||||||
is chosen.
|
is chosen.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
|
@ -148,8 +148,9 @@ Whether a feature/value pair is in the analysis.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | --------------------------------------------- |
|
| ------------ | --------------------------------------------------------------------- |
|
||||||
| **RETURNS** | A feature/value pair in the analysis. ~~str~~ |
|
| `feature` | A feature/value pair. ~~str~~ |
|
||||||
|
| **RETURNS** | Whether the feature/value pair is contained in the analysis. ~~bool~~ |
|
||||||
|
|
||||||
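A hedged example of the membership check summarised in the corrected table above (it assumes the small English model is installed; the exact features depend on the model's predictions):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She sings.")
morph = doc[1].morph                  # MorphAnalysis for "sings"
print(morph)                          # e.g. Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
print("Number=Sing" in morph)         # True if that feature/value pair was assigned
```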
### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}
|
### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -288,7 +288,7 @@ it – so no NP-level coordination, no prepositional phrases, and no relative
|
||||||
clauses.
|
clauses.
|
||||||
|
|
||||||
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
|
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
|
||||||
has not been implemeted for the given language, a `NotImplementedError` is
|
has not been implemented for the given language, a `NotImplementedError` is
|
||||||
raised.
|
raised.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
|
@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the
|
||||||
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
||||||
| `width` | The width of the last hidden layer. ~~int~~ |
|
| `width` | The width of the last hidden layer. ~~int~~ |
|
||||||
|
|
||||||
### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"}
|
### TransformerData.empty {id="transformerdata-empty",tag="classmethod"}
|
||||||
|
|
||||||
Create an empty `TransformerData` container.
|
Create an empty `TransformerData` container.
|
||||||
|
|
||||||
|
|
|
@ -832,7 +832,7 @@ retrieve and add to them.
|
||||||
|
|
||||||
After creation, the component needs to be
|
After creation, the component needs to be
|
||||||
[initialized](/usage/training#initialization). This method can define the
|
[initialized](/usage/training#initialization). This method can define the
|
||||||
relevant labels in two ways: explicitely by setting the `labels` argument in the
|
relevant labels in two ways: explicitly by setting the `labels` argument in the
|
||||||
[`initialize` block](/api/data-formats#config-initialize) of the config, or
|
[`initialize` block](/api/data-formats#config-initialize) of the config, or
|
||||||
implicitly by deducing them from the `get_examples` callback that generates the
|
implicitly by deducing them from the `get_examples` callback that generates the
|
||||||
full **training data set**, or a representative sample.
|
full **training data set**, or a representative sample.
|
||||||
|
|
|
@ -1899,7 +1899,7 @@ the two words.
|
||||||
"Shore": ("coast", 0.732257),
|
"Shore": ("coast", 0.732257),
|
||||||
"Precautionary": ("caution", 0.490973),
|
"Precautionary": ("caution", 0.490973),
|
||||||
"hopelessness": ("sadness", 0.742366),
|
"hopelessness": ("sadness", 0.742366),
|
||||||
"Continous": ("continuous", 0.732549),
|
"Continuous": ("continuous", 0.732549),
|
||||||
"Disemboweled": ("corpse", 0.499432),
|
"Disemboweled": ("corpse", 0.499432),
|
||||||
"biostatistician": ("scientist", 0.339724),
|
"biostatistician": ("scientist", 0.339724),
|
||||||
"somewheres": ("somewheres", 0.402736),
|
"somewheres": ("somewheres", 0.402736),
|
||||||
|
|
|
@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the
|
||||||
dependency check, set `check_requirements: false` in your project's
|
dependency check, set `check_requirements: false` in your project's
|
||||||
`project.yml`.
|
`project.yml`.
|
||||||
|
|
||||||
### 4. Run a workflow {id="run-workfow"}
|
### 4. Run a workflow {id="run-workflow"}
|
||||||
|
|
||||||
> #### project.yml
|
> #### project.yml
|
||||||
>
|
>
|
||||||
|
@ -286,7 +286,7 @@ pipelines.
|
||||||
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
||||||
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
||||||
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
||||||
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
|
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
|
||||||
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
||||||
|
|
|
@ -306,7 +306,9 @@ installed in the same environment – that's it.
|
||||||
|
|
||||||
### Loading probability tables into existing models
|
### Loading probability tables into existing models
|
||||||
|
|
||||||
You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`.
|
You can load a probability table from
|
||||||
|
[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an
|
||||||
|
existing spaCy model like `en_core_web_sm`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Requirements: pip install spacy-lookups-data
|
# Requirements: pip install spacy-lookups-data
|
||||||
|
@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"])
|
||||||
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
|
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
|
||||||
```
|
```
|
||||||
|
|
||||||
When training a model from scratch you can also specify probability tables in the `config.cfg`.
|
When training a model from scratch you can also specify probability tables in
|
||||||
|
the `config.cfg`.
|
||||||
|
|
||||||
```ini {title="config.cfg (excerpt)"}
|
```ini {title="config.cfg (excerpt)"}
|
||||||
[initialize.lookups]
|
[initialize.lookups]
|
||||||
|
@ -346,8 +349,8 @@ them**!
|
||||||
To stick with the theme of
|
To stick with the theme of
|
||||||
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
||||||
consider the following custom spaCy
|
consider the following custom spaCy
|
||||||
[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a
|
[pipeline component](/usage/processing-pipelines#custom-components) that prints
|
||||||
snake when it's called:
|
a snake when it's called:
|
||||||
|
|
||||||
> #### Package directory structure
|
> #### Package directory structure
|
||||||
>
|
>
|
||||||
|
|
|
@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
|
||||||
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
||||||
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
||||||
✔ Good amount of examples for all labels
|
✔ Good amount of examples for all labels
|
||||||
✔ Examples without occurences available for all labels
|
✔ Examples without occurrences available for all labels
|
||||||
✔ No entities consisting of or starting/ending with whitespace
|
✔ No entities consisting of or starting/ending with whitespace
|
||||||
|
|
||||||
=========================== Part-of-speech Tagging ===========================
|
=========================== Part-of-speech Tagging ===========================
|
||||||
|
|
|
@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under
|
||||||
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
|
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
|
||||||
in the [transformer API docs](/api/architectures#TransformerModel).
|
in the [transformer API docs](/api/architectures#TransformerModel).
|
||||||
|
|
||||||
`spacy-transfomers` v1.1 also adds support for `transformer_config` settings
|
`spacy-transformers` v1.1 also adds support for `transformer_config` settings
|
||||||
such as `output_attentions`. Additional output is stored under
|
such as `output_attentions`. Additional output is stored under
|
||||||
`TransformerData.model_output`. More details are in the
|
`TransformerData.model_output`. More details are in the
|
||||||
[TransformerModel docs](/api/architectures#TransformerModel). The training speed
|
[TransformerModel docs](/api/architectures#TransformerModel). The training speed
|
||||||
|
|
|
@ -31,6 +31,12 @@
|
||||||
"name": "Bengali",
|
"name": "Bengali",
|
||||||
"has_examples": true
|
"has_examples": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"code": "bo",
|
||||||
|
"name": "Tibetan",
|
||||||
|
"example": "འདི་ཚིག་གྲུབ་རེད།",
|
||||||
|
"has_examples": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"code": "ca",
|
"code": "ca",
|
||||||
"name": "Catalan",
|
"name": "Catalan",
|
||||||
|
@ -480,6 +486,12 @@
|
||||||
],
|
],
|
||||||
"example": "这是一个用于示例的句子。",
|
"example": "这是一个用于示例的句子。",
|
||||||
"has_examples": true
|
"has_examples": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"code": "kmr",
|
||||||
|
"name": "Kurdish Kurmanji",
|
||||||
|
"example": "Ev hevokek e",
|
||||||
|
"has_examples": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"licenses": [
|
"licenses": [
|
||||||
|
|
File diff suppressed because it is too large
|
@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
|
||||||
}
|
}
|
||||||
|
|
||||||
const navAlert = (
|
const navAlert = (
|
||||||
<Link to="https://form.typeform.com/to/WlflqP1b" noLinkLayout>
|
<Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
|
||||||
💥 Interested in <strong>Premium spaCy Models</strong>?
|
💥 <strong>New:</strong> Case study with S&P Global
|
||||||
</Link>
|
</Link>
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|