Mirror of https://github.com/explosion/spaCy.git

Commit 5cbe621cde: Merge branch 'master' into pr/13515
.github/workflows/cibuildwheel.yml (new file, vendored, 92 lines)
@@ -0,0 +1,92 @@
name: Build

on:
  push:
    tags:
      # ytf did they invent their own syntax that's almost regex?
      # ** matches 'zero or more of any character'
      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # macos-13 is an intel runner, macos-14 is apple silicon
        os: [ubuntu-latest, windows-latest, macos-13]

    steps:
      - uses: actions/checkout@v4

      - name: Build wheels
        uses: pypa/cibuildwheel@v2.19.1
        env:
          CIBW_SOME_OPTION: value
        with:
          package-dir: .
          output-dir: wheelhouse
          config-file: "{package}/pyproject.toml"

      - uses: actions/upload-artifact@v4
        with:
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl

  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build sdist
        run: pipx run build --sdist

      - uses: actions/upload-artifact@v4
        with:
          name: cibw-sdist
          path: dist/*.tar.gz

  create_release:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      checks: write
      actions: read
      issues: read
      packages: write
      pull-requests: read
      repository-projects: read
      statuses: read
    steps:
      - name: Get the tag name and determine if it's a prerelease
        id: get_tag_info
        run: |
          FULL_TAG=${GITHUB_REF#refs/tags/}
          if [[ $FULL_TAG == release-* ]]; then
            TAG_NAME=${FULL_TAG#release-}
            IS_PRERELEASE=false
          elif [[ $FULL_TAG == prerelease-* ]]; then
            TAG_NAME=${FULL_TAG#prerelease-}
            IS_PRERELEASE=true
          else
            echo "Tag does not match expected patterns" >&2
            exit 1
          fi
          echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
          echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
          echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV

      - uses: actions/download-artifact@v4
        with:
          # unpacks all CIBW artifacts into dist/
          pattern: cibw-*
          path: dist
          merge-multiple: true

      - name: Create Draft Release
        id: create_release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: ${{ env.TAG_NAME }}
          draft: true
          prerelease: ${{ env.IS_PRERELEASE }}
          files: "./dist/*"
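The create_release step derives the release name and prerelease flag purely from the tag prefix. A minimal Python sketch of the same mapping (a hypothetical helper for checking tag handling locally, not part of the workflow):

    def parse_release_tag(full_tag: str) -> tuple[str, bool]:
        """Mirror the workflow's bash logic: strip the prefix, flag prereleases."""
        if full_tag.startswith("release-"):
            return full_tag[len("release-"):], False
        if full_tag.startswith("prerelease-"):
            return full_tag[len("prerelease-"):], True
        raise ValueError("Tag does not match expected patterns")

    assert parse_release_tag("release-v3.8.0") == ("v3.8.0", False)
    assert parse_release_tag("prerelease-v3.8.0.dev0") == ("v3.8.0.dev0", True)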
.github/workflows/publish_pypi.yml (new file, vendored, 29 lines)
@@ -0,0 +1,29 @@
# The cibuildwheel action triggers on creation of a release; this one
# triggers on publication.
# The expected workflow is to create a draft release, let the wheels
# upload, and then hit 'publish', which uploads to PyPI.

on:
  release:
    types:
      - published

jobs:
  upload_pypi:
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/spacy
    permissions:
      id-token: write
      contents: read
    if: github.event_name == 'release' && github.event.action == 'published'
    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: robinraju/release-downloader@v1
        with:
          tag: ${{ github.event.release.tag_name }}
          fileName: '*'
          out-file-path: 'dist'

      - uses: pypa/gh-action-pypi-publish@release/v1
pyproject.toml
@@ -11,5 +11,58 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
 
+[tool.cibuildwheel]
+build = "*"
+skip = "pp* cp36* cp37* cp38* *-win32"
+test-skip = ""
+free-threaded-support = false
+
+archs = ["native"]
+
+build-frontend = "default"
+config-settings = {}
+dependency-versions = "pinned"
+environment = { PIP_CONSTRAINT = "build-constraints.txt" }
+
+environment-pass = []
+build-verbosity = 0
+
+before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
+before-build = "pip install -r requirements.txt && python setup.py clean"
+repair-wheel-command = ""
+
+test-command = ""
+before-test = ""
+test-requires = []
+test-extras = []
+
+container-engine = "docker"
+
+manylinux-x86_64-image = "manylinux2014"
+manylinux-i686-image = "manylinux2014"
+manylinux-aarch64-image = "manylinux2014"
+manylinux-ppc64le-image = "manylinux2014"
+manylinux-s390x-image = "manylinux2014"
+manylinux-pypy_x86_64-image = "manylinux2014"
+manylinux-pypy_i686-image = "manylinux2014"
+manylinux-pypy_aarch64-image = "manylinux2014"
+
+musllinux-x86_64-image = "musllinux_1_2"
+musllinux-i686-image = "musllinux_1_2"
+musllinux-aarch64-image = "musllinux_1_2"
+musllinux-ppc64le-image = "musllinux_1_2"
+musllinux-s390x-image = "musllinux_1_2"
+
+[tool.cibuildwheel.linux]
+repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
+
+[tool.cibuildwheel.macos]
+repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
+
+[tool.cibuildwheel.windows]
+
+[tool.cibuildwheel.pyodide]
+
+
 [tool.isort]
 profile = "black"
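The skip list uses cibuildwheel's fnmatch-style build identifiers. A small illustrative check (assuming standard identifier names such as "cp39-manylinux_x86_64"; this helper is not part of the repo):

    from fnmatch import fnmatch

    SKIP = "pp* cp36* cp37* cp38* *-win32".split()

    def is_skipped(identifier: str) -> bool:
        """True if any skip pattern matches the cibuildwheel build identifier."""
        return any(fnmatch(identifier, pat) for pat in SKIP)

    assert is_skipped("cp37-manylinux_x86_64")   # CPython 3.7: skipped
    assert is_skipped("pp310-manylinux_x86_64")  # PyPy: skipped
    assert is_skipped("cp39-win32")              # 32-bit Windows: skipped
    assert not is_skipped("cp312-win_amd64")     # 64-bit Windows: built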
requirements.txt
@@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
setup.cfg
@@ -66,7 +66,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
    langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.5"
+__version__ = "3.8.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/lang/bo/__init__.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class TibetanDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Tibetan(Language):
    lang = "bo"
    Defaults = TibetanDefaults


__all__ = ["Tibetan"]
spacy/lang/bo/examples.py (new file, 16 lines)
@@ -0,0 +1,16 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.bo.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
    "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
    "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
    "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
    "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
    "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
]
spacy/lang/bo/lex_attrs.py (new file, 65 lines)
@@ -0,0 +1,65 @@
from ...attrs import LIKE_NUM

# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals

_num_words = [
    "ཀླད་ཀོར་",
    "གཅིག་",
    "གཉིས་",
    "གསུམ་",
    "བཞི་",
    "ལྔ་",
    "དྲུག་",
    "བདུན་",
    "བརྒྱད་",
    "དགུ་",
    "བཅུ་",
    "བཅུ་གཅིག་",
    "བཅུ་གཉིས་",
    "བཅུ་གསུམ་",
    "བཅུ་བཞི་",
    "བཅུ་ལྔ་",
    "བཅུ་དྲུག་",
    "བཅུ་བདུན་",
    "བཅུ་པརྒྱད",
    "བཅུ་དགུ་",
    "ཉི་ཤུ་",
    "སུམ་ཅུ",
    "བཞི་བཅུ",
    "ལྔ་བཅུ",
    "དྲུག་ཅུ",
    "བདུན་ཅུ",
    "བརྒྱད་ཅུ",
    "དགུ་བཅུ",
    "བརྒྱ་",
    "སྟོང་",
    "ཁྲི་",
    "ས་ཡ་",
    "བྱེ་བ་",
    "དུང་ཕྱུར་",
    "ཐེར་འབུམ་",
    "ཐེར་འབུམ་ཆེན་པོ་",
    "ཁྲག་ཁྲིག་",
    "ཁྲག་ཁྲིག་ཆེན་པོ་",
]


def like_num(text):
    """
    Check if text resembles a number
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
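A quick illustration of the new Tibetan like_num behaviour (values taken from the test file added later in this diff):

    from spacy.lang.bo.lex_attrs import like_num

    assert like_num("10")
    assert like_num("བཅུ་གཅིག་")   # "eleven", listed in _num_words
    assert not like_num("ཁྱི་")     # "dog" is not number-like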
spacy/lang/bo/stop_words.py (new file, 198 lines)
@@ -0,0 +1,198 @@
# Source: https://zenodo.org/records/10148636

STOP_WORDS = set(
    """
འི་
།
དུ་
གིས་
སོགས་
ཏེ
གི་
རྣམས་
ནི
ཀུན་
ཡི་
འདི
ཀྱི་
སྙེད་
པས་
གཞན་
ཀྱིས་
ཡི
ལ
ནི་
དང་
སོགས
ཅིང་
ར
དུ
མི་
སུ་
བཅས་
ཡོངས་
ལས
ཙམ་
གྱིས་
དེ་
ཡང་
མཐའ་དག་
ཏུ་
ཉིད་
ས
ཏེ་
གྱི་
སྤྱི
དེ
ཀ་
ཡིན་
ཞིང་
འདི་
རུང་
རང་
ཞིག་
སྟེ
སྟེ་
ན་རེ
ངམ
ཤིང་
དག་
ཏོ
རེ་
འང་
ཀྱང་
ལགས་པ
ཚུ
དོ
ཡིན་པ
རེ
ན་རེ་
ཨེ་
ཚང་མ
ཐམས་ཅད་
དམ་
འོ་
ཅིག་
གྱིན་
ཡིན
ན
ཁོ་ན་
འམ་
ཀྱིན་
ལོ
ཀྱིས
བས་
ལགས་
ཤིག
གིས
ཀི་
སྣ་ཚོགས་
རྣམས
སྙེད་པ
ཡིས་
གྱི
གི
བམ་
ཤིག་
རེ་རེ་
ནམ
མིན་
ནམ་
ངམ་
རུ་
འགའ་
ཀུན
ཤས་
ཏུ
ཡིས
གིན་
གམ་
འོ
ཡིན་པ་
མིན
ལགས
གྱིས
ཅང་
འགའ
སམ་
ཞིག
འང
ལས་ཆེ་
འཕྲལ་
བར་
རུ
དང
ཡ
འག
སམ
ཀ
ཅུང་ཟད་
ཅིག
ཉིད
དུ་མ
མ
ཡིན་བ
འམ
མམ
དམ
དག
ཁོ་ན
ཀྱི
ལམ
ཕྱི་
ནང་
ཙམ
ནོ་
སོ་
རམ་
བོ་
ཨང་
ཕྱི
ཏོ་
ཚོ
ལ་ལ་
ཚོ་
ཅིང
མ་གི་
གེ
གོ
ཡིན་ལུགས་
རོ་
བོ
ལགས་པ་
པས
རབ་
འི
རམ
བས
གཞན
སྙེད་པ་
འབའ་
མཾ་
པོ
ག་
ག
གམ
སྤྱི་
བམ
མོ་
ཙམ་པ་
ཤ་སྟག་
མམ་
རེ་རེ
སྙེད
ཏམ་
ངོ
གྲང་
ཏ་རེ
ཏམ
ཁ་
ངེ་
ཅོག་
རིལ་
ཉུང་ཤས་
གིང་
ཚ་
ཀྱང
""".split()
)
spacy/lang/gd/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from typing import Optional

from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


class ScottishDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS


class Scottish(Language):
    lang = "gd"
    Defaults = ScottishDefaults


__all__ = ["Scottish"]
spacy/lang/gd/stop_words.py (new file, 388 lines)
@@ -0,0 +1,388 @@
STOP_WORDS = set(
    """
'ad
'ar
'd # iad
'g # ag
'ga
'gam
'gan
'gar
'gur
'm # am
'n # an
'n seo
'na
'nad
'nam
'nan
'nar
'nuair
'nur
's
'sa
'san
'sann
'se
'sna
a
a'
a'd # agad
a'm # agam
a-chèile
a-seo
a-sin
a-siud
a chionn
a chionn 's
a chèile
a chéile
a dh'
a h-uile
a seo
ac' # aca
aca
aca-san
acasan
ach
ag
agad
agad-sa
agads'
agadsa
agaibh
agaibhse
againn
againne
agam
agam-sa
agams'
agamsa
agus
aice
aice-se
aicese
aig
aig' # aige
aige
aige-san
aigesan
air
air-san
air neo
airsan
am
an
an seo
an sin
an siud
an uair
ann
ann a
ann a'
ann a shin
ann am
ann an
annad
annam
annam-s'
annamsa
anns
anns an
annta
aon
ar
as
asad
asda
asta
b'
bho
bhon
bhuaidhe # bhuaithe
bhuainn
bhuaipe
bhuaithe
bhuapa
bhur
brì
bu
c'à
car son
carson
cha
chan
chionn
choir
chon
chun
chèile
chéile
chòir
cia mheud
ciamar
co-dhiubh
cuide
cuin
cuin'
cuine
cà
cà'
càil
càit
càit'
càite
cò
cò mheud
có
d'
da
de
dh'
dha
dhaibh
dhaibh-san
dhaibhsan
dhan
dhasan
dhe
dhen
dheth
dhi
dhiom
dhiot
dhith
dhiubh
dhomh
dhomh-s'
dhomhsa
dhu'sa # dhut-sa
dhuibh
dhuibhse
dhuinn
dhuinne
dhuit
dhut
dhutsa
dhut-sa
dhà
dhà-san
dhàsan
dhòmhsa
diubh
do
docha
don
dà
dè
dè mar
dé
dé mar
dòch'
dòcha
e
eadar
eatarra
eatorra
eile
esan
fa
far
feud
fhad
fheudar
fhearr
fhein
fheudar
fheàrr
fhèin
fhéin
fhìn
fo
fodha
fodhainn
foipe
fon
fèin
ga
gach
gam
gan
ge brith
ged
gu
gu dè
gu ruige
gun
gur
gus
i
iad
iadsan
innte
is
ise
le
leam
leam-sa
leamsa
leat
leat-sa
leatha
leatsa
leibh
leis
leis-san
leoth'
leotha
leotha-san
linn
m'
m'a
ma
mac
man
mar
mas
mathaid
mi
mis'
mise
mo
mu
mu 'n
mun
mur
mura
mus
na
na b'
na bu
na iad
nach
nad
nam
nan
nar
nas
neo
no
nuair
o
o'n
oir
oirbh
oirbh-se
oirnn
oirnne
oirre
on
orm
orm-sa
ormsa
orra
orra-san
orrasan
ort
os
r'
ri
ribh
rinn
ris
rithe
rithe-se
rium
rium-sa
riums'
riumsa
riut
riuth'
riutha
riuthasan
ro
ro'n
roimh
roimhe
romhainn
romham
romhpa
ron
ruibh
ruinn
ruinne
sa
san
sann
se
seach
seo
seothach
shin
sibh
sibh-se
sibhse
sin
sineach
sinn
sinne
siod
siodach
siud
siudach
sna # ann an
sè
t'
tarsaing
tarsainn
tarsuinn
thar
thoigh
thro
thu
thuc'
thuca
thugad
thugaibh
thugainn
thugam
thugamsa
thuice
thuige
thus'
thusa
timcheall
toigh
toil
tro
tro' # troimh
troimh
troimhe
tron
tu
tusa
uair
ud
ugaibh
ugam-s'
ugam-sa
uice
uige
uige-san
umad
unnta # ann an
ur
urrainn
à
às
àsan
á
ás
è
ì
ò
ó
""".split(
        "\n"
    )
)
spacy/lang/gd/tokenizer_exceptions.py (new file, 1983 lines)
(File diff suppressed because it is too large.)
spacy/lang/kmr/__init__.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class KurmanjiDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS


class Kurmanji(Language):
    lang = "kmr"
    Defaults = KurmanjiDefaults


__all__ = ["Kurmanji"]
spacy/lang/kmr/examples.py (new file, 17 lines)
@@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.kmr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""

sentences = [
    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
]
spacy/lang/kmr/lex_attrs.py (new file, 138 lines)
@@ -0,0 +1,138 @@
from ...attrs import LIKE_NUM

_num_words = [
    "sifir",
    "yek",
    "du",
    "sê",
    "çar",
    "pênc",
    "şeş",
    "heft",
    "heşt",
    "neh",
    "deh",
    "yazde",
    "dazde",
    "sêzde",
    "çarde",
    "pazde",
    "şazde",
    "hevde",
    "hejde",
    "nozde",
    "bîst",
    "sî",
    "çil",
    "pêncî",
    "şêst",
    "heftê",
    "heştê",
    "nod",
    "sed",
    "hezar",
    "milyon",
    "milyar",
]

_ordinal_words = [
    "yekem",
    "yekemîn",
    "duyem",
    "duyemîn",
    "sêyem",
    "sêyemîn",
    "çarem",
    "çaremîn",
    "pêncem",
    "pêncemîn",
    "şeşem",
    "şeşemîn",
    "heftem",
    "heftemîn",
    "heştem",
    "heştemîn",
    "nehem",
    "nehemîn",
    "dehem",
    "dehemîn",
    "yazdehem",
    "yazdehemîn",
    "dazdehem",
    "dazdehemîn",
    "sêzdehem",
    "sêzdehemîn",
    "çardehem",
    "çardehemîn",
    "pazdehem",
    "pazdehemîn",
    "şanzdehem",
    "şanzdehemîn",
    "hevdehem",
    "hevdehemîn",
    "hejdehem",
    "hejdehemîn",
    "nozdehem",
    "nozdehemîn",
    "bîstem",
    "bîstemîn",
    "sîyem",
    "sîyemîn",
    "çilem",
    "çilemîn",
    "pêncîyem",
    "pênciyemîn",
    "şêstem",
    "şêstemîn",
    "heftêyem",
    "heftêyemîn",
    "heştêyem",
    "heştêyemîn",
    "notem",
    "notemîn",
    "sedem",
    "sedemîn",
    "hezarem",
    "hezaremîn",
    "milyonem",
    "milyonemîn",
    "milyarem",
    "milyaremîn",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True

    # Check ordinal number
    if text_lower in _ordinal_words:
        return True

    if is_digit(text_lower):
        return True

    return False


def is_digit(text):
    endings = ("em", "yem", "emîn", "yemîn")
    for ending in endings:
        to = len(ending)
        if text.endswith(ending) and text[:-to].isdigit():
            return True

    return False


LEX_ATTRS = {LIKE_NUM: like_num}
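The Kurmanji like_num also recognises digit-plus-suffix ordinals via is_digit (examples drawn from the test file later in this diff):

    from spacy.lang.kmr.lex_attrs import like_num

    assert like_num("duyemîn")     # ordinal word from _ordinal_words
    assert like_num("100em")       # digits plus ordinal ending, handled by is_digit
    assert not like_num("pirtûk")  # ordinary noun, not number-like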
spacy/lang/kmr/stop_words.py (new file, 44 lines)
@@ -0,0 +1,44 @@
STOP_WORDS = set(
    """
û
li
bi
di
da
de
ji
ku
ew
ez
tu
em
hûn
ew
ev
min
te
wî
wê
me
we
wan
vê
vî
va
çi
kî
kê
çawa
çima
kengî
li ku
çend
çiqas
her
hin
gelek
hemû
kes
tişt
""".split()
)
spacy/lang/mk/__init__.py
@@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
 
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return MacedonianLemmatizer(lookups)
-
 
 class Macedonian(Language):
     lang = "mk"
     Defaults = MacedonianDefaults
spacy/language.py
@@ -5,7 +5,7 @@ import multiprocessing as mp
 import random
 import traceback
 import warnings
-from contextlib import contextmanager
+from contextlib import ExitStack, contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
 from itertools import chain, cycle
@@ -31,6 +31,7 @@ from typing import (
 )
 
 import srsly
+from cymem.cymem import Pool
 from thinc.api import Config, CupyOps, Optimizer, get_current_ops
 
 from . import about, ty, util
spacy/language.py
@@ -2091,6 +2092,38 @@ class Language:
         util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
         tok2vec.remove_listener(listener, pipe_name)
 
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+
+        Example
+        -------
+        >>> with nlp.memory_zone():
+        ...     for doc in nlp.pipe(texts):
+        ...         process_my_doc(doc)
+        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
+            if hasattr(self.tokenizer, "memory_zone"):
+                contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
+            for _, pipe in self.pipeline:
+                if hasattr(pipe, "memory_zone"):
+                    contexts.append(stack.enter_context(pipe.memory_zone(mem)))
+            yield mem
+
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
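A runnable sketch of the new pipeline-level API, following the docstring's own example (assuming a blank English pipeline; any pipeline works):

    import spacy

    nlp = spacy.blank("en")
    texts = ["First text to process.", "Second text."]
    with nlp.memory_zone():
        for doc in nlp.pipe(texts):
            print(doc[0].text)  # Docs are valid inside the zone
    # Strings and lexemes interned while the zone was open are freed here;
    # keeping a Doc from inside the zone and using it now would be invalid.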
spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -203,7 +203,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
+        labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels]
         sent_starts = _get_aligned_sent_starts(example)
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
spacy/pipeline/_parser_internals/nonproj.pyx
@@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc):
             new_label, head_label = label.split(DELIMITER)
             new_head = _find_new_head(doc[i], head_label)
             doc.c[i].head = new_head.i - i
-            doc.c[i].dep = doc.vocab.strings.add(new_label)
+            doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False)
     set_children_from_heads(doc.c, 0, doc.length)
     return doc
spacy/strings.pxd
@@ -25,5 +25,7 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
+    cdef vector[hash_t] _transient_keys
+    cdef Pool _non_temp_mem
spacy/strings.pyx
@@ -1,9 +1,14 @@
 # cython: infer_types=True
 # cython: profile=False
 cimport cython
 
+from contextlib import contextmanager
+from typing import Iterator, List, Optional
+
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash32, hash64
+from preshed.maps cimport map_clear
 
 import srsly
spacy/strings.pyx
@@ -31,7 +36,7 @@ def get_string_id(key):
     This function optimises for convenience over performance, so shouldn't be
     used in tight loops.
     """
-    cdef hash_t str_hash
+    cdef hash_t str_hash
     if isinstance(key, str):
         if len(key) == 0:
             return 0
@@ -45,8 +50,8 @@ def get_string_id(key):
     elif _try_coerce_to_hash(key, &str_hash):
         # Coerce the integral key to the expected primitive hash type.
         # This ensures that custom/overloaded "primitive" data types
-        # such as those implemented by numpy are not inadvertently used
-        # downsteam (as these are internally implemented as custom PyObjects
+        # such as those implemented by numpy are not inadvertently used
+        # downstream (as these are internally implemented as custom PyObjects
         # whose comparison operators can incur a significant overhead).
         return str_hash
     else:
spacy/strings.pyx
@@ -119,10 +124,11 @@ cdef class StringStore:
         strings (iterable): A sequence of unicode strings to add to the store.
         """
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
         if strings is not None:
             for string in strings:
-                self.add(string)
+                self.add(string, allow_transient=False)
 
     def __getitem__(self, object string_or_id):
         """Retrieve a string from a given hash, or vice versa.
spacy/strings.pyx
@@ -152,14 +158,17 @@ cdef class StringStore:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
+                if utf8str is NULL:
+                    raise KeyError(Errors.E018.format(hash_value=string_or_id))
+                else:
+                    return decode_Utf8Str(utf8str)
         else:
             # TODO: Raise an error instead
             utf8str = <Utf8Str*>self._map.get(string_or_id)
-
-        if utf8str is NULL:
-            raise KeyError(Errors.E018.format(hash_value=string_or_id))
-        else:
-            return decode_Utf8Str(utf8str)
+            if utf8str is NULL:
+                raise KeyError(Errors.E018.format(hash_value=string_or_id))
+            else:
+                return decode_Utf8Str(utf8str)
 
     def as_int(self, key):
         """If key is an int, return it; otherwise, get the int value."""
spacy/strings.pyx
@@ -175,12 +184,46 @@ cdef class StringStore:
         else:
             return self[key]
 
-    def add(self, string):
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self.keys.size() + self._transient_keys.size()
+
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        self.mem = mem
+        yield mem
+        for key in self._transient_keys:
+            map_clear(self._map.c_map, key)
+        self._transient_keys.clear()
+        self.mem = self._non_temp_mem
+
+    def add(self, string: str, allow_transient: Optional[bool] = None) -> int:
         """Add a string to the StringStore.
 
         string (str): The string to add.
+        allow_transient (bool): Allow the string to be stored in the 'transient'
+            map, which will be flushed at the end of the memory zone. Strings
+            encountered during arbitrary text processing should be added
+            with allow_transient=True, while labels and other strings used
+            internally should not.
         RETURNS (uint64): The string's hash value.
         """
+        if allow_transient is None:
+            allow_transient = self.mem is not self._non_temp_mem
         cdef hash_t str_hash
         if isinstance(string, str):
             if string in SYMBOLS_BY_STR:
spacy/strings.pyx
@@ -188,22 +231,26 @@ cdef class StringStore:
 
             string = string.encode("utf8")
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         elif isinstance(string, bytes):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         else:
             raise TypeError(Errors.E017.format(value_type=type(string)))
         return str_hash
 
-    def __len__(self):
-        """The number of strings in the store.
+            if string in SYMBOLS_BY_STR:
+                return SYMBOLS_BY_STR[string]
+            else:
+                return self._intern_str(string, allow_transient)
 
-        RETURNS (int): The number of strings in the store.
-        """
-        return self.keys.size()
+        return self.keys.size() + self._transient_keys.size()
 
     def __contains__(self, string_or_id not None):
         """Check whether a string or ID is in the store.
spacy/strings.pyx
@@ -222,12 +269,17 @@ cdef class StringStore:
             pass
         else:
             # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
-
+            if self._map.get(string_or_id) is not NULL:
+                return True
+            else:
+                return False
         if str_hash < len(SYMBOLS_BY_INT):
             return True
         else:
-            return self._map.get(str_hash) is not NULL
+            if self._map.get(str_hash) is not NULL:
+                return True
+            else:
+                return False
 
     def __iter__(self):
         """Iterate over the strings in the store, in order.
spacy/strings.pyx
@@ -240,12 +292,29 @@ cdef class StringStore:
             key = self.keys[i]
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
+        for i in range(self._transient_keys.size()):
+            key = self._transient_keys[i]
+            utf8str = <Utf8Str*>self._map.get(key)
+            yield decode_Utf8Str(utf8str)
 
     def __reduce__(self):
         strings = list(self)
         return (StringStore, (strings,), None, None, None)
 
+    def values(self) -> List[int]:
+        """Iterate over the stored strings hashes in insertion order.
+
+        RETURNS: A list of string hashes.
+        """
+        cdef int i
+        hashes = [None] * self.keys.size()
+        for i in range(self.keys.size()):
+            hashes[i] = self.keys[i]
+        transient_hashes = [None] * self._transient_keys.size()
+        for i in range(self._transient_keys.size()):
+            transient_hashes[i] = self._transient_keys[i]
+        return hashes + transient_hashes
+
     def to_disk(self, path):
         """Save the current state to a directory.
spacy/strings.pyx
@@ -269,7 +338,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def to_bytes(self, **kwargs):
spacy/strings.pyx
@@ -289,23 +358,25 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def _reset_and_load(self, strings):
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
         self.keys.clear()
+        self._transient_keys.clear()
         for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)
 
-    cdef const Utf8Str* intern_unicode(self, str py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+        return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
 
     @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
         cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
spacy/strings.pyx
@@ -314,5 +385,8 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
-        self.keys.push_back(key)
+        if allow_transient and self.mem is not self._non_temp_mem:
+            self._transient_keys.push_back(key)
+        else:
+            self.keys.push_back(key)
         return value
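Taken together, the StringStore changes make interning reversible inside a zone. A short usage sketch (behaviour as implied by this diff):

    from spacy.strings import StringStore

    store = StringStore()
    store.add("label", allow_transient=False)  # permanent: goes into keys
    with store.memory_zone():
        h = store.add("some-token")            # transient by default inside a zone
        assert store[h] == "some-token"
    # leaving the zone flushed the transient key; the permanent one remains
    assert "label" in store
    assert "some-token" not in store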
spacy/tests/conftest.py
@@ -81,6 +81,11 @@ def bn_tokenizer():
     return get_lang_class("bn")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def bo_tokenizer():
+    return get_lang_class("bo")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def ca_tokenizer():
     return get_lang_class("ca")().tokenizer
spacy/tests/lang/bo/__init__.py (new empty file)
spacy/tests/lang/bo/test_text.py (new file, 21 lines)
@@ -0,0 +1,21 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("999.0", True),
        ("གཅིག་", True),
        ("གཉིས་", True),
        ("ཀླད་ཀོར་", True),
        ("བཅུ་གཅིག་", True),
        ("ཁྱི་", False),
        (",", False),
    ],
)
def test_lex_attrs_like_number(bo_tokenizer, text, match):
    tokens = bo_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match
spacy/tests/lang/kmr/__init__.py (new empty file)
spacy/tests/lang/kmr/test_text.py (new file, 27 lines)
@@ -0,0 +1,27 @@
import pytest

from spacy.lang.kmr.lex_attrs import like_num


@pytest.mark.parametrize(
    "word",
    [
        "yekem",
        "duyemîn",
        "100em",
        "dehem",
        "sedemîn",
        "34em",
        "30yem",
        "20emîn",
        "50yemîn",
    ],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
    assert like_num(word)


@pytest.mark.parametrize("word", ["deh"])
def test_kmr_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())
spacy/tests/lang/test_initialize.py
@@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
              "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
 # fmt: on
@@ -18,6 +18,7 @@ LANGUAGES = [
     pytest.param("ar", marks=pytest.mark.slow()),
     pytest.param("bg", marks=pytest.mark.slow()),
     "bn",
+    pytest.param("bo", marks=pytest.mark.slow()),
     pytest.param("ca", marks=pytest.mark.slow()),
     pytest.param("cs", marks=pytest.mark.slow()),
     pytest.param("da", marks=pytest.mark.slow()),
@@ -57,6 +58,7 @@ LANGUAGES = [
     pytest.param("tr", marks=pytest.mark.slow()),
     pytest.param("tt", marks=pytest.mark.slow()),
     pytest.param("ur", marks=pytest.mark.slow()),
+    pytest.param("kmr", marks=pytest.mark.slow()),
 ]
spacy/tests/vocab_vectors/test_memory_zone.py (new file, 36 lines)
@@ -0,0 +1,36 @@
from spacy.vocab import Vocab


def test_memory_zone_no_insertion():
    vocab = Vocab()
    with vocab.memory_zone():
        pass
    lex = vocab["horse"]
    assert lex.text == "horse"


def test_memory_zone_insertion():
    vocab = Vocab()
    _ = vocab["dog"]
    assert "dog" in vocab
    assert "horse" not in vocab
    with vocab.memory_zone():
        lex = vocab["horse"]
        assert lex.text == "horse"
    assert "dog" in vocab
    assert "horse" not in vocab


def test_memory_zone_redundant_insertion():
    """Test that if we insert an already-existing word while
    in the memory zone, it stays persistent"""
    vocab = Vocab()
    _ = vocab["dog"]
    assert "dog" in vocab
    assert "horse" not in vocab
    with vocab.memory_zone():
        lex = vocab["horse"]
        assert lex.text == "horse"
        _ = vocab["dog"]
    assert "dog" in vocab
    assert "horse" not in vocab
spacy/tokenizer.pxd
@@ -25,9 +25,7 @@ cdef class Tokenizer:
     cdef PhraseMatcher _special_matcher
     # TODO convert to bool in v4
     cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef public int max_cache_size
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
spacy/tokenizer.pyx
@@ -30,7 +30,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -50,6 +50,7 @@ cdef class Tokenizer:
         faster_heuristics (bool): Whether to restrict the final
             Matcher-based pass for rules to those containing affixes or space.
             Defaults to True.
+        max_cache_size (int): Maximum number of tokenization chunks to cache.
 
         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)
@@ -69,6 +70,7 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size
 
     @property
     def token_match(self):
spacy/tokenizer.pyx
@@ -397,8 +399,9 @@ cdef class Tokenizer:
                                       has_special, with_special_cases)
             self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                                 with_special_cases)
-            self._save_cached(&tokens.c[orig_size], orig_key, has_special,
-                              tokens.length - orig_size)
+            if len(self._cache) < self.max_cache_size:
+                self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                                  tokens.length - orig_size)
 
     cdef str _split_affixes(
         self,
@@ -514,9 +517,8 @@ cdef class Tokenizer:
         if n <= 0:
             # avoid mem alloc of zero length
             return 0
-        for i in range(n):
-            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
-                return 0
+        if self.vocab.in_memory_zone:
+            return 0
         # See #1250
         if has_special[0]:
             return 0
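The cache cap is exposed on the constructor, so a service can bound tokenizer memory directly. A short sketch (the 10000 default comes from the new signature above):

    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    tokenizer = Tokenizer(nlp.vocab, max_cache_size=1000)
    doc = tokenizer("The tokenizer stops caching once the cap is reached.")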
spacy/vocab.pxd
@@ -41,7 +41,9 @@ cdef class Vocab:
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
+    cdef Pool _non_temp_mem
+    cdef vector[attr_t] _transient_orths
spacy/vocab.pyi
@@ -1,6 +1,8 @@
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
 
+from cymem.cymem import Pool
 from thinc.types import Floats1d, FloatsXd
 
 from . import Language
@@ -67,6 +69,8 @@ class Vocab:
     def from_bytes(
         self, bytes_data: bytes, *, exclude: Iterable[str] = ...
     ) -> Vocab: ...
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
 
 def pickle_vocab(vocab: Vocab) -> Any: ...
 def unpickle_vocab(
spacy/vocab.pyx
@@ -1,8 +1,11 @@
 import functools
+from contextlib import ExitStack, contextmanager
+from typing import Iterator, Optional
 
 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
+from preshed.maps cimport map_clear
 
 from .attrs cimport LANG, ORTH
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
spacy/vocab.pyx
@@ -87,6 +90,12 @@ cdef class Vocab:
         self.lookups = lookups
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
+        # During a memory_zone we replace our mem object with one
+        # that's passed to us. We keep a reference to our non-temporary
+        # memory here, in case we need to make an allocation we want to
+        # guarantee is not temporary. This is also how we check whether
+        # we're in a memory zone: we check whether self.mem is self._non_temp_mem
+        self._non_temp_mem = self.mem
 
     @property
     def vectors(self):
spacy/vocab.pyx
@@ -96,7 +105,7 @@ cdef class Vocab:
     def vectors(self, vectors):
         if hasattr(vectors, "strings"):
             for s in vectors.strings:
-                self.strings.add(s)
+                self.strings.add(s, allow_transient=False)
         self._vectors = vectors
         self._vectors.strings = self.strings
spacy/vocab.pyx
@@ -107,6 +116,10 @@ cdef class Vocab:
         langfunc = self.lex_attr_getters.get(LANG, None)
         return langfunc("_") if langfunc else ""
 
+    @property
+    def in_memory_zone(self) -> bool:
+        return self.mem is not self._non_temp_mem
+
     def __len__(self):
         """The current number of lexemes stored.
spacy/vocab.pyx
@@ -114,6 +127,33 @@ cdef class Vocab:
         """
         return self.length
 
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.strings.memory_zone(mem))]
+            if hasattr(self.morphology, "memory_zone"):
+                contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
+            if hasattr(self._vectors, "memory_zone"):
+                contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
+            self.mem = mem
+            yield mem
+            self._clear_transient_orths()
+            self.mem = self._non_temp_mem
+
     def add_flag(self, flag_getter, int flag_id=-1):
         """Set a new boolean flag to words in the vocabulary.
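The tests added earlier in this diff pin down the Vocab-level contract; in short:

    from spacy.vocab import Vocab

    vocab = Vocab()
    with vocab.memory_zone():
        lex = vocab["horse"]        # allocated from the zone's pool
        assert lex.text == "horse"  # valid while the zone is open
    assert "horse" not in vocab     # transient lexeme flushed on exit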
spacy/vocab.pyx
@@ -148,8 +188,7 @@ cdef class Vocab:
 
     cdef const LexemeC* get(self, Pool mem, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary.
         """
         if string == "":
             return &EMPTY_LEXEME
spacy/vocab.pyx
@@ -180,19 +219,11 @@ cdef class Vocab:
         return self._new_lexeme(mem, self.strings[orth])
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
+        # The mem argument is deprecated, replaced by memory zones. Same with
+        # this size heuristic.
         mem = self.mem
         # if len(string) < 3 or self.length < 10000:
         #     mem = self.mem
         cdef bint is_oov = mem is not self.mem
         lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
-        lex.orth = self.strings.add(string)
+        lex.orth = self.strings.add(string, allow_transient=True)
         lex.length = len(string)
         if self.vectors is not None and hasattr(self.vectors, "key2row"):
             lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
spacy/vocab.pyx
@@ -202,18 +233,25 @@ cdef class Vocab:
         for attr, func in self.lex_attr_getters.items():
             value = func(string)
             if isinstance(value, str):
-                value = self.strings.add(value)
+                value = self.strings.add(value, allow_transient=True)
             if value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
         if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+            self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
 
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
         self._by_orth.set(lex.orth, <void*>lex)
         self.length += 1
+        if is_transient and self.in_memory_zone:
+            self._transient_orths.push_back(lex.orth)
+
+    def _clear_transient_orths(self):
+        """Remove transient lexemes from the index (generally at the end of the memory zone)"""
+        for orth in self._transient_orths:
+            map_clear(self._by_orth.c_map, orth)
+        self._transient_orths.clear()
 
     def __contains__(self, key):
         """Check whether the string or int key has an entry in the vocabulary.
spacy/vocab.pyx
@@ -265,7 +303,7 @@ cdef class Vocab:
         """
         cdef attr_t orth
         if isinstance(id_or_string, str):
-            orth = self.strings.add(id_or_string)
+            orth = self.strings.add(id_or_string, allow_transient=True)
         else:
            orth = id_or_string
         return Lexeme(self, orth)
spacy/vocab.pyx
@@ -417,7 +455,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#get_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.has_vector(key):
spacy/vocab.pyx
@@ -436,7 +474,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#set_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=False)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.vectors.is_full and key not in self.vectors:
spacy/vocab.pyx
@@ -460,7 +498,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#has_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         return key in self.vectors
website/meta/languages.json
@@ -31,6 +31,12 @@
             "name": "Bengali",
             "has_examples": true
         },
+        {
+            "code": "bo",
+            "name": "Tibetan",
+            "example": "འདི་ཚིག་གྲུབ་རེད།",
+            "has_examples": true
+        },
         {
             "code": "ca",
             "name": "Catalan",
@@ -480,6 +486,12 @@
             ],
             "example": "这是一个用于示例的句子。",
             "has_examples": true
         },
+        {
+            "code": "kmr",
+            "name": "Kurdish Kurmanji",
+            "example": "Ev hevokek e",
+            "has_examples": true
+        }
     ],
     "licenses": [
(File diff suppressed because it is too large.)
@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
     }
 
     const navAlert = (
-        <Link to="https://form.typeform.com/to/WlflqP1b" noLinkLayout>
-            💥 Interested in <strong>Premium spaCy Models</strong>?
+        <Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
+            💥 <strong>New:</strong> Case study with S&P Global
         </Link>
     )