Merge branch 'master' into pr/13515

2025-07-14 18:22:27 +03:00 · 2024-09-10 14:21:11 +02:00 · 2024-09-10 14:21:11 +02:00 · 5cbe621cde
commit 5cbe621cde
parent 52822ae280 0190e669c5
41 changed files with 4507 additions and 369 deletions
--- a/.github/workflows/cibuildwheel.yml
+++ b/.github/workflows/cibuildwheel.yml
@ -0,0 +1,92 @@
 name: Build
 on:
  push:
    tags:
      # GitHub filter patterns are glob-like rather than true regex:
      # '+' repeats the preceding character class, and
      # '**' matches 'zero or more of any character'.
      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
 jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # macos-13 is an intel runner, macos-14 is apple silicon
        os: [ubuntu-latest, windows-latest, macos-13]
    steps:
      - uses: actions/checkout@v4
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.19.1
        env:
          CIBW_SOME_OPTION: value
        with:
          package-dir: .
          output-dir: wheelhouse
          config-file: "{package}/pyproject.toml"
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl
  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build sdist
        run: pipx run build --sdist
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-sdist
          path: dist/*.tar.gz
  create_release:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      checks: write
      actions: read
      issues: read
      packages: write
      pull-requests: read
      repository-projects: read
      statuses: read
    steps:
      - name: Get the tag name and determine if it's a prerelease
        id: get_tag_info
        run: |
          # Strip the ref prefix to get the pushed tag, e.g. release-v1.2.3
          FULL_TAG="${GITHUB_REF#refs/tags/}"
          if [[ "$FULL_TAG" == release-* ]]; then
            TAG_NAME="${FULL_TAG#release-}"
            IS_PRERELEASE=false
          elif [[ "$FULL_TAG" == prerelease-* ]]; then
            TAG_NAME="${FULL_TAG#prerelease-}"
            IS_PRERELEASE=true
          else
            echo "Tag does not match expected patterns" >&2
            exit 1
          fi
          # Export the values for later steps, which read them via the env context.
          # FULL_TAG keeps the release-/prerelease- prefix; TAG_NAME has it removed.
          echo "FULL_TAG=$FULL_TAG" >> "$GITHUB_ENV"
          echo "TAG_NAME=$TAG_NAME" >> "$GITHUB_ENV"
          echo "IS_PRERELEASE=$IS_PRERELEASE" >> "$GITHUB_ENV"
      - uses: actions/download-artifact@v4
        with:
          # unpacks all CIBW artifacts into dist/
          pattern: cibw-*
          path: dist
          merge-multiple: true
      - name: Create Draft Release
        id: create_release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: ${{ env.TAG_NAME }}
          draft: true
          prerelease: ${{ env.IS_PRERELEASE }}
          files: "./dist/*" 
--- a/.github/workflows/gputests.yml.disabled
+++ b/.github/workflows/gputests.yml.disabled
--- a/.github/workflows/publish_pypi.yml
+++ b/.github/workflows/publish_pypi.yml
@ -0,0 +1,29 @@
 # The cibuildwheel workflow triggers when a release tag is pushed and
 # creates a draft release; this workflow triggers on publication.
 # The expected workflow is to create a draft release and let the wheels
 # upload, and then hit 'publish', which uploads to PyPI.
 on:
  release:
    types:
      - published
 jobs:
  upload_pypi:
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/spacy
    permissions:
      id-token: write
      contents: read
    if: github.event_name == 'release' && github.event.action == 'published'
    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: robinraju/release-downloader@v1
        with:
          tag: ${{ github.event.release.tag_name }}
          fileName: '*'
          out-file-path: 'dist'
      - uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/slowtests.yml.disabled
+++ b/.github/workflows/slowtests.yml.disabled
--- a/pyproject.toml
+++ b/pyproject.toml
@ -11,5 +11,58 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
 [tool.cibuildwheel]
 build = "*"
 skip = "pp* cp36* cp37* cp38* *-win32"
 test-skip = ""
 free-threaded-support = false
 archs = ["native"]
 build-frontend = "default"
 config-settings = {}
 dependency-versions = "pinned"
 environment = { PIP_CONSTRAINT = "build-constraints.txt" }
 environment-pass = []
 build-verbosity = 0
 before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
 before-build = "pip install -r requirements.txt && python setup.py clean"
 repair-wheel-command = ""
 test-command = ""
 before-test = ""
 test-requires = []
 test-extras = []
 container-engine = "docker"
 manylinux-x86_64-image = "manylinux2014"
 manylinux-i686-image = "manylinux2014"
 manylinux-aarch64-image = "manylinux2014"
 manylinux-ppc64le-image = "manylinux2014"
 manylinux-s390x-image = "manylinux2014"
 manylinux-pypy_x86_64-image = "manylinux2014"
 manylinux-pypy_i686-image = "manylinux2014"
 manylinux-pypy_aarch64-image = "manylinux2014"
 musllinux-x86_64-image = "musllinux_1_2"
 musllinux-i686-image = "musllinux_1_2"
 musllinux-aarch64-image = "musllinux_1_2"
 musllinux-ppc64le-image = "musllinux_1_2"
 musllinux-s390x-image = "musllinux_1_2"
 [tool.cibuildwheel.linux]
 repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
 [tool.cibuildwheel.macos]
 repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
 [tool.cibuildwheel.windows]
 [tool.cibuildwheel.pyodide]
 [tool.isort]
 profile = "black"
--- a/requirements.txt
+++ b/requirements.txt
@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
 typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
--- a/setup.cfg
+++ b/setup.cfg
@ -66,7 +66,6 @@ install_requires =
    # Official Python utilities
    setuptools
    packaging>=20.0
    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
    langcodes>=3.2.0,<4.0.0
 [options.entry_points]
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.5"
+__version__ = "3.8.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/lang/bo/init.py
+++ b/spacy/lang/bo/init.py
@ -0,0 +1,16 @@
 from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 class TibetanDefaults(BaseDefaults):
    """Defaults for the "bo" language: stop words and lexical attribute getters."""

    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
 class Tibetan(Language):
    """Language subclass registered under the code "bo", using TibetanDefaults."""

    Defaults = TibetanDefaults
    lang = "bo"
 __all__ = ["Tibetan"]
--- a/spacy/lang/bo/examples.py
+++ b/spacy/lang/bo/examples.py
@ -0,0 +1,16 @@
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.bo.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 sentences = [
    "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
    "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
    "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
    "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
    "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
    "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
 ]
--- a/spacy/lang/bo/lex_attrs.py
+++ b/spacy/lang/bo/lex_attrs.py
@ -0,0 +1,65 @@
 from ...attrs import LIKE_NUM
 # reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals
 _num_words = [
    "ཀླད་ཀོར་",
    "གཅིག་",
    "གཉིས་",
    "གསུམ་",
    "བཞི་",
    "ལྔ་",
    "དྲུག་",
    "བདུན་",
    "བརྒྱད་",
    "དགུ་",
    "བཅུ་",
    "བཅུ་གཅིག་",
    "བཅུ་གཉིས་",
    "བཅུ་གསུམ་",
    "བཅུ་བཞི་",
    "བཅུ་ལྔ་",
    "བཅུ་དྲུག་",
    "བཅུ་བདུན་",
    "བཅུ་པརྒྱད",
    "བཅུ་དགུ་",
    "ཉི་ཤུ་",
    "སུམ་ཅུ",
    "བཞི་བཅུ",
    "ལྔ་བཅུ",
    "དྲུག་ཅུ",
    "བདུན་ཅུ",
    "བརྒྱད་ཅུ",
    "དགུ་བཅུ",
    "བརྒྱ་",
    "སྟོང་",
    "ཁྲི་",
    "ས་ཡ་",
    "	བྱེ་བ་",
    "དུང་ཕྱུར་",
    "ཐེར་འབུམ་",
    "ཐེར་འབུམ་ཆེན་པོ་",
    "ཁྲག་ཁྲིག་",
    "ཁྲག་ཁྲིག་ཆེན་པོ་",
 ]
 def like_num(text):
    """Return True if `text` resembles a number.

    One leading sign character ("+", "-", "±", "~") is dropped and all ","
    and "." characters are removed. The remainder counts as a number if it
    consists of digits, is a "digits/digits" fraction, or is listed in
    `_num_words`.
    """
    unsigned = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    cleaned = unsigned.replace(",", "").replace(".", "")
    if cleaned.isdigit():
        return True
    # Exactly one "/" yields exactly two parts.
    parts = cleaned.split("/")
    if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
        return True
    return cleaned in _num_words
 LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/bo/stop_words.py
+++ b/spacy/lang/bo/stop_words.py
@ -0,0 +1,198 @@
 # Source: https://zenodo.org/records/10148636
 STOP_WORDS = set(
    """
 འི་
 །
 དུ་
 གིས་
 སོགས་
 ཏེ
 གི་
 རྣམས་
 ནི
 ཀུན་
 ཡི་
 འདི
 ཀྱི་
 སྙེད་
 པས་
 གཞན་
 ཀྱིས་
 ཡི
 ལ
 ནི་
 དང་
 སོགས
 ཅིང་
 ར
 དུ
 མི་
 སུ་
 བཅས་
 ཡོངས་
 ལས
 ཙམ་
 གྱིས་
 དེ་
 ཡང་
 མཐའ་དག་
 ཏུ་
 ཉིད་
 ས
 ཏེ་
 གྱི་
 སྤྱི
 དེ
 ཀ་
 ཡིན་
 ཞིང་
 འདི་
 རུང་
 རང་
 ཞིག་
 སྟེ
 སྟེ་
 ན་རེ
 ངམ
 ཤིང་
 དག་
 ཏོ
 རེ་
 འང་
 ཀྱང་
 ལགས་པ
 ཚུ
 དོ
 ཡིན་པ
 རེ
 ན་རེ་
 ཨེ་
 ཚང་མ
 ཐམས་ཅད་
 དམ་
 འོ་
 ཅིག་
 གྱིན་
 ཡིན
 ན
 ཁོ་ན་
 འམ་
 ཀྱིན་
 ལོ
 ཀྱིས
 བས་
 ལགས་
 ཤིག
 གིས
 ཀི་
 སྣ་ཚོགས་
 རྣམས
 སྙེད་པ
 ཡིས་
 གྱི
 གི
 བམ་
 ཤིག་
 རེ་རེ་
 ནམ
 མིན་
 ནམ་
 ངམ་
 རུ་
 འགའ་
 ཀུན
 ཤས་
 ཏུ
 ཡིས
 གིན་
 གམ་
 འོ
 ཡིན་པ་
 མིན
 ལགས
 གྱིས
 ཅང་
 འགའ
 སམ་
 ཞིག
 འང
 ལས་ཆེ་
 འཕྲལ་
 བར་
 རུ
 དང
 ཡ
 འག
 སམ
 ཀ
 ཅུང་ཟད་
 ཅིག
 ཉིད
 དུ་མ
 མ
 ཡིན་བ
 འམ
 མམ
 དམ
 དག
 ཁོ་ན
 ཀྱི
 ལམ
 ཕྱི་
 ནང་
 ཙམ
 ནོ་
 སོ་
 རམ་
 བོ་
 ཨང་
 ཕྱི
 ཏོ་
 ཚོ
 ལ་ལ་
 ཚོ་
 ཅིང
 མ་གི་
 གེ
 གོ
 ཡིན་ལུགས་
 རོ་
 བོ
 ལགས་པ་
 པས
 རབ་
 འི
 རམ
 བས
 གཞན
 སྙེད་པ་
 འབའ་
 མཾ་
 པོ
 ག་
 ག
 གམ
 སྤྱི་
 བམ
 མོ་
 ཙམ་པ་
 ཤ་སྟག་
 མམ་
 རེ་རེ
 སྙེད
 ཏམ་
 ངོ
 གྲང་
 ཏ་རེ
 ཏམ
 ཁ་
 ངེ་
 ཅོག་
 རིལ་
 ཉུང་ཤས་
 གིང་
 ཚ་
 ཀྱང
 """.split()
 )
--- a/spacy/lang/gd/init.py
+++ b/spacy/lang/gd/init.py
@ -0,0 +1,18 @@
 from typing import Optional
 from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class ScottishDefaults(BaseDefaults):
    """Defaults for the "gd" language: stop words and tokenizer exceptions."""

    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 class Scottish(Language):
    """Language subclass registered under the code "gd", using ScottishDefaults."""

    Defaults = ScottishDefaults
    lang = "gd"
 __all__ = ["Scottish"]
--- a/spacy/lang/gd/stop_words.py
+++ b/spacy/lang/gd/stop_words.py
@ -0,0 +1,388 @@
 # Raw stop-word list, one entry per line. Text after a "#" on a line is an
 # annotation (for example the full form of a contraction), not part of the
 # entry. Entries may consist of several words.
 _STOP_WORDS_RAW = """
 'ad
 'ar
 'd # iad
 'g # ag
 'ga
 'gam
 'gan
 'gar
 'gur
 'm # am
 'n # an
 'n seo
 'na
 'nad
 'nam
 'nan
 'nar
 'nuair
 'nur
 's
 'sa
 'san
 'sann
 'se
 'sna
 a
 a'
 a'd # agad
 a'm # agam
 a-chèile
 a-seo
 a-sin
 a-siud
 a chionn
 a chionn 's
 a chèile
 a chéile
 a dh'
 a h-uile
 a seo
 ac' # aca
 aca
 aca-san
 acasan
 ach
 ag
 agad
 agad-sa
 agads'
 agadsa
 agaibh
 agaibhse
 againn
 againne
 agam
 agam-sa
 agams'
 agamsa
 agus
 aice
 aice-se
 aicese
 aig
 aig' # aige
 aige
 aige-san
 aigesan
 air
 air-san
 air neo
 airsan
 am
 an
 an seo
 an sin
 an siud
 an uair
 ann
 ann a
 ann a'
 ann a shin
 ann am
 ann an
 annad
 annam
 annam-s'
 annamsa
 anns
 anns an
 annta
 aon
 ar
 as
 asad
 asda
 asta
 b'
 bho
 bhon
 bhuaidhe # bhuaithe
 bhuainn
 bhuaipe
 bhuaithe
 bhuapa
 bhur
 brì
 bu
 c'à
 car son
 carson
 cha
 chan
 chionn
 choir
 chon
 chun
 chèile
 chéile
 chòir
 cia mheud
 ciamar
 co-dhiubh
 cuide
 cuin
 cuin'
 cuine
 cà
 cà'
 càil
 càit
 càit'
 càite
 cò
 cò mheud
 có
 d'
 da
 de
 dh'
 dha
 dhaibh
 dhaibh-san
 dhaibhsan
 dhan
 dhasan
 dhe
 dhen
 dheth
 dhi
 dhiom
 dhiot
 dhith
 dhiubh
 dhomh
 dhomh-s'
 dhomhsa
 dhu'sa # dhut-sa
 dhuibh
 dhuibhse
 dhuinn
 dhuinne
 dhuit
 dhut
 dhutsa
 dhut-sa
 dhà
 dhà-san
 dhàsan
 dhòmhsa
 diubh
 do
 docha
 don
 dà
 dè
 dè mar
 dé
 dé mar
 dòch'
 dòcha
 e
 eadar
 eatarra
 eatorra
 eile
 esan
 fa
 far
 feud
 fhad
 fheudar
 fhearr
 fhein
 fheudar
 fheàrr
 fhèin
 fhéin
 fhìn
 fo
 fodha
 fodhainn
 foipe
 fon
 fèin
 ga
 gach
 gam
 gan
 ge brith
 ged
 gu
 gu dè
 gu ruige
 gun
 gur
 gus
 i
 iad
 iadsan
 innte
 is
 ise
 le
 leam
 leam-sa
 leamsa
 leat
 leat-sa
 leatha
 leatsa
 leibh
 leis
 leis-san
 leoth'
 leotha
 leotha-san
 linn
 m'
 m'a
 ma
 mac
 man
 mar
 mas
 mathaid
 mi
 mis'
 mise
 mo
 mu
 mu 'n
 mun
 mur
 mura
 mus
 na
 na b'
 na bu
 na iad
 nach
 nad
 nam
 nan
 nar
 nas
 neo
 no
 nuair
 o
 o'n
 oir
 oirbh
 oirbh-se
 oirnn
 oirnne
 oirre
 on
 orm
 orm-sa
 ormsa
 orra
 orra-san
 orrasan
 ort
 os
 r'
 ri
 ribh
 rinn
 ris
 rithe
 rithe-se
 rium
 rium-sa
 riums'
 riumsa
 riut
 riuth'
 riutha
 riuthasan
 ro
 ro'n
 roimh
 roimhe
 romhainn
 romham
 romhpa
 ron
 ruibh
 ruinn
 ruinne
 sa
 san
 sann
 se
 seach
 seo
 seothach
 shin
 sibh
 sibh-se
 sibhse
 sin
 sineach
 sinn
 sinne
 siod
 siodach
 siud
 siudach
 sna # ann an
 sè
 t'
 tarsaing
 tarsainn
 tarsuinn
 thar
 thoigh
 thro
 thu
 thuc'
 thuca
 thugad
 thugaibh
 thugainn
 thugam
 thugamsa
 thuice
 thuige
 thus'
 thusa
 timcheall
 toigh
 toil
 tro
 tro' # troimh
 troimh
 troimhe
 tron
 tu
 tusa
 uair
 ud
 ugaibh
 ugam-s'
 ugam-sa
 uice
 uige
 uige-san
 umad
 unnta # ann an
 ur
 urrainn
 à
 às
 àsan
 á
 ás
 è
 ì
 ò
 ó
 """
 # Keep only the bare entries: cut each line at the "#" annotation, trim the
 # surrounding whitespace and skip lines that end up empty. Previously the
 # annotations (e.g. "'d # iad") and the empty string ended up in the set.
 STOP_WORDS = set(
    filter(
        None,
        (line.partition("#")[0].strip() for line in _STOP_WORDS_RAW.split("\n")),
    )
 )
--- a/spacy/lang/gd/tokenizer_exceptions.py
+++ b/spacy/lang/gd/tokenizer_exceptions.py
--- a/spacy/lang/kmr/init.py
+++ b/spacy/lang/kmr/init.py
@ -0,0 +1,16 @@
 from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 class KurmanjiDefaults(BaseDefaults):
    """Defaults for the "kmr" language: lexical attribute getters and stop words."""

    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
 class Kurmanji(Language):
    """Language subclass registered under the code "kmr", using KurmanjiDefaults."""

    Defaults = KurmanjiDefaults
    lang = "kmr"
 __all__ = ["Kurmanji"]
--- a/spacy/lang/kmr/examples.py
+++ b/spacy/lang/kmr/examples.py
@ -0,0 +1,17 @@
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.kmr.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 sentences = [
    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
 ]
--- a/spacy/lang/kmr/lex_attrs.py
+++ b/spacy/lang/kmr/lex_attrs.py
@ -0,0 +1,138 @@
 from ...attrs import LIKE_NUM
 _num_words = [
    "sifir",
    "yek",
    "du",
    "sê",
    "çar",
    "pênc",
    "şeş",
    "heft",
    "heşt",
    "neh",
    "deh",
    "yazde",
    "dazde",
    "sêzde",
    "çarde",
    "pazde",
    "şazde",
    "hevde",
    "hejde",
    "nozde",
    "bîst",
    "sî",
    "çil",
    "pêncî",
    "şêst",
    "heftê",
    "heştê",
    "nod",
    "sed",
    "hezar",
    "milyon",
    "milyar",
 ]
 _ordinal_words = [
    "yekem",
    "yekemîn",
    "duyem",
    "duyemîn",
    "sêyem",
    "sêyemîn",
    "çarem",
    "çaremîn",
    "pêncem",
    "pêncemîn",
    "şeşem",
    "şeşemîn",
    "heftem",
    "heftemîn",
    "heştem",
    "heştemîn",
    "nehem",
    "nehemîn",
    "dehem",
    "dehemîn",
    "yazdehem",
    "yazdehemîn",
    "dazdehem",
    "dazdehemîn",
    "sêzdehem",
    "sêzdehemîn",
    "çardehem",
    "çardehemîn",
    "pazdehem",
    "pazdehemîn",
    "şanzdehem",
    "şanzdehemîn",
    "hevdehem",
    "hevdehemîn",
    "hejdehem",
    "hejdehemîn",
    "nozdehem",
    "nozdehemîn",
    "bîstem",
    "bîstemîn",
    "sîyem",
    "sîyemîn",
    "çilem",
    "çilemîn",
    "pêncîyem",
    "pênciyemîn",
    "şêstem",
    "şêstemîn",
    "heftêyem",
    "heftêyemîn",
    "heştêyem",
    "heştêyemîn",
    "notem",
    "notemîn",
    "sedem",
    "sedemîn",
    "hezarem",
    "hezaremîn",
    "milyonem",
    "milyonemîn",
    "milyarem",
    "milyaremîn",
 ]
 def like_num(text):
    """Return True if `text` resembles a cardinal or ordinal number.

    One leading sign character ("+", "-", "±", "~") is dropped and all ","
    and "." characters are removed. The remainder counts as a number if it
    consists of digits, is a "digits/digits" fraction, or, after lowercasing,
    is listed in `_num_words` or `_ordinal_words` or is accepted by
    `is_digit`.
    """
    candidate = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    candidate = candidate.replace(",", "").replace(".", "")
    if candidate.isdigit():
        return True
    # Exactly one "/" yields exactly two pieces.
    pieces = candidate.split("/")
    if len(pieces) == 2 and pieces[0].isdigit() and pieces[1].isdigit():
        return True
    lowered = candidate.lower()
    # Spelled-out cardinals, spelled-out ordinals, then digit+suffix ordinals.
    return (
        lowered in _num_words
        or lowered in _ordinal_words
        or is_digit(lowered)
    )
 def is_digit(text):
    """Return True if `text` is a digit string followed by an ordinal ending.

    The accepted endings are "em", "yem", "emîn" and "yemîn"; the part before
    the ending must be non-empty and consist only of digits.
    """
    return any(
        text.endswith(suffix) and text[: -len(suffix)].isdigit()
        for suffix in ("em", "yem", "emîn", "yemîn")
    )
 LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/kmr/stop_words.py
+++ b/spacy/lang/kmr/stop_words.py
@ -0,0 +1,44 @@
 STOP_WORDS = set(
    """
 û
 li
 bi
 di
 da
 de
 ji
 ku
 ew
 ez
 tu
 em
 hûn
 ew
 ev
 min
 te
 wî
 wê
 me
 we
 wan
 vê
 vî
 va
 çi
 kî
 kê
 çawa
 çima
 kengî
 li ku
 çend
 çiqas
 her
 hin
 gelek
 hemû
 kes
 tişt
 """.split()
 )
--- a/spacy/lang/mk/init.py
+++ b/spacy/lang/mk/init.py
@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return MacedonianLemmatizer(lookups)
 class Macedonian(Language):
    lang = "mk"
    Defaults = MacedonianDefaults
--- a/spacy/language.py
+++ b/spacy/language.py
@ -5,7 +5,7 @@ import multiprocessing as mp
 import random
 import traceback
 import warnings
-from contextlib import contextmanager
+from contextlib import ExitStack, contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
 from itertools import chain, cycle
@ -31,6 +31,7 @@ from typing import (
 )
 import srsly
 from cymem.cymem import Pool
 from thinc.api import Config, CupyOps, Optimizer, get_current_ops
 from . import about, ty, util
@ -2091,6 +2092,38 @@ class Language:
                util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
                tok2vec.remove_listener(listener, pipe_name)
    @contextmanager
    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
        """Begin a block where all resources allocated during the block will
        be freed at the end of it. If a resource was created within the
        memory zone block, accessing it outside the block is invalid.
        Behaviour of this invalid access is undefined. Memory zones should
        not be nested.
        The memory zone is helpful for services that need to process large
        volumes of text with a defined memory budget.
        Example
        -------
        >>> with nlp.memory_zone():
        ...     for doc in nlp.pipe(texts):
        ...        process_my_doc(doc)
        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
        mem (Optional[Pool]): The pool handed to each component's memory
            zone. A new Pool is created if None.
        YIELDS (Pool): The pool used for the zone.
        """
        if mem is None:
            mem = Pool()
        # The ExitStack allows programmatic nested context managers.
        # We don't know how many we need, so it would be awkward to have
        # them as nested blocks. The stack exits every entered zone when
        # this block is left.
        with ExitStack() as stack:
            stack.enter_context(self.vocab.memory_zone(mem))
            # The tokenizer and pipes only take part if they offer a zone.
            if hasattr(self.tokenizer, "memory_zone"):
                stack.enter_context(self.tokenizer.memory_zone(mem))
            for _, pipe in self.pipeline:
                if hasattr(pipe, "memory_zone"):
                    stack.enter_context(pipe.memory_zone(mem))
            yield mem
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@ -203,7 +203,7 @@ cdef class ArcEagerGold:
    def __init__(self, ArcEager moves, StateClass stcls, Example example):
        self.mem = Pool()
        heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
+        labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels]
        sent_starts = _get_aligned_sent_starts(example)
        assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
        self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
--- a/spacy/pipeline/_parser_internals/nonproj.pyx
+++ b/spacy/pipeline/_parser_internals/nonproj.pyx
@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc):
            new_label, head_label = label.split(DELIMITER)
            new_head = _find_new_head(doc[i], head_label)
            doc.c[i].head = new_head.i - i
-            doc.c[i].dep = doc.vocab.strings.add(new_label)
+            doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False)
    set_children_from_heads(doc.c, 0, doc.length)
    return doc
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -25,5 +25,7 @@ cdef class StringStore:
    cdef vector[hash_t] keys
    cdef public PreshMap _map
-    cdef const Utf8Str* intern_unicode(self, str py_string)
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient) 
    cdef vector[hash_t] _transient_keys
    cdef Pool _non_temp_mem
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -1,9 +1,14 @@
 # cython: infer_types=True
 # cython: profile=False
 cimport cython
 from contextlib import contextmanager
 from typing import Iterator, List, Optional
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash32, hash64
 from preshed.maps cimport map_clear
 import srsly
@ -31,7 +36,7 @@ def get_string_id(key):
    This function optimises for convenience over performance, so shouldn't be
    used in tight loops.
    """
-    cdef hash_t str_hash    
+    cdef hash_t str_hash
    if isinstance(key, str):
        if len(key) == 0:
            return 0
@ -45,8 +50,8 @@ def get_string_id(key):
    elif _try_coerce_to_hash(key, &str_hash):
        # Coerce the integral key to the expected primitive hash type.
        # This ensures that custom/overloaded "primitive" data types
-        # such as those implemented by numpy are not inadvertently used 
+        # such as those implemented by numpy are not inadvertently used
-        # downsteam (as these are internally implemented as custom PyObjects 
+        # downsteam (as these are internally implemented as custom PyObjects
        # whose comparison operators can incur a significant overhead).
        return str_hash
    else:
@ -119,10 +124,11 @@ cdef class StringStore:
        strings (iterable): A sequence of unicode strings to add to the store.
        """
        self.mem = Pool()
        self._non_temp_mem = self.mem
        self._map = PreshMap()
        if strings is not None:
            for string in strings:
-                self.add(string)
+                self.add(string, allow_transient=False)
    def __getitem__(self, object string_or_id):
        """Retrieve a string from a given hash, or vice versa.
@ -152,14 +158,17 @@ cdef class StringStore:
                return SYMBOLS_BY_INT[str_hash]
            else:
                utf8str = <Utf8Str*>self._map.get(str_hash)
                if utf8str is NULL:
                    raise KeyError(Errors.E018.format(hash_value=string_or_id))
                else:
                    return decode_Utf8Str(utf8str)
        else:
            # TODO: Raise an error instead
            utf8str = <Utf8Str*>self._map.get(string_or_id)
-
+            if utf8str is NULL:
-        if utf8str is NULL:
+                raise KeyError(Errors.E018.format(hash_value=string_or_id))
-            raise KeyError(Errors.E018.format(hash_value=string_or_id))
+            else:
-        else:
+                return decode_Utf8Str(utf8str)
            return decode_Utf8Str(utf8str)
    def as_int(self, key):
        """If key is an int, return it; otherwise, get the int value."""
@ -175,12 +184,46 @@ cdef class StringStore:
        else:
            return self[key]
-    def add(self, string):
+    def __len__(self) -> int:
        """The number of strings in the store.
        RETURNS (int): The number of strings in the store.
        """
        return self.keys.size() + self._transient_keys.size()
    @contextmanager
    def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
        """Begin a block where all resources allocated during the block will
        be freed at the end of it. If a resource was created within the
        memory zone block, accessing it outside the block is invalid.
        Behaviour of this invalid access is undefined. Memory zones should
        not be nested.
        The memory zone is helpful for services that need to process large
        volumes of text with a defined memory budget.
        mem (Optional[Pool]): The pool to allocate from while the zone is
            active. A new Pool is created if None.
        YIELDS (Pool): The pool used for the zone.
        """
        if mem is None:
            mem = Pool()
        self.mem = mem
        try:
            yield mem
        finally:
            # Run the cleanup even if the block raised. Otherwise the map
            # would keep entries for the transient keys and self.mem would
            # stay pointed at the temporary pool.
            for key in self._transient_keys:
                map_clear(self._map.c_map, key)
            self._transient_keys.clear()
            self.mem = self._non_temp_mem
    def add(self, string: str, allow_transient: Optional[bool] = None) -> int:
        """Add a string to the StringStore.
        string (str): The string to add.
        allow_transient (bool): Allow the string to be stored in the 'transient'
          map, which will be flushed at the end of the memory zone. Strings
          encountered during arbitrary text processing should be added
          with allow_transient=True, while labels and other strings used
          internally should not.
        RETURNS (uint64): The string's hash value.
        """
        if allow_transient is None:
            allow_transient = self.mem is not self._non_temp_mem
        cdef hash_t str_hash
        if isinstance(string, str):
            if string in SYMBOLS_BY_STR:
@ -188,22 +231,26 @@ cdef class StringStore:
            string = string.encode("utf8")
            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
        elif isinstance(string, bytes):
            if string in SYMBOLS_BY_STR:
                return SYMBOLS_BY_STR[string]
            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
        else:
            raise TypeError(Errors.E017.format(value_type=type(string)))
        return str_hash
    def __len__(self):
        """The number of strings in the store.
        if string in SYMBOLS_BY_STR:
            return SYMBOLS_BY_STR[string]
        else:
            return self._intern_str(string, allow_transient)
        RETURNS (int): The number of strings in the store.
        """
-        return self.keys.size()
+        return self.keys.size() + self._transient_keys.size()
    def __contains__(self, string_or_id not None):
        """Check whether a string or ID is in the store.
@ -222,12 +269,17 @@ cdef class StringStore:
            pass
        else:
            # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
+            if self._map.get(string_or_id) is not NULL:
-
+                return True
            else:
                return False
        if str_hash < len(SYMBOLS_BY_INT):
            return True
        else:
-            return self._map.get(str_hash) is not NULL
+            if self._map.get(str_hash) is not NULL:
                return True
            else:
                return False
    def __iter__(self):
        """Iterate over the strings in the store, in order.
@ -240,12 +292,29 @@ cdef class StringStore:
            key = self.keys[i]
            utf8str = <Utf8Str*>self._map.get(key)
            yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
+        for i in range(self._transient_keys.size()):
            key = self._transient_keys[i]
            utf8str = <Utf8Str*>self._map.get(key)
            yield decode_Utf8Str(utf8str)
    def __reduce__(self):
        strings = list(self)
        return (StringStore, (strings,), None, None, None)
    def values(self) -> List[int]:
        """Return the stored string hashes in insertion order.
        The hashes from `keys` come first, followed by the hashes from
        `_transient_keys`.
        RETURNS (List[int]): A list of string hashes.
        """
        cdef int i
        # The key vector is declared as `keys`; there is no `_keys` attribute.
        hashes = [None] * self.keys.size()
        for i in range(self.keys.size()):
            hashes[i] = self.keys[i]
        transient_hashes = [None] * self._transient_keys.size()
        for i in range(self._transient_keys.size()):
            transient_hashes[i] = self._transient_keys[i]
        return hashes + transient_hashes
    def to_disk(self, path):
        """Save the current state to a directory.
@ -269,7 +338,7 @@ cdef class StringStore:
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
        return self
    def to_bytes(self, **kwargs):
@ -289,23 +358,25 @@ cdef class StringStore:
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
        return self
    def _reset_and_load(self, strings):
        self.mem = Pool()
        self._non_temp_mem = self.mem
        self._map = PreshMap()
        self.keys.clear()
        self._transient_keys.clear()
        for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)
-    cdef const Utf8Str* intern_unicode(self, str py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
        # 0 means missing, but we don't bother offsetting the index.
        cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+        return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
    @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
        # TODO: This function's API/behaviour is an unholy mess...
        # 0 means missing, but we don't bother offsetting the index.
        cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
@ -314,5 +385,8 @@ cdef class StringStore:
            return value
        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
        self._map.set(key, value)
-        self.keys.push_back(key)
+        if allow_transient and self.mem is not self._non_temp_mem:
            self._transient_keys.push_back(key)
        else:
            self.keys.push_back(key)
        return value
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -81,6 +81,11 @@ def bn_tokenizer():
    return get_lang_class("bn")().tokenizer
@pytest.fixture(scope="session")
 def bo_tokenizer():
    return get_lang_class("bo")().tokenizer
@pytest.fixture(scope="session")
 def ca_tokenizer():
    return get_lang_class("ca")().tokenizer
--- a/spacy/tests/lang/bo/init.py
+++ b/spacy/tests/lang/bo/init.py
--- a/spacy/tests/lang/bo/test_text.py
+++ b/spacy/tests/lang/bo/test_text.py
@ -0,0 +1,21 @@
 import pytest
@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("999.0", True),
        ("གཅིག་", True),
        ("གཉིས་", True),
        ("ཀླད་ཀོར་", True),
        ("བཅུ་གཅིག་", True),
        ("ཁྱི་", False),
        (",", False),
    ],
 )
 def test_lex_attrs_like_number(bo_tokenizer, text, match):
    tokens = bo_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match
--- a/spacy/tests/lang/kmr/init.py
+++ b/spacy/tests/lang/kmr/init.py
--- a/spacy/tests/lang/kmr/test_text.py
+++ b/spacy/tests/lang/kmr/test_text.py
@ -0,0 +1,27 @@
 import pytest
 from spacy.lang.kmr.lex_attrs import like_num
@pytest.mark.parametrize(
    "word",
    [
        "yekem",
        "duyemîn",
        "100em",
        "dehem",
        "sedemîn",
        "34em",
        "30yem",
        "20emîn",
        "50yemîn",
    ],
 )
 def test_kmr_lex_attrs_like_number_for_ordinal(word):
    assert like_num(word)
@pytest.mark.parametrize("word", ["deh"])
 def test_kmr_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
 # fmt: on
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@ -18,6 +18,7 @@ LANGUAGES = [
    pytest.param("ar", marks=pytest.mark.slow()),
    pytest.param("bg", marks=pytest.mark.slow()),
    "bn",
    pytest.param("bo", marks=pytest.mark.slow()),
    pytest.param("ca", marks=pytest.mark.slow()),
    pytest.param("cs", marks=pytest.mark.slow()),
    pytest.param("da", marks=pytest.mark.slow()),
@ -57,6 +58,7 @@ LANGUAGES = [
    pytest.param("tr", marks=pytest.mark.slow()),
    pytest.param("tt", marks=pytest.mark.slow()),
    pytest.param("ur", marks=pytest.mark.slow()),
    pytest.param("kmr", marks=pytest.mark.slow()),
 ]
--- a/spacy/tests/vocab_vectors/test_memory_zone.py
+++ b/spacy/tests/vocab_vectors/test_memory_zone.py
@ -0,0 +1,36 @@
 from spacy.vocab import Vocab
 def test_memory_zone_no_insertion():
    """Entering and leaving a memory zone without touching the vocab must
    leave the vocab usable afterwards."""
    vocabulary = Vocab()
    # Open and close the zone without inserting anything.
    with vocabulary.memory_zone():
        pass
    horse = vocabulary["horse"]
    assert horse.text == "horse"
 def test_memory_zone_insertion():
    """A word first looked up inside a memory zone must be gone from the
    vocab once the zone ends, while earlier entries remain."""
    vocabulary = Vocab()
    # Looking a word up inserts it; "dog" is added before the zone starts.
    _ = vocabulary["dog"]
    assert "dog" in vocabulary
    assert "horse" not in vocabulary
    with vocabulary.memory_zone():
        horse = vocabulary["horse"]
        assert horse.text == "horse"
    # After the zone: the earlier entry stays, the in-zone entry is gone.
    assert "dog" in vocabulary
    assert "horse" not in vocabulary
 def test_memory_zone_redundant_insertion():
    """Looking up a word that already existed before the memory zone, from
    inside the zone, must not make it disappear when the zone ends."""
    vocabulary = Vocab()
    _ = vocabulary["dog"]
    assert "dog" in vocabulary
    assert "horse" not in vocabulary
    with vocabulary.memory_zone():
        horse = vocabulary["horse"]
        assert horse.text == "horse"
        # Redundant lookup of an entry created outside the zone.
        _ = vocabulary["dog"]
    assert "dog" in vocabulary
    assert "horse" not in vocabulary
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@ -25,9 +25,7 @@ cdef class Tokenizer:
    cdef PhraseMatcher _special_matcher
    # TODO convert to bool in v4
    cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
+    cdef public int max_cache_size
    # https://github.com/explosion/spaCy/pull/9150
    cdef int _unused_int2
    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
    cdef int _apply_special_cases(self, Doc doc) except -1
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -30,7 +30,7 @@ cdef class Tokenizer:
    """
    def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                 suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.
        vocab (Vocab): A storage container for lexical types.
@ -50,6 +50,7 @@ cdef class Tokenizer:
        faster_heuristics (bool): Whether to restrict the final
            Matcher-based pass for rules to those containing affixes or space.
            Defaults to True.
        max_cache_size (int): Maximum number of tokenization chunks to cache.
        EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
@ -69,6 +70,7 @@ cdef class Tokenizer:
        self._rules = {}
        self._special_matcher = PhraseMatcher(self.vocab)
        self._load_special_cases(rules)
        self.max_cache_size = max_cache_size
    @property
    def token_match(self):
@ -397,8 +399,9 @@ cdef class Tokenizer:
                                   has_special, with_special_cases)
        self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                            with_special_cases)
-        self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+        if len(self._cache) < self.max_cache_size:
-                          tokens.length - orig_size)
+            self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                              tokens.length - orig_size)
    cdef str _split_affixes(
        self,
@ -514,9 +517,8 @@ cdef class Tokenizer:
        if n <= 0:
            # avoid mem alloc of zero length
            return 0
-        for i in range(n):
+        if self.vocab.in_memory_zone:
-            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
+            return 0
                return 0
        # See #1250
        if has_special[0]:
            return 0
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -41,7 +41,9 @@ cdef class Vocab:
    cdef const TokenC* make_fused_token(self, substrings) except NULL
    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
    cdef PreshMap _by_orth
    cdef Pool _non_temp_mem
    cdef vector[attr_t] _transient_orths
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@ -1,6 +1,8 @@
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
 from cymem.cymem import Pool
 from thinc.types import Floats1d, FloatsXd
 from . import Language
@ -67,6 +69,8 @@ class Vocab:
    def from_bytes(
        self, bytes_data: bytes, *, exclude: Iterable[str] = ...
    ) -> Vocab: ...
    @contextmanager
    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
 def pickle_vocab(vocab: Vocab) -> Any: ...
 def unpickle_vocab(
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -1,8 +1,11 @@
 import functools
 from contextlib import ExitStack, contextmanager
 from typing import Iterator, Optional
 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
 from preshed.maps cimport map_clear
 from .attrs cimport LANG, ORTH
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
@ -87,6 +90,12 @@ cdef class Vocab:
        self.lookups = lookups
        self.writing_system = writing_system
        self.get_noun_chunks = get_noun_chunks
        # During a memory_zone we replace our mem object with one
        # that's passed to us. We keep a reference to our non-temporary
        # memory here, in case we need to make an allocation we want to
        # guarantee is not temporary. This is also how we check whether
        # we're in a memory zone: we check whether self.mem is self._non_temp_mem
        self._non_temp_mem = self.mem
    @property
    def vectors(self):
@ -96,7 +105,7 @@ cdef class Vocab:
    def vectors(self, vectors):
        if hasattr(vectors, "strings"):
            for s in vectors.strings:
-                self.strings.add(s)
+                self.strings.add(s, allow_transient=False)
        self._vectors = vectors
        self._vectors.strings = self.strings
@ -107,6 +116,10 @@ cdef class Vocab:
            langfunc = self.lex_attr_getters.get(LANG, None)
        return langfunc("_") if langfunc else ""
    @property
    def in_memory_zone(self) -> bool:
        """Whether `self.mem` currently differs from `self._non_temp_mem`,
        i.e. a memory zone has swapped in another pool."""
        using_permanent_pool = self.mem is self._non_temp_mem
        return not using_permanent_pool
    def __len__(self):
        """The current number of lexemes stored.
@ -114,6 +127,33 @@ cdef class Vocab:
        """
        return self.length
    @contextmanager
    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
        """Begin a block where resources allocated during the block will
        be freed at the end of it. If a resource was created within the
        memory zone block, accessing it outside the block is invalid.
        Behaviour of this invalid access is undefined. Memory zones should
        not be nested.
        The memory zone is helpful for services that need to process large
        volumes of text with a defined memory budget.

        mem (Optional[Pool]): Pool to allocate from while the zone is active.
            A fresh Pool is created when omitted.
        YIELDS (Pool): The pool used for the duration of the zone.
        """
        if mem is None:
            mem = Pool()
        try:
            # The ExitStack allows programmatic nested context managers.
            # We don't know how many we need, so it would be awkward to have
            # them as nested blocks.
            with ExitStack() as stack:
                stack.enter_context(self.strings.memory_zone(mem))
                if hasattr(self.morphology, "memory_zone"):
                    stack.enter_context(self.morphology.memory_zone(mem))
                if hasattr(self._vectors, "memory_zone"):
                    stack.enter_context(self._vectors.memory_zone(mem))
                self.mem = mem
                yield mem
        finally:
            # An exception raised inside the caller's `with` body is re-raised
            # at the `yield` above. Without the `finally` the vocab would keep
            # its transient entries and stay pointed at the temporary pool.
            self._clear_transient_orths()
            self.mem = self._non_temp_mem
    def add_flag(self, flag_getter, int flag_id=-1):
        """Set a new boolean flag to words in the vocabulary.
@ -148,8 +188,7 @@ cdef class Vocab:
    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
        """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
+        `Lexeme` if necessary.
        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
        """
        if string == "":
            return &EMPTY_LEXEME
@ -180,19 +219,11 @@ cdef class Vocab:
            return self._new_lexeme(mem, self.strings[orth])
    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
+        # The mem argument is deprecated, replaced by memory zones. Same with
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
+        # this size heuristic.
        # was originally supposed to work. The best solution to the growing
        # memory use is to periodically reset the vocab, which is an action
        # that should be up to the user to do (so we don't need to keep track
        # of the doc ownership).
        # TODO: Change the C API so that the mem isn't passed in here.
        mem = self.mem
        # if len(string) < 3 or self.length < 10000:
        #    mem = self.mem
        cdef bint is_oov = mem is not self.mem
        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
-        lex.orth = self.strings.add(string)
+        lex.orth = self.strings.add(string, allow_transient=True)
        lex.length = len(string)
        if self.vectors is not None and hasattr(self.vectors, "key2row"):
            lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
@ -202,18 +233,25 @@ cdef class Vocab:
            for attr, func in self.lex_attr_getters.items():
                value = func(string)
                if isinstance(value, str):
-                    value = self.strings.add(value)
+                    value = self.strings.add(value, allow_transient=True)
                if value is not None:
                    Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
+        self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
            self._add_lex_to_vocab(lex.orth, lex)
        if lex == NULL:
            raise ValueError(Errors.E085.format(string=string))
        return lex
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
        self._by_orth.set(lex.orth, <void*>lex)
        self.length += 1
        if is_transient and self.in_memory_zone:
            self._transient_orths.push_back(lex.orth)
    def _clear_transient_orths(self):
        """Remove transient lexemes from the index (generally at the end of the memory zone)"""
        for orth in self._transient_orths:
            map_clear(self._by_orth.c_map, orth)
            # _add_lex_to_vocab counted this lexeme in self.length when it was
            # inserted; undo that so len(vocab) doesn't keep growing with
            # entries that are no longer in the index.
            self.length -= 1
        self._transient_orths.clear()
    def __contains__(self, key):
        """Check whether the string or int key has an entry in the vocabulary.
@ -265,7 +303,7 @@ cdef class Vocab:
        """
        cdef attr_t orth
        if isinstance(id_or_string, str):
-            orth = self.strings.add(id_or_string)
+            orth = self.strings.add(id_or_string, allow_transient=True)
        else:
            orth = id_or_string
        return Lexeme(self, orth)
@ -417,7 +455,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#get_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        if self.has_vector(key):
@ -436,7 +474,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#set_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=False)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        if self.vectors.is_full and key not in self.vectors:
@ -460,7 +498,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#has_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        return key in self.vectors
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@ -31,6 +31,12 @@
            "name": "Bengali",
            "has_examples": true
        },
        {
            "code": "bo",
            "name": "Tibetan",
            "example": "འདི་ཚིག་གྲུབ་རེད།",
            "has_examples": true
        },
        {
            "code": "ca",
            "name": "Catalan",
@ -480,6 +486,12 @@
            ],
            "example": "这是一个用于示例的句子。",
            "has_examples": true
        },
        {
            "code": "kmr",
            "name": "Kurdish Kurmanji",
            "example": "Ev hevokek e",
            "has_examples": true
        }
    ],
    "licenses": [
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
--- a/website/src/templates/index.js
+++ b/website/src/templates/index.js
@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
 }
 const navAlert = (
-    <Link to="https://form.typeform.com/to/WlflqP1b" noLinkLayout>
+    <Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
-        💥 Interested in <strong>Premium spaCy Models</strong>?
+        💥 <strong>New:</strong> Case study with S&P Global
    </Link>
 )