2025-08-04 12:20:20 +03:00
316 changed files with 6362 additions and 17348 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@ -1 +0,0 @@
 custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]
--- a/.github/workflows/cibuildwheel.yml
+++ b/.github/workflows/cibuildwheel.yml
@ -1,99 +0,0 @@
 # Runs the wheel/sdist build jobs whenever a release or prerelease tag is pushed.
 name: Build
 on:
  push:
    tags:
      # GitHub filter patterns are glob-like, not regular expressions:
      # [0-9]+ matches one or more digits, and ** matches zero or more of
      # any character, so anything may follow the version number.
      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
 jobs:
  # Builds binary wheels with cibuildwheel, one job per runner in the matrix,
  # and uploads the resulting .whl files as workflow artifacts.
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # macos-13 is an intel runner, macos-14 is apple silicon
        os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm]
    steps:
      - uses: actions/checkout@v4
      # aarch64 (arm) wheels used to be built via QEMU emulation, but QEMU is
      # too slow, so the setup step below stays disabled. The matrix lists
      # ubuntu-24.04-arm instead (presumably a native ARM runner - confirm).
      #- name: Set up QEMU
      #  if: runner.os == 'Linux'
      #  uses: docker/setup-qemu-action@v3
      #  with:
      #    platforms: all
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.21.3
        env:
          # NOTE(review): "auto" should select the runner's native
          # architectures - confirm against the cibuildwheel docs.
          CIBW_ARCHS_LINUX: auto
        with:
          package-dir: .
          output-dir: wheelhouse
          config-file: "{package}/pyproject.toml"
      - uses: actions/upload-artifact@v4
        with:
          # The cibw- prefix is what create_release matches when downloading;
          # os + job-index keeps the name distinct for every matrix job.
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl
  # Builds the source distribution (.tar.gz) once on Linux and uploads it
  # as the cibw-sdist artifact.
  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build sdist
        run: pipx run build --sdist
      - uses: actions/upload-artifact@v4
        with:
          # Shares the cibw- prefix so create_release picks it up with the wheels.
          name: cibw-sdist
          path: dist/*.tar.gz
  # Collects the wheel and sdist artifacts from the build jobs and attaches
  # them to a draft GitHub release named after the pushed tag.
  create_release:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      checks: write
      actions: read
      issues: read
      packages: write
      pull-requests: read
      repository-projects: read
      statuses: read
    steps:
      - name: Get the tag name and determine if it's a prerelease
        id: get_tag_info
        # Exports three variables to later steps via $GITHUB_ENV:
        #   FULL_TAG      - the pushed tag, e.g. release-v1.2.3
        #   TAG_NAME      - the tag without its release-/prerelease- prefix
        #   IS_PRERELEASE - "true" for prerelease-* tags, otherwise "false"
        run: |
          FULL_TAG=${GITHUB_REF#refs/tags/}
          if [[ $FULL_TAG == release-* ]]; then
            TAG_NAME=${FULL_TAG#release-}
            IS_PRERELEASE=false
          elif [[ $FULL_TAG == prerelease-* ]]; then
            TAG_NAME=${FULL_TAG#prerelease-}
            IS_PRERELEASE=true
          else
            echo "Tag does not match expected patterns" >&2
            exit 1
          fi
          echo "FULL_TAG=$FULL_TAG" >> "$GITHUB_ENV"
          echo "TAG_NAME=$TAG_NAME" >> "$GITHUB_ENV"
          echo "IS_PRERELEASE=$IS_PRERELEASE" >> "$GITHUB_ENV"
      - uses: actions/download-artifact@v4
        with:
          # unpacks all CIBW artifacts into dist/
          pattern: cibw-*
          path: dist
          merge-multiple: true
      - name: Create Draft Release
        id: create_release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: ${{ env.TAG_NAME }}
          # Left as a draft so the release is only published by a manual step.
          draft: true
          prerelease: ${{ env.IS_PRERELEASE }}
          files: "./dist/*"
--- a/.github/workflows/explosionbot.yml
+++ b/.github/workflows/explosionbot.yml
@ -15,7 +15,7 @@ jobs:
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
      - name: Install and run explosion-bot
        run: |
--- a/.github/workflows/gputests.yml.disabled
+++ b/.github/workflows/gputests.yml.disabled
--- a/.github/workflows/lock.yml
+++ b/.github/workflows/lock.yml
@ -16,7 +16,7 @@ jobs:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
-      - uses: dessant/lock-threads@v5
+      - uses: dessant/lock-threads@v4
        with:
          process-only: 'issues'
          issue-inactive-days: '30'
--- a/.github/workflows/publish_pypi.yml
+++ b/.github/workflows/publish_pypi.yml
@ -1,29 +0,0 @@
 # The Build workflow (cibuildwheel.yml) runs when a release tag is pushed and
 # creates a draft release with the built artifacts attached; this workflow
 # runs when a release is published.
 # The expected workflow is to let the draft release be created and the wheels
 # upload, and then hit 'publish', which uploads to PyPI.
 on:
  release:
    types:
      - published
 jobs:
  upload_pypi:
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/spacy
    permissions:
      # NOTE(review): id-token: write is presumably for PyPI trusted
      # publishing (OIDC), since no API token is passed below - confirm.
      id-token: write
      contents: read
    if: github.event_name == 'release' && github.event.action == 'published'
    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      # Download every asset attached to the published release into dist/.
      - uses: robinraju/release-downloader@v1
        with:
          tag: ${{ github.event.release.tag_name }}
          fileName: '*'
          out-file-path: 'dist'
      - uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/slowtests.yml.disabled
+++ b/.github/workflows/slowtests.yml.disabled
@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          ref: ${{ matrix.branch }}
      - name: Get commits from past 24 hours
--- a/.github/workflows/spacy_universe_alert.yml
+++ b/.github/workflows/spacy_universe_alert.yml
@ -18,7 +18,7 @@ jobs:
        run: |
          echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -2,8 +2,6 @@ name: tests
 on:
  push:
    tags-ignore:
      - '**'
    branches-ignore:
      - "spacy.io"
      - "nightly.spacy.io"
@ -12,6 +10,7 @@ on:
      - "*.md"
      - "*.mdx"
      - "website/**"
      - ".github/workflows/**"
  pull_request:
    types: [opened, synchronize, reopened, edited]
    paths-ignore:
@ -26,12 +25,13 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
-          python-version: "3.10"
+          python-version: "3.7"
          architecture: x64
      - name: black
        run: |
@ -45,13 +45,6 @@ jobs:
        run: |
          python -m pip install flake8==5.0.4
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
          # Unfortunately cython-lint isn't working after the shift to Cython 3.
          #- name: cython-lint
          #  run: |
          #    python -m pip install cython-lint -c requirements.txt
          #    # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
          #    cython-lint spacy --ignore E501,W291,E266
  tests:
    name: Test
    needs: Validate
@ -59,18 +52,30 @@ jobs:
      fail-fast: true
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
-        python_version: ["3.9", "3.12", "3.13"]
+        python_version: ["3.11"]
        include:
          - os: ubuntu-20.04
            python_version: "3.6"
          - os: windows-latest
            python_version: "3.7"
          - os: macos-latest
            python_version: "3.8"
          - os: ubuntu-latest
            python_version: "3.9"
          - os: windows-latest
            python_version: "3.10"
    runs-on: ${{ matrix.os }}
    steps:
      - name: Check out repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          architecture: x64
      - name: Install dependencies
        run: |
@ -84,7 +89,7 @@ jobs:
      - name: Run mypy
        run: |
          python -m mypy spacy
-        if: matrix.python_version != '3.7'
+        if: matrix.python_version != '3.6'
      - name: Delete source directory and .egg-info
        run: |
@ -106,22 +111,22 @@ jobs:
      - name: Test import
        run: python -W error -c "import spacy"
-      - name: "Test download CLI"
+#      - name: "Test download CLI"
-        run: |
+#        run: |
-          python -m spacy download ca_core_news_sm
+#          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
+#          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+#          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+#        if: matrix.python_version == '3.9'
-
+#
-      - name: "Test download_url in info CLI"
+#      - name: "Test download_url in info CLI"
-        run: |
+#        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+#          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
+#        if: matrix.python_version == '3.9'
-
+#
-      - name: "Test no warnings on load (#11713)"
+#      - name: "Test no warnings on load (#11713)"
-        run: |
+#        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+#          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+#        if: matrix.python_version == '3.9'
      - name: "Test convert CLI"
        run: |
@ -145,19 +150,17 @@ jobs:
          python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
        if: matrix.python_version == '3.9'
-      - name: "Test assemble CLI"
+#      - name: "Test assemble CLI"
-        run: |
+#        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          python -m spacy assemble ner_source_sm.cfg output_dir
+#          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        env:
+#        if: matrix.python_version == '3.9'
-          PYTHONWARNINGS: "error,ignore::DeprecationWarning" 
+#
-        if: matrix.python_version == '3.9'
+#      - name: "Test assemble CLI vectors warning"
-
+#        run: |
-      - name: "Test assemble CLI vectors warning"
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-        run: |
+#          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+#        if: matrix.python_version == '3.9'
          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
        if: matrix.python_version == '3.9'
      - name: "Install test requirements"
        run: |
--- a/.github/workflows/universe_validation.yml
+++ b/.github/workflows/universe_validation.yml
@ -20,12 +20,13 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"
          architecture: x64
      - name: Validate website/meta/universe.json
        run: |
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -35,7 +35,7 @@ so that more people can benefit from it.
 When opening an issue, use a **descriptive title** and include your
 **environment** (operating system, Python version, spaCy version). Our
-[issue templates](https://github.com/explosion/spaCy/issues/new/choose) help you
+[issue template](https://github.com/explosion/spaCy/issues/new) helps you
 remember the most important details to include. If you've discovered a bug, you
 can also submit a [regression test](#fixing-bugs) straight away. When you're
 opening an issue to report the bug, simply refer to your pull request in the
@ -449,12 +449,13 @@ and plugins in spaCy v3.0, and we can't wait to see what you build with it!
  [`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and
  [`spacy-extensions`](https://github.com/topics/spacy-extension?o=desc&s=stars)
  to make it easier to find. Those are also the topics we're linking to from the
-  spaCy website. If you're sharing your project on X, feel free to tag
+  spaCy website. If you're sharing your project on Twitter, feel free to tag
-  [@spacy_io](https://x.com/spacy_io) so we can check it out.
+  [@spacy_io](https://twitter.com/spacy_io) so we can check it out.
- Once your extension is published, you can open a
+- Once your extension is published, you can open an issue on the
-  [PR](https://github.com/explosion/spaCy/pulls) to suggest it for the
+  [issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the
-  [Universe](https://spacy.io/universe) page.
+  [resources directory](https://spacy.io/usage/resources#extensions) on the
  website.
 📖 **For more tips and best practices, see the [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).**
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 The MIT License (MIT)
-Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -4,6 +4,5 @@ include README.md
 include pyproject.toml
 include spacy/py.typed
 recursive-include spacy/cli *.yml
 recursive-include spacy/tests *.json
 recursive-include licenses *
 recursive-exclude spacy *.cpp
--- a/4
+++ b/4
@ -1,11 +1,11 @@
 SHELL := /bin/bash
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.3
+override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
 endif
 ifndef PYVER
-override PYVER = 3.8
+override PYVER = 3.6
 endif
 VENV := ./env$(PYVER)
--- a/README.md
+++ b/README.md
@ -6,20 +6,23 @@ spaCy is a library for **advanced Natural Language Processing** in Python and
 Cython. It's built on the very latest research, and was designed from day one to
 be used in real products.
-spaCy comes with [pretrained pipelines](https://spacy.io/models) and currently
+spaCy comes with
-supports tokenization and training for **70+ languages**. It features
+[pretrained pipelines](https://spacy.io/models) and
-state-of-the-art speed and **neural network models** for tagging, parsing,
+currently supports tokenization and training for **70+ languages**. It features
-**named entity recognition**, **text classification** and more, multi-task
+state-of-the-art speed and **neural network models** for tagging,
-learning with pretrained **transformers** like BERT, as well as a
+parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
 production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the
+open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
-💫 **Version 3.8 out now!**
+💥 **We'd love to hear more about your experience with spaCy!**
 [Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
 💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
-[![tests](https://github.com/explosion/spaCy/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/spaCy/actions/workflows/tests.yml)
+[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
 [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
 [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
 [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
@ -28,47 +31,39 @@ open-source software, released under the
 <br />
 [![PyPi downloads](https://static.pepy.tech/personalized-badge/spacy?period=total&units=international_system&left_color=grey&right_color=orange&left_text=pip%20downloads)](https://pypi.org/project/spacy/)
 [![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?label=conda%20downloads)](https://anaconda.org/conda-forge/spacy)
 [![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)
 ## 📖 Documentation
 | Documentation                 |                                                                        |
-| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------- | ---------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]**           | New to spaCy? Here's everything you need to know!                      |
 | 📚 **[Usage Guides]**         | How to use spaCy and its features.                                     |
 | 🚀 **[New in v3.0]**          | New features, backwards incompatibilities and migration guide.         |
 | 🪐 **[Project Templates]**    | End-to-end workflows you can clone, modify and run.                    |
 | 🎛 **[API Reference]**         | The detailed reference for spaCy's API.                                |
 | ⏩ **[GPU Processing]**                                                                                                                                                                                                    | Use spaCy with CUDA-compatible GPU processing.                                                                                                                                                                                                                                                                                               |
 | 📦 **[Models]**               | Download trained pipelines for spaCy.                                  |
 | 🦙 **[Large Language Models]**                                                                                                                                                                                            | Integrate LLMs into spaCy pipelines.                                                                                                                                                                                                                                                                                                        |
 | 🌌 **[Universe]**             | Plugins, extensions, demos and books from the spaCy ecosystem.         |
 | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
 | 📰 **[Blog]**                                                                                                                                                                                                             | Read about current spaCy and Prodigy development, releases, talks and more from Explosion.                                                                                                                                                                                                                 |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🔴 **[Live Stream]**                                                                                                                                                                                                       | Join Matt as he works on spaCy and chat about NLP, live every week.                                                                                                                                                                                                                                                                         |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
-| 👕 **[Swag]**                                                                                                                                                                                                             | Support us and our work with unique, custom-designed swag!                                                                                                                                                                                                                                                                                   |
+| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
-| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/tailored-solutions)**                 |
+| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Analysis"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
 [usage guides]: https://spacy.io/usage/
 [api reference]: https://spacy.io/api/
 [gpu processing]: https://spacy.io/usage#gpu
 [models]: https://spacy.io/models
 [large language models]: https://spacy.io/usage/large-language-models
 [universe]: https://spacy.io/universe
-[spacy vs code extension]: https://github.com/explosion/spacy-vscode
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c
 [online course]: https://course.spacy.io
 [blog]: https://explosion.ai
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 [swag]: https://explosion.ai/merch
 ## 💬 Where to ask questions
@ -80,14 +75,13 @@ more people can benefit from it.
 | Type                            | Platforms                               |
 | ------------------------------- | --------------------------------------- |
 | 🚨 **Bug Reports**              | [GitHub Issue Tracker]                  |
-| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] · [Live Stream]    |
+| 🎁 **Feature Requests & Ideas** | [GitHub Discussions]                    |
 | 👩‍💻 **Usage Questions**          | [GitHub Discussions] · [Stack Overflow] |
-| 🗯 **General Discussion**        | [GitHub Discussions] · [Live Stream]   |
+| 🗯 **General Discussion**        | [GitHub Discussions]                    |
 [github issue tracker]: https://github.com/explosion/spaCy/issues
 [github discussions]: https://github.com/explosion/spaCy/discussions
 [stack overflow]: https://stackoverflow.com/questions/tagged/spacy
 [live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c
 ## Features
@ -98,9 +92,7 @@ more people can benefit from it.
 - State-of-the-art speed
 - Production-ready **training system**
 - Linguistically-motivated **tokenization**
- Components for named **entity recognition**, part-of-speech-tagging,
+- Components for named **entity recognition**, part-of-speech-tagging, dependency parsing, sentence segmentation, **text classification**, lemmatization, morphological analysis, entity linking and more
  dependency parsing, sentence segmentation, **text classification**,
  lemmatization, morphological analysis, entity linking and more
 - Easily extensible with **custom components** and attributes
 - Support for custom models in **PyTorch**, **TensorFlow** and other frameworks
 - Built in **visualizers** for syntax and NER
@ -117,7 +109,7 @@ For detailed installation instructions, see the
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
  Studio)
- **Python version**: Python >=3.7, <3.13 (only 64 bit)
+- **Python version**: Python 3.6+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)
 [pip]: https://pypi.org/project/spacy/
@ -126,8 +118,8 @@ For detailed installation instructions, see the
 ### pip
 Using pip, spaCy releases are available as source packages and binary wheels.
-Before you install spaCy and its dependencies, make sure that your `pip`,
+Before you install spaCy and its dependencies, make sure that
-`setuptools` and `wheel` are up to date.
+your `pip`, `setuptools` and `wheel` are up to date.
 ```bash
 pip install -U pip setuptools wheel
@ -182,9 +174,9 @@ with the new version.
 ## 📦 Download model packages
-Trained pipelines for spaCy can be installed as **Python packages**. This means
+Trained pipelines for spaCy can be installed as **Python packages**. This
-that they're a component of your application, just like any other module. Models
+means that they're a component of your application, just like any other module.
-can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
+Models can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
 command, or manually by pointing pip to a path or URL.
 | Documentation              |                                                                  |
@ -250,7 +242,8 @@ do that depends on your system.
 | **Mac**     | Install a recent version of [XCode](https://developer.apple.com/xcode/), including the so-called "Command Line Tools". macOS and OS X ship with Python and git preinstalled.                                                                                        |
 | **Windows** | Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that matches the version that was used to compile your Python interpreter. |
-For more details and instructions, see the documentation on
+For more details
 and instructions, see the documentation on
 [compiling spaCy from source](https://spacy.io/usage#source) and the
 [quickstart widget](https://spacy.io/usage#section-quickstart) to get the right
 commands for your platform and Python version.
--- a/bin/release.sh
+++ b/bin/release.sh
@ -1,20 +0,0 @@
 #!/usr/bin/env bash
 # Tags the current commit as release-v<version> and pushes the tag to origin.
 # <version> is read from the __version__ assignment in spacy/about.py.
 set -e
 # Insist repository is clean
 git diff-index --quiet HEAD || { echo "Repository has uncommitted changes; aborting." >&2; exit 1; }
 # Grab the line that assigns __version__, then strip the assignment prefix
 # and up to two single and two double quote characters around the value.
 version=$(grep "__version__ = " spacy/about.py)
 version=${version/__version__ = }
 version=${version/\'/}
 version=${version/\'/}
 version=${version/\"/}
 version=${version/\"/}
 echo "Pushing release-v${version}"
 # Delete any existing local and remote tag of the same name first; both
 # deletions may fail when the tag does not exist yet, hence `|| true`.
 git tag -d "release-v${version}" || true
 git push origin ":release-v${version}" || true
 git tag "release-v${version}"
 git push origin "release-v${version}"
--- a/build-constraints.txt
+++ b/build-constraints.txt
@ -1,2 +1,9 @@
-# build version constraints for use with wheelwright
+# build version constraints for use with wheelwright + multibuild
-numpy>=2.0.0,<3.0.0
+numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
 numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
 numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
 numpy==1.21.3; python_version=='3.10'
 numpy==1.23.2; python_version=='3.11'
 numpy; python_version>='3.12'
--- a/licenses/3rd_party_licenses.txt
+++ b/licenses/3rd_party_licenses.txt
@ -158,45 +158,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 SciPy
 -----
 * Files: scorer.py
 The implementation of trapezoid() is adapted from SciPy, which is distributed
 under the following license:
 New BSD License
 Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following
   disclaimer in the documentation and/or other materials provided
   with the distribution.
 3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,67 +1,14 @@
 [build-system]
 requires = [
    "setuptools",
-    "cython>=3.0,<4.0",
+    "cython>=0.25,<3.0",
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.3.4,<8.4.0",
+    "thinc>=8.1.8,<8.2.0",
-    "numpy>=2.0.0,<3.0.0"
+    "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
 [tool.cibuildwheel]
 build = "*"
 skip = "pp* cp36* cp37* cp38* *-win32 *i686*"
 test-skip = ""
 free-threaded-support = false
 archs = ["native"]
 build-frontend = "default"
 config-settings = {}
 dependency-versions = "pinned"
 environment = { PIP_CONSTRAINT = "build-constraints.txt" }
 environment-pass = []
 build-verbosity = 0
 before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
 before-build = "pip install -r requirements.txt && python setup.py clean"
 repair-wheel-command = ""
 test-command = ""
 before-test = ""
 test-requires = []
 test-extras = []
 container-engine = "docker"
 manylinux-x86_64-image = "manylinux2014"
 manylinux-i686-image = "manylinux2014"
 manylinux-aarch64-image = "manylinux2014"
 manylinux-ppc64le-image = "manylinux2014"
 manylinux-s390x-image = "manylinux2014"
 manylinux-pypy_x86_64-image = "manylinux2014"
 manylinux-pypy_i686-image = "manylinux2014"
 manylinux-pypy_aarch64-image = "manylinux2014"
 musllinux-x86_64-image = "musllinux_1_2"
 musllinux-i686-image = "musllinux_1_2"
 musllinux-aarch64-image = "musllinux_1_2"
 musllinux-ppc64le-image = "musllinux_1_2"
 musllinux-s390x-image = "musllinux_1_2"
 [tool.cibuildwheel.linux]
 repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
 [tool.cibuildwheel.macos]
 repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
 [tool.cibuildwheel.windows]
 [tool.cibuildwheel.pyodide]
 [tool.isort]
 profile = "black"
--- a/requirements.txt
+++ b/requirements.txt
@ -3,36 +3,39 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.3.4,<8.4.0
+thinc>=8.1.8,<8.2.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer-slim>=0.3.0,<1.0.0
+typer>=0.3.0,<0.10.0
-weasel>=0.1.0,<0.5.0
+pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
 # Third party dependencies
-numpy>=2.0.0,<3.0.0
+numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
 typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
-cython>=3.0,<4.0
+cython>=0.25,<3.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
 cython-lint>=0.15.0
 isort>=5.0,<6.0
--- a/setup.cfg
+++ b/setup.cfg
@ -17,11 +17,12 @@ classifiers =
    Operating System :: Microsoft :: Windows
    Programming Language :: Cython
    Programming Language :: Python :: 3
    Programming Language :: Python :: 3.6
    Programming Language :: Python :: 3.7
    Programming Language :: Python :: 3.8
    Programming Language :: Python :: 3.9
    Programming Language :: Python :: 3.10
    Programming Language :: Python :: 3.11
    Programming Language :: Python :: 3.12
    Programming Language :: Python :: 3.13
    Topic :: Scientific/Engineering
 project_urls =
    Release notes = https://github.com/explosion/spaCy/releases
@ -30,18 +31,15 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.9,<3.14
+python_requires = >=3.6
 # NOTE: This section is superseded by pyproject.toml and will be removed in
 # spaCy v4
 setup_requires =
-    cython>=3.0,<4.0
+    cython>=0.25,<3.0
-    numpy>=2.0.0,<3.0.0; python_version < "3.9"
+    numpy>=1.15.0
    numpy>=2.0.0,<3.0.0; python_version >= "3.9"
    # We also need our Cython packages here to compile against
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.3.4,<8.4.0
+    thinc>=8.1.8,<8.2.0
 install_requires =
    # Our libraries
    spacy-legacy>=3.0.11,<3.1.0
@ -49,22 +47,24 @@ install_requires =
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
-    thinc>=8.3.4,<8.4.0
+    thinc>=8.1.8,<8.2.0
    wasabi>=0.9.1,<1.2.0
    srsly>=2.4.3,<3.0.0
    catalogue>=2.0.6,<2.1.0
    weasel>=0.1.0,<0.5.0
    # Third-party dependencies
-    typer-slim>=0.3.0,<1.0.0
+    typer>=0.3.0,<0.10.0
    pathy>=0.10.0
    smart-open>=5.2.1,<7.0.0
    tqdm>=4.38.0,<5.0.0
-    numpy>=1.15.0; python_version < "3.9"
+    numpy>=1.15.0
    numpy>=1.19.0; python_version >= "3.9"
    requests>=2.13.0,<3.0.0
-    pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
    jinja2
    # Official Python utilities
    setuptools
    packaging>=20.0
    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
    langcodes>=3.2.0,<4.0.0
 [options.entry_points]
 console_scripts =
@ -74,7 +74,9 @@ console_scripts =
 lookups =
    spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
-    spacy_transformers>=1.1.2,<1.4.0
+    spacy_transformers>=1.1.2,<1.3.0
 ray =
    spacy_ray>=0.1.0,<1.0.0
 cuda =
    cupy>=5.0.0b4,<13.0.0
 cuda80 =
@ -109,12 +111,10 @@ cuda117 =
    cupy-cuda117>=5.0.0b4,<13.0.0
 cuda11x =
    cupy-cuda11x>=11.0.0,<13.0.0
 cuda12x =
    cupy-cuda12x>=11.5.0,<13.0.0
 cuda-autodetect =
    cupy-wheel>=11.0.0,<13.0.0
 apple =
-    thinc-apple-ops>=1.0.0,<2.0.0
+    thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies
 ja =
    sudachipy>=0.5.2,!=0.6.1
--- a/setup.py
+++ b/setup.py
@ -1,9 +1,10 @@
 #!/usr/bin/env python
 from setuptools import Extension, setup, find_packages
 import sys
 import platform
 import numpy
-from setuptools.command.build_ext import build_ext
+from distutils.command.build_ext import build_ext
-from sysconfig import get_path
+from distutils.sysconfig import get_python_inc
 from pathlib import Path
 import shutil
 from Cython.Build import cythonize
@ -78,7 +79,6 @@ COMPILER_DIRECTIVES = {
    "language_level": -3,
    "embedsignature": True,
    "annotation_typing": False,
    "profile": sys.version_info < (3, 12),
 }
 # Files to copy into the package that are otherwise not included
 COPY_FILES = {
@ -88,6 +88,30 @@ COPY_FILES = {
 }
 def is_new_osx():
    """Report whether the build host is macOS 10.x with a minor version >= 7.
    Only release strings that begin with "10" are inspected; every other
    platform or release string yields False.
    RETURNS (bool): True for a matching macOS release, otherwise False.
    """
    on_mac = sys.platform == "darwin"
    if not on_mac:
        return False
    release = platform.mac_ver()[0]
    # NOTE(review): releases such as "11.x" or "12.x" don't start with "10"
    # and therefore return False here -- confirm that this is intended.
    if not release.startswith("10"):
        return False
    # The second dotted component is the minor version, e.g. "10.15.7" -> 15
    return int(release.split(".")[1]) >= 7
 if is_new_osx():
    # On Mac, use libc++ because Apple deprecated use of
    # libstdc++
    COMPILE_OPTIONS["other"].append("-stdlib=libc++")
    LINK_OPTIONS["other"].append("-lc++")
    # g++ (used by unix compiler on mac) links to libstdc++ as a default lib.
    # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
    LINK_OPTIONS["other"].append("-nodefaultlibs")
 # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
 # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
 class build_ext_options:
@ -180,7 +204,7 @@ def setup_package():
    include_dirs = [
        numpy.get_include(),
-        get_path("include"),
+        get_python_inc(plat_specific=True),
    ]
    ext_modules = []
    ext_modules.append(
--- a/spacy/init.py
+++ b/spacy/init.py
@ -17,7 +17,6 @@ from .cli.info import info  # noqa: F401
 from .errors import Errors
 from .glossary import explain  # noqa: F401
 from .language import Language
 from .registrations import REGISTRY_POPULATED, populate_registry
 from .util import logger, registry  # noqa: F401
 from .vocab import Vocab
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,5 +1,7 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.8.7"
+__version__ = "3.7.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
 __projects_branch__ = "v3"
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -1,4 +1,3 @@
 # cython: profile=False
 from .errors import Errors
 IOB_STRINGS = ("", "I", "O", "B")
@ -118,7 +117,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
        if "pos" in stringy_attrs:
            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
        if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")  # no-cython-lint
+            morphs = stringy_attrs.pop("morph")
        if "number" in stringy_attrs:
            stringy_attrs.pop("number")
        if "tenspect" in stringy_attrs:
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -1,7 +1,5 @@
 from wasabi import msg
 # Needed for testing
 from . import download as download_module  # noqa: F401
 from ._util import app, setup_cli  # noqa: F401
 from .apply import apply  # noqa: F401
 from .assemble import assemble_cli  # noqa: F401
@ -16,7 +14,6 @@ from .debug_diff import debug_diff  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .download import download  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .find_function import find_function  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
 from .info import info  # noqa: F401
 from .init_config import fill_config, init_config  # noqa: F401
@ -24,17 +21,15 @@ from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .package import package  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
-from .project.assets import project_assets  # type: ignore[attr-defined]  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
-from .project.clone import project_clone  # type: ignore[attr-defined]  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
-from .project.document import (  # type: ignore[attr-defined]  # noqa: F401
+from .project.document import project_document  # noqa: F401
-    project_document,
+from .project.dvc import project_update_dvc  # noqa: F401
-)
+from .project.pull import project_pull  # noqa: F401
-from .project.dvc import project_update_dvc  # type: ignore[attr-defined]  # noqa: F401
+from .project.push import project_push  # noqa: F401
-from .project.pull import project_pull  # type: ignore[attr-defined]  # noqa: F401
+from .project.run import project_run  # noqa: F401
-from .project.push import project_push  # type: ignore[attr-defined]  # noqa: F401
+from .train import train_cli  # noqa: F401
-from .project.run import project_run  # type: ignore[attr-defined]  # noqa: F401
+from .validate import validate  # noqa: F401
 from .train import train_cli  # type: ignore[attr-defined]  # noqa: F401
 from .validate import validate  # type: ignore[attr-defined]  # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -25,11 +25,10 @@ from thinc.api import Config, ConfigValidationError, require_gpu
 from thinc.util import gpu_is_available
 from typer.main import get_command
 from wasabi import Printer, msg
 from weasel import app as project_cli
 from .. import about
 from ..compat import Literal
-from ..schemas import validate
+from ..schemas import ProjectConfigSchema, validate
 from ..util import (
    ENV_VARS,
    SimpleFrozenDict,
@ -41,10 +40,15 @@ from ..util import (
    run_command,
 )
 if TYPE_CHECKING:
    from pathy import FluidPath  # noqa: F401
 SDIST_SUFFIX = ".tar.gz"
 WHEEL_SUFFIX = "-py3-none-any.whl"
 PROJECT_FILE = "project.yml"
 PROJECT_LOCK = "project.lock"
 COMMAND = "python -m spacy"
 NAME = "spacy"
 HELP = """spaCy Command-line Interface
@ -70,10 +74,11 @@ Opt = typer.Option
 app = typer.Typer(name=NAME, help=HELP)
 benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
 project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
 debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
 init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
-app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True)
+app.add_typer(project_cli)
 app.add_typer(debug_cli)
 app.add_typer(benchmark_cli)
 app.add_typer(init_cli)
@ -148,6 +153,148 @@ def _parse_override(value: Any) -> Any:
        return str(value)
 def load_project_config(
    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
 ) -> Dict[str, Any]:
    """Read a project directory's project.yml, validate it and optionally
    interpolate its variables. Directories listed under "directories" are
    created if they don't exist yet.
    path (Path): The path to the project directory.
    interpolate (bool): Whether to substitute project variables.
    overrides (Dict[str, Any]): Optional config overrides.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    project_file = path / PROJECT_FILE
    if not project_file.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", project_file, exits=1)
    invalid_msg = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(project_file)
    except ValueError as e:
        msg.fail(invalid_msg, e, exits=1)
    # Schema problems are printed one per line before exiting
    schema_errors = validate(ProjectConfigSchema, config)
    if schema_errors:
        msg.fail(invalid_msg)
        print("\n".join(schema_errors))
        sys.exit(1)
    validate_project_version(config)
    validate_project_commands(config)
    if interpolate:
        with show_validation_error(
            title=f"{PROJECT_FILE} validation error", hint_fill=False
        ):
            config = substitute_project_variables(config, overrides)
    # Create every declared project directory that isn't present yet
    for subdir in config.get("directories", []):
        target = path / subdir
        if target.exists():
            continue
        target.mkdir(parents=True)
    return config
 def substitute_project_variables(
    config: Dict[str, Any],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    key: str = "vars",
    env_key: str = "env",
 ) -> Dict[str, Any]:
    """Resolve variable references in the project file via the config system.
    The variables and environment sections are added to `config` as empty
    dicts if they are missing.
    config (Dict[str, Any]): The project config.
    overrides (Dict[str, Any]): Optional config overrides.
    key (str): Key containing variables in project config.
    env_key (str): Key containing environment variable mapping in project config.
    RETURNS (Dict[str, Any]): The interpolated project config.
    """
    config.setdefault(key, {})
    env_section = config.setdefault(env_key, {})
    # Swap each referenced env var name for that variable's parsed value,
    # falling back to an empty string if the variable isn't set
    for name, env_name in env_section.items():
        env_section[name] = _parse_override(os.environ.get(env_name, ""))
    # The variables need to be repeated in the top scope so that the project
    # itself can live in a top-level "project" section (otherwise, a list of
    # commands in the top scope wouldn't be allowed by Thinc's config system)
    sections = {"project": config, key: config[key], env_key: config[env_key]}
    serialized = Config(sections).to_str()
    resolved = Config().from_str(serialized, overrides=overrides).interpolate()
    return dict(resolved["project"])
 def validate_project_version(config: Dict[str, Any]) -> None:
    """If the project defines a compatible spaCy version range, check that
    the running version of spaCy falls within it and fail otherwise. Configs
    without a "spacy_version" entry are accepted as they are.
    config (Dict[str, Any]): The loaded config.
    """
    spacy_version = config.get("spacy_version", None)
    if not spacy_version:
        # Nothing declared, so there is nothing to check
        return
    if is_compatible_version(about.__version__, spacy_version):
        return
    msg.fail(
        (
            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
            f"that's not compatible with the version of spaCy you're running "
            f"({about.__version__}). You can edit version requirement in the "
            f"{PROJECT_FILE} to load it, but the project may not run as expected."
        ),
        exits=1,
    )
 def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid: command names
    must be unique, workflow names must not clash with command names, and
    workflows may only refer to commands that exist. Each problem is reported
    through msg.fail with exits=1.
    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    # Single pass over the names: anything that was seen before is a duplicate
    seen = set()
    duplicates = set()
    for name in command_names:
        if name in seen:
            duplicates.add(name)
        seen.add(name)
    if duplicates:
        # Sorted so that the message is identical on every run
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(sorted(duplicates))}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        if workflow_name in command_names:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        for step in workflow_steps:
            if step not in command_names:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )
 def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
    """Compute the MD5 hex digest of an object's JSON dump (keys sorted).
    data: The JSON-serializable data to hash.
    exclude (Iterable[str]): Top-level keys to leave out if data is a dict.
    RETURNS (str): The hash.
    """
    payload = data
    if isinstance(data, dict):
        # Only top-level keys are filtered; nested dicts are kept as they are
        payload = {}
        for name, value in data.items():
            if name not in exclude:
                payload[name] = value
    encoded = srsly.json_dumps(payload, sort_keys=True).encode("utf8")
    return hashlib.md5(encoded).hexdigest()
 def get_checksum(path: Union[Path, str]) -> str:
    """Compute the MD5 checksum of a file, or of all files in a directory.
    For a directory, the contents of every file below it (found recursively
    and visited in sorted path order) are fed into one single hash.
    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The checksum.
    """
    target = Path(path)
    if not target.is_file() and not target.is_dir():
        msg.fail(f"Can't get checksum for {target}: not a file or directory", exits=1)
    if target.is_file():
        return hashlib.md5(target.read_bytes()).hexdigest()
    # TODO: this is currently pretty slow
    digest = hashlib.md5()
    files = [entry for entry in target.rglob("*") if entry.is_file()]
    files.sort()
    for entry in files:
        digest.update(entry.read_bytes())
    return digest.hexdigest()
@contextmanager
 def show_validation_error(
    file_path: Optional[Union[str, Path]] = None,
@ -205,10 +352,166 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
    """Upload a file by streaming it to the destination using smart_open.
    src (Path): The source path.
    dest (Union[str, FluidPath]): The destination path or URL to upload to.
    """
    import smart_open
    # Create parent directories for local paths
    if isinstance(dest, Path):
        if not dest.parent.exists():
            # exist_ok covers the directory being created by someone else
            # between the check above and this call
            dest.parent.mkdir(parents=True, exist_ok=True)
    dest = str(dest)
    with smart_open.open(dest, mode="wb") as output_file:
        with src.open(mode="rb") as input_file:
            # Copy in chunks (like download_file does) rather than loading
            # the whole source file into memory first
            shutil.copyfileobj(input_file, output_file)
 def download_file(
    src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
 ) -> None:
    """Fetch a file with smart_open and write it to a local path.
    src (Union[str, FluidPath]): The location of the file to download.
    dest (Path): The destination path.
    force (bool): Whether to force download even if file exists.
        If False, the download will be skipped.
    """
    import smart_open
    keep_existing = dest.exists() and not force
    if keep_existing:
        return None
    # compression="disable" is passed so the bytes are copied as they are
    # served (NOTE(review): smart_open semantics -- confirm against its docs)
    with smart_open.open(str(src), mode="rb", compression="disable") as remote:
        with dest.open(mode="wb") as local:
            shutil.copyfileobj(remote, local)
    return None
 def ensure_pathy(path):
    """Temporary helper that wraps `path` with Pathy.fluid. The pathy import
    happens inside the function to prevent importing Pathy globally (which
    can cause slow and annoying Google Cloud warning)."""
    import pathy
    return pathy.Pathy.fluid(path)
 def git_checkout(
    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
 ):
    """Copy a subpath of a Git repository branch into a new local directory.
    If sparse checkout is requested and Git is at least v2.22, the work is
    delegated to git_sparse_checkout. Otherwise the branch is cloned into a
    temporary directory and the subpath is copied to the destination.
    repo (str): The repository to clone.
    subpath (str): The path within the repository to copy.
    dest (Path): The destination. Must not exist yet, but its parent must.
    branch (str): The branch to clone.
    sparse (bool): Whether to try a sparse checkout.
    """
    git_version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
        msg.fail("Parent of destination of checkout must exist", exits=1)
    if sparse and git_version >= (2, 22):
        return git_sparse_checkout(repo, subpath, dest, branch)
    elif sparse:
        # Only show warnings if the user explicitly wants sparse checkout but
        # the Git version doesn't support it
        err_old = (
            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
            f"that doesn't fully support sparse checkout yet."
        )
        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
        # A version of (0, 0) selects the "unknown version" wording
        msg.warn(
            f"{err_unk if git_version == (0, 0) else err_old} "
            f"This means that more files than necessary may be downloaded "
            f"temporarily. To only download the files needed, make sure "
            f"you're using Git v2.22 or above."
        )
    # Fallback: clone the branch into a temp dir and copy the subpath out
    with make_tempdir() as tmp_dir:
        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
        run_command(cmd, capture=True)
        # We need Path(name) to make sure we also support subdirectories
        try:
            source_path = tmp_dir / Path(subpath)
            # Refuse subpaths that resolve to a location outside of the clone
            if not is_subpath_of(tmp_dir, source_path):
                err = f"'{subpath}' is a path outside of the cloned repository."
                msg.fail(err, repo, exits=1)
            shutil.copytree(str(source_path), str(dest))
        except FileNotFoundError:
            err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
            msg.fail(err, repo, exits=1)
 def git_sparse_checkout(repo, subpath, dest, branch):
    """Fetch only the objects needed for one subpath of a repository branch
    and move that subpath to the destination.
    repo: The repository to clone from (git_checkout passes a str).
    subpath: The path within the repository to fetch (git_checkout passes a str).
    dest: The destination the checked out subpath is moved to (git_checkout
        passes a Path).
    branch: The branch to clone and check out (git_checkout passes a str).
    """
    # We're using Git, partial clone and sparse checkout to
    # only clone the files we need
    # This ends up being RIDICULOUS. omg.
    # So, every tutorial and SO post talks about 'sparse checkout'...But they
    # go and *clone* the whole repo. Worthless. And cloning part of a repo
    # turns out to be completely broken. The only way to specify a "path" is..
    # a path *on the server*? The contents of which, specifies the paths. Wat.
    # Obviously this is hopelessly broken and insecure, because you can query
    # arbitrary paths on the server! So nobody enables this.
    # What we have to do is disable *all* files. We could then just checkout
    # the path, and it'd "work", but be hopelessly slow...Because it goes and
    # transfers every missing object one-by-one. So the final piece is that we
    # need to use some weird git internals to fetch the missings in bulk, and
    # *that* we can do by path.
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        # This is the "clone, but don't download anything" part.
        cmd = (
            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
            f"-b {branch} --filter=blob:none"
        )
        run_command(cmd)
        # Now we need to find the missing filenames for the subpath we want.
        # Looking for this 'rev-list' command in the git --help? Hah.
        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
        ret = run_command(cmd, capture=True)
        git_repo = _http_to_git(repo)
        # Now pass those missings into another bit of git internals
        # Entries prefixed with "?" in the output are the ones kept here,
        # with the prefix stripped
        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
        if not missings:
            err = (
                f"Could not find any relevant files for '{subpath}'. "
                f"Did you specify a correct and complete path within repo '{repo}' "
                f"and branch {branch}?"
            )
            msg.fail(err, exits=1)
        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
        run_command(cmd, capture=True)
        # And finally, we can checkout our subpath
        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
        run_command(cmd, capture=True)
        # Get a subdirectory of the cloned path, if appropriate
        source_path = tmp_dir / Path(subpath)
        # Refuse subpaths that resolve to a location outside of the clone
        if not is_subpath_of(tmp_dir, source_path):
            err = f"'{subpath}' is a path outside of the cloned repository."
            msg.fail(err, repo, exits=1)
        shutil.move(str(source_path), str(dest))
 def git_repo_branch_exists(repo: str, branch: str) -> bool:
    """Check via 'git ls-remote' whether a repository and branch exist.
    repo (str): URL to get repo.
    branch (str): Branch on repo to check.
    RETURNS (bool): True if repo:branch exists.
    """
    # Called only for its check that git can be run; the version is unused
    get_git_version()
    # `--exit-code` isn't used with `git ls-remote` because `run_command`
    # handles the `returncode` for us. Instead, we rely on stdout being ''
    # if the requested branch doesn't exist
    listing = run_command(f"git ls-remote {repo} {branch}", capture=True)
    return listing.stdout != ""
 def get_git_version(
    error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
 ) -> Tuple[int, int]:
    """Get the version of git and raise an error if calling 'git --version' fails.
    error (str): The error message to show.
    RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
        (0, 0) if the version couldn't be determined.
@ -224,6 +527,30 @@ def get_git_version(
    return int(version[0]), int(version[1])
 def _http_to_git(repo: str) -> str:
    """Rewrite an http(s) repository URL as "git@<host>:<path>.git".
    Only a leading scheme is rewritten, trailing slashes are dropped and the
    ".git" suffix is added only if it isn't there already. Values that don't
    start with "http://" or "https://" are returned unchanged.
    repo (str): The repository URL.
    RETURNS (str): The rewritten repository location.
    """
    for scheme in ("https://", "http://"):
        if repo.startswith(scheme):
            break
    else:
        # Not an http(s) URL, so there is nothing to convert
        return repo
    # The first slash after the scheme separates the host from the path
    host, slash, path = repo[len(scheme) :].rstrip("/").partition("/")
    repo = f"git@{host}:{path}" if slash else f"git@{host}"
    # Don't produce "name.git.git" for URLs that already carry the suffix
    if not repo.endswith(".git"):
        repo = f"{repo}.git"
    return repo
 def is_subpath_of(parent, child):
    """
    Tell whether `child` resolves to `parent` itself or to a path inside it.
    """
    # Based on https://stackoverflow.com/a/37095733 .
    # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
    # we can stop using crusty old os.path functions.
    # Both sides go through realpath before they are compared
    resolved_parent, resolved_child = (
        os.path.realpath(location) for location in (parent, child)
    )
    shared = os.path.commonpath([resolved_parent, resolved_child])
    return shared == resolved_parent
@overload
 def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
    ...
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@ -133,9 +133,7 @@ def apply(
    if len(text_files) > 0:
        streams.append(_stream_texts(text_files))
    datagen = cast(DocOrStrStream, chain(*streams))
-    for doc in tqdm.tqdm(
+    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
        nlp.pipe(datagen, batch_size=batch_size, n_process=n_process), disable=None
    ):
        docbin.add(doc)
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@ -40,8 +40,7 @@ def assemble_cli(
    DOCS: https://spacy.io/api/cli#assemble
    """
-    if verbose:
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        util.logger.setLevel(logging.DEBUG)
    # Make sure all files and paths exists if they are needed
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
--- a/spacy/cli/benchmark_speed.py
+++ b/spacy/cli/benchmark_speed.py
@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, setup_gpu
@benchmark_cli.command(
@ -30,14 +30,12 @@ def benchmark_speed_cli(
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    # fmt: on
 ):
    """
    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
    data in the binary .spacy format.
    """
    import_code(code_path)
    setup_gpu(use_gpu=use_gpu, silent=False)
    nlp = util.load_model(model)
@ -91,7 +89,7 @@ class Quartiles:
 def annotate(
    nlp: Language, docs: List[Doc], batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = nlp.pipe(tqdm(docs, unit="doc", disable=None), batch_size=batch_size)
+    docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
    wps = []
    while True:
        with time_context() as elapsed:
@ -173,5 +171,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = [doc.copy() for doc in docs * warmup_epochs]
+    docs = warmup_epochs * docs
    return annotate(nlp, docs, batch_size)
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@ -170,7 +170,7 @@ def debug_model(
        msg.divider(f"STEP 3 - prediction")
        msg.info(str(prediction))
-    msg.good(f"Successfully ended analysis - model looks good.")
+    msg.good(f"Succesfully ended analysis - model looks good.")
 def _sentences():
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -1,6 +1,5 @@
 import sys
 from typing import Optional, Sequence
 from urllib.parse import urljoin
 import requests
 import typer
@ -8,14 +7,7 @@ from wasabi import msg
 from .. import about
 from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import (
+from ..util import get_minor_version, is_package, is_prerelease_version, run_command
    get_minor_version,
    is_in_interactive,
    is_in_jupyter,
    is_package,
    is_prerelease_version,
    run_command,
 )
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
@ -64,13 +56,6 @@ def download(
        )
        pip_args = pip_args + ("--no-deps",)
    if direct:
        # Reject model names with '/', in order to prevent shenanigans.
        if "/" in model:
            msg.fail(
                title="Model download rejected",
                text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments",
                exits=True,
            )
        components = model.split("-")
        model_name = "".join(components[:-1])
        version = components[-1]
@ -92,27 +77,6 @@ def download(
        "Download and installation successful",
        f"You can now load the package via spacy.load('{model_name}')",
    )
    if is_in_jupyter():
        reload_deps_msg = (
            "If you are in a Jupyter or Colab notebook, you may need to "
            "restart Python in order to load all the package's dependencies. "
            "You can do this by selecting the 'Restart kernel' or 'Restart "
            "runtime' option."
        )
        msg.warn(
            "Restart to reload dependencies",
            reload_deps_msg,
        )
    elif is_in_interactive():
        reload_deps_msg = (
            "If you are in an interactive Python session, you may need to "
            "exit and restart Python to load all the package's dependencies. "
            "You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)."
        )
        msg.warn(
            "Restart to reload dependencies",
            reload_deps_msg,
        )
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
@ -161,16 +125,7 @@ def get_latest_version(model: str) -> str:
 def download_model(
    filename: str, user_pip_args: Optional[Sequence[str]] = None
 ) -> None:
-    # Construct the download URL carefully. We need to make sure we don't
+    download_url = about.__download_url__ + "/" + filename
    # allow relative paths or other shenanigans to trick us into download
    # from outside our own repo.
    base_url = about.__download_url__
    # urljoin requires that the path ends with /, or the last path part will be dropped
    if not base_url.endswith("/"):
        base_url = about.__download_url__ + "/"
    download_url = urljoin(base_url, filename)
    if not download_url.startswith(about.__download_url__):
        raise ValueError(f"Download from {filename} rejected. Was it a relative path?")
    pip_args = list(user_pip_args) if user_pip_args is not None else []
    cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
    run_command(cmd)
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -28,7 +28,6 @@ def evaluate_cli(
    displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
    displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
    per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
    spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"),
    # fmt: on
 ):
    """
@ -54,7 +53,6 @@ def evaluate_cli(
        displacy_limit=displacy_limit,
        per_component=per_component,
        silent=False,
        spans_key=spans_key,
    )
--- a/spacy/cli/find_function.py
+++ b/spacy/cli/find_function.py
@ -1,69 +0,0 @@
 from typing import Optional, Tuple
 from catalogue import RegistryError
 from wasabi import msg
 from ..util import registry
 from ._util import Arg, Opt, app
@app.command("find-function")
 def find_function_cli(
    # fmt: off
    func_name: str = Arg(..., help="Name of the registered function."),
    registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."),
    # fmt: on
 ):
    """
    Find the module, path and line number to the file the registered
    function is defined in, if available.
    func_name (str): Name of the registered function.
    registry_name (Optional[str]): Name of the catalogue registry.
    DOCS: https://spacy.io/api/cli#find-function
    """
    if not registry_name:
        # No registry was given: use the first registry that contains the
        # function, or keep the (empty) value if none of them does.
        registry_name = next(
            (
                candidate
                for candidate in registry.get_registry_names()
                if registry.has(candidate, func_name)
            ),
            registry_name,
        )
    if not registry_name:
        msg.fail(f"Couldn't find registered function: '{func_name}'", exits=1)
    assert registry_name is not None
    find_function(func_name, registry_name)
 def find_function(func_name: str, registry_name: str) -> Tuple[str, int]:
    """Look up a registered function and report where it is defined.
    func_name (str): Name of the registered function.
    registry_name (str): Name of the registry to look the function up in.
    RETURNS (Tuple[str, int]): The file path and the line number recorded for
        the function.
    """
    description = None
    try:
        description = registry.find(registry_name, func_name)
    except RegistryError as err:
        not_found = f"Couldn't find registered function: '{func_name}' in registry '{registry_name}'"
        msg.fail(not_found)
        msg.fail(f"{err}", exits=1)
    assert description is not None
    # The line number is only read when a file was recorded for the function.
    source_file, source_line = None, None
    recorded_file = description["file"]
    if recorded_file:
        source_file, source_line = recorded_file, description["line_no"]
    if not (source_file and source_line):
        msg.fail(
            f"Couldn't find path to registered function: '{func_name}' in registry '{registry_name}'",
            exits=1,
        )
    assert source_file is not None
    assert source_line is not None
    msg.good(f"Found registered function '{func_name}' at {source_file}:{source_line}")
    return str(source_file), int(source_line)
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@ -39,7 +39,7 @@ def find_threshold_cli(
    # fmt: on
 ):
    """
-    Runs prediction trials for a trained model with varying thresholds to maximize
+    Runs prediction trials for a trained model with varying tresholds to maximize
    the specified metric. The search space for the threshold is traversed linearly
    from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
    (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
@ -52,8 +52,8 @@ def find_threshold_cli(
    DOCS: https://spacy.io/api/cli#find-threshold
    """
-    if verbose:
+
-        util.logger.setLevel(logging.DEBUG)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    import_code(code_path)
    find_threshold(
        model=model,
@ -81,7 +81,7 @@ def find_threshold(
    silent: bool = True,
 ) -> Tuple[float, float, Dict[float, float]]:
    """
-    Runs prediction trials for models with varying thresholds to maximize the specified metric.
+    Runs prediction trials for models with varying tresholds to maximize the specified metric.
    model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
    data_path (Path): Path to file with DocBin with docs to use for threshold search.
    pipe_name (str): Name of pipe to examine thresholds for.
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@ -39,8 +39,7 @@ def init_vectors_cli(
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
-    if verbose:
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        util.logger.setLevel(logging.DEBUG)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    if jsonl_loc is not None:
@ -88,8 +87,7 @@ def init_pipeline_cli(
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
 ):
-    if verbose:
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        util.logger.setLevel(logging.DEBUG)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
@ -118,8 +116,7 @@ def init_labels_cli(
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
-    if verbose:
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        util.logger.setLevel(logging.DEBUG)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    overrides = parse_config_overrides(ctx.args)
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -1,7 +1,5 @@
 import os
 import re
 import shutil
 import subprocess
 import sys
 from collections import defaultdict
 from pathlib import Path
@ -13,7 +11,6 @@ from thinc.api import Config
 from wasabi import MarkdownRenderer, Printer, get_raw_input
 from .. import about, util
 from ..compat import importlib_metadata
 from ..schemas import ModelMetaSchema, validate
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@ -30,7 +27,6 @@ def package_cli(
    version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
    build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
    force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
    require_parent: bool = Opt(True, "--require-parent/--no-require-parent", "-R", "-R", help="Include the parent package (e.g. spacy) in the requirements"),
    # fmt: on
 ):
    """
@ -39,7 +35,7 @@ def package_cli(
    specified output directory, and the data will be copied over. If
    --create-meta is set and a meta.json already exists in the output directory,
    the existing values will be used as the defaults in the command-line prompt.
-    After packaging, "python -m build --sdist" is run in the package directory,
+    After packaging, "python setup.py sdist" is run in the package directory,
    which will create a .tar.gz archive that can be installed via "pip install".
    If additional code files are provided (e.g. Python files containing custom
@ -61,7 +57,6 @@ def package_cli(
        create_sdist=create_sdist,
        create_wheel=create_wheel,
        force=force,
        require_parent=require_parent,
        silent=False,
    )
@ -76,7 +71,6 @@ def package(
    create_meta: bool = False,
    create_sdist: bool = True,
    create_wheel: bool = False,
    require_parent: bool = False,
    force: bool = False,
    silent: bool = True,
 ) -> None:
@ -84,17 +78,9 @@ def package(
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
-    if create_wheel and not has_wheel() and not has_build():
+    if create_wheel and not has_wheel():
-        err = (
+        err = "Generating a binary .whl file requires wheel to be installed"
-            "Generating wheels requires 'build' or 'wheel' (deprecated) to be installed"
+        msg.fail(err, "pip install wheel", exits=1)
        )
        msg.fail(err, "pip install build", exits=1)
    if not has_build():
        msg.warn(
            "Generating packages without the 'build' package is deprecated and "
            "will not be supported in the future. To install 'build': pip "
            "install build"
        )
    if not input_path or not input_path.exists():
        msg.fail("Can't locate pipeline data", input_path, exits=1)
    if not output_path or not output_path.exists():
@ -116,7 +102,7 @@ def package(
    if not meta_path.exists() or not meta_path.is_file():
        msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
-    meta = get_meta(input_dir, meta, require_parent=require_parent)
+    meta = get_meta(input_dir, meta)
    if meta["requirements"]:
        msg.good(
            f"Including {len(meta['requirements'])} package requirement(s) from "
@ -189,7 +175,6 @@ def package(
        imports.append(code_path.stem)
        shutil.copy(str(code_path), str(package_path))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    init_py = TEMPLATE_INIT.format(
@ -199,37 +184,12 @@ def package(
    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            # run directly, since util.run_command is not designed to continue
            # after a command fails
            ret = subprocess.run(
                [sys.executable, "-m", "build", ".", "--sdist"],
                env=os.environ.copy(),
            )
            if ret.returncode != 0:
                msg.warn(
                    "Creating sdist with 'python -m build' failed. Falling "
                    "back to deprecated use of 'python setup.py sdist'"
                )
            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
        msg.good(f"Successfully created zipped Python package", zip_file)
    if create_wheel:
        with util.working_dir(main_path):
-            # run directly, since util.run_command is not designed to continue
+            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
            # after a command fails
            ret = subprocess.run(
                [sys.executable, "-m", "build", ".", "--wheel"],
                env=os.environ.copy(),
            )
            if ret.returncode != 0:
                msg.warn(
                    "Creating wheel with 'python -m build' failed. Falling "
                    "back to deprecated use of 'wheel' with "
                    "'python setup.py bdist_wheel'"
                )
                util.run_command(
                    [sys.executable, "setup.py", "bdist_wheel"], capture=False
                )
        wheel_name_squashed = re.sub("_+", "_", model_name_v)
        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
        msg.good(f"Successfully created binary wheel", wheel)
@ -249,17 +209,6 @@ def has_wheel() -> bool:
        return False
 def has_build() -> bool:
    """Report whether the 'build' package is installed.
    RETURNS (bool): True if package metadata for 'build' can be found.
    """
    # A plain import check would be fooled by a local build/ directory (very
    # common, especially in an editable install), so ask for the installed
    # package version instead.
    try:
        importlib_metadata.version("build")
    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
        return False
    return True
 def get_third_party_dependencies(
    config: Config, exclude: List[str] = util.SimpleFrozenList()
 ) -> List[str]:
@ -306,8 +255,6 @@ def get_third_party_dependencies(
                modules.add(func_info["module"].split(".")[0])  # type: ignore[union-attr]
    dependencies = []
    for module_name in modules:
        if module_name == about.__title__:
            continue
        if module_name in distributions:
            dist = distributions.get(module_name)
            if dist:
@ -338,9 +285,7 @@ def create_file(file_path: Path, contents: str) -> None:
 def get_meta(
-    model_path: Union[str, Path],
+    model_path: Union[str, Path], existing_meta: Dict[str, Any]
    existing_meta: Dict[str, Any],
    require_parent: bool = False,
 ) -> Dict[str, Any]:
    meta: Dict[str, Any] = {
        "lang": "en",
@ -369,8 +314,6 @@ def get_meta(
    existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
    reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
    meta["requirements"].extend(reqs)
    if require_parent and about.__title__ not in meta["requirements"]:
        meta["requirements"].append(about.__title__ + meta["spacy_version"])
    return meta
@ -460,7 +403,7 @@ def _format_sources(data: Any) -> str:
        if author:
            result += " ({})".format(author)
        sources.append(result)
-    return "<br>".join(sources)
+    return "<br />".join(sources)
 def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str:
@ -545,11 +488,8 @@ def list_files(data_dir):
 def list_requirements(meta):
-    # Up to version 3.7, we included the parent package
+    parent_package = meta.get('parent_package', 'spacy')
-    # in requirements by default. This behaviour is removed
+    requirements = [parent_package + meta['spacy_version']]
    # in 3.8, with a setting to include the parent package in
    # the requirements list in the meta if desired.
    requirements = []
    if 'setup_requires' in meta:
        requirements += meta['setup_requires']
    if 'requirements' in meta:
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@ -71,7 +71,7 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
 def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
-    for doc in nlp.pipe(tqdm.tqdm(texts, disable=None), batch_size=16):
+    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
        pass
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@ -1 +1,217 @@
-from weasel.cli.assets import *
+import os
 import re
 import shutil
 from pathlib import Path
 from typing import Any, Dict, Optional
 import requests
 import typer
 from wasabi import msg
 from ...util import ensure_path, working_dir
 from .._util import (
    PROJECT_FILE,
    Arg,
    Opt,
    SimpleFrozenDict,
    download_file,
    get_checksum,
    get_git_version,
    git_checkout,
    load_project_config,
    parse_config_overrides,
    project_cli,
 )
 # Whether assets are extra if `extra` is not set.
 EXTRA_DEFAULT = False
@project_cli.command(
    "assets",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
 def project_assets_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
    # fmt: on
 ):
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
    provided in the project.yml, the file is only downloaded if no local file
    with the same checksum exists.
    DOCS: https://spacy.io/api/cli#project-assets
    """
    # Any additional command-line arguments are parsed into config overrides
    # and handed to project_assets, which does the actual work.
    overrides = parse_config_overrides(ctx.args)
    project_assets(
        project_dir,
        overrides=overrides,
        sparse_checkout=sparse_checkout,
        extra=extra,
    )
 def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
    extra: bool = False,
 ) -> None:
    """Fetch the assets listed in the "assets" section of a project's config.
    Assets with a "git" entry are checked out from their repository. All other
    assets are fetched from their URL, or checked as user-provided (private)
    assets if they have no URL.
    project_dir (Path): Path to project directory.
    overrides (Dict[str, Any]): Overrides passed on when loading the project config.
    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
                            needed.
    extra (bool): Whether to download all assets, including those marked as 'extra'.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)
    assets = [
        asset
        for asset in config.get("assets", [])
        if extra or not asset.get("extra", EXTRA_DEFAULT)
    ]
    if not assets:
        msg.warn(
            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
            exits=0,
        )
    msg.info(f"Fetching {len(assets)} asset(s)")
    for asset in assets:
        # Build the destination from the normalized project path rather than
        # the raw argument, so the value returned by ensure_path is the one
        # used everywhere in this function.
        dest = (project_path / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            git_err = (
                f"Cloning spaCy project templates requires Git and the 'git' command. "
                f"Make sure it's installed and that the executable is available."
            )
            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):
                    msg.good(
                        f"Skipping download with matching checksum: {asset['dest']}"
                    )
                    continue
                else:
                    # Stale or unverified copy: remove it so the checkout
                    # below starts from a clean destination.
                    if dest.is_dir():
                        shutil.rmtree(dest)
                    else:
                        dest.unlink()
            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
                msg.fail(
                    "A git asset must include 'repo', the repository address.", exits=1
                )
            if "path" not in asset["git"] or asset["git"]["path"] is None:
                msg.fail(
                    "A git asset must include 'path' - use \"\" to get the entire repository.",
                    exits=1,
                )
            git_checkout(
                asset["git"]["repo"],
                asset["git"]["path"],
                dest,
                branch=asset["git"].get("branch"),
                sparse=sparse_checkout,
            )
            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
 def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Report on an asset that has no URL and must be supplied by the user.
    Warns if the file is missing. Otherwise reports whether it exists, and,
    if a checksum is given, whether the checksum matches.
    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        msg.warn(
            f"No URL provided for asset. You need to add this file yourself: {dest}"
        )
        return
    if not checksum:
        # Nothing to verify against, the file's presence is all we can report.
        msg.good(f"Asset already exists: {dest}")
        return
    if checksum == get_checksum(dest):
        msg.good(f"Asset exists with matching checksum: {dest}")
    else:
        msg.fail(f"Asset available but with incorrect checksum: {dest}")
 def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
 ) -> None:
    """Download or copy a single asset into the project.
    If the destination already exists and matches the expected checksum,
    nothing is fetched. An existing 0-byte file without an expected checksum
    is deleted before fetching.
    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, joined onto project_path.
    checksum (Optional[str]): Optional expected checksum of local file.
    """
    target = (project_path / dest).resolve()
    if target.exists():
        if not checksum:
            # Without a checksum, at least make sure the file isn't empty
            if os.path.getsize(target) == 0:
                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
                os.remove(target)
        elif checksum == get_checksum(target):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return
    # Be helpful and create the parent directories, in case the asset dir
    # isn't listed as a dir to create in the project.yml
    parent_dir = target.parent
    if not parent_dir.exists():
        parent_dir.mkdir(parents=True)
    with working_dir(project_path):
        url = convert_asset_url(url)
        try:
            download_file(url, target)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as e:
            # The "URL" may really be a path to a local file: copy it instead
            local_source = Path(url)
            if local_source.exists() and local_source.is_file():
                shutil.copy(url, str(target))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
    if checksum and checksum != get_checksum(target):
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
 def convert_asset_url(url: str) -> str:
    """Turn a regular GitHub page URL into the matching raw-content URL.
    URLs that are not GitHub URLs, or that already point to a release
    download or a raw file, are returned unchanged.
    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    is_github_url = re.match(r"(http(s?)):\/\/github.com", url) is not None
    if not is_github_url or "releases/download" in url or "/raw/" in url:
        return url
    # A regular GitHub URL is likely a mistake: it serves the HTML page
    raw_url = re.sub(
        r"/(tree|blob)/", "/", url.replace("github.com", "raw.githubusercontent.com")
    )
    msg.warn(
        "Downloading from a regular GitHub URL. This will only download "
        "the source of the page, not the actual file. Converting the URL "
        "to a raw URL.",
        raw_url,
    )
    return raw_url
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@ -1 +1,124 @@
-from weasel.cli.clone import *
+import re
 import subprocess
 from pathlib import Path
 from typing import Optional
 from wasabi import msg
 from ... import about
 from ...util import ensure_path
 from .._util import (
    COMMAND,
    PROJECT_FILE,
    Arg,
    Opt,
    get_git_version,
    git_checkout,
    git_repo_branch_exists,
    project_cli,
 )
 # Repository used as the default value of the --repo option.
 DEFAULT_REPO = about.__projects__
 # Branch used for the default repository when no branch is given.
 DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
 # Branch names tried in order for any other repository when no branch is given.
 DEFAULT_BRANCHES = ["main", "master"]
@project_cli.command("clone")
 def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
    # fmt: on
 ):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
    (including using a private repo).
    DOCS: https://spacy.io/api/cli#project-clone
    """
    if dest is None:
        # Clone into a directory named after the last part of the template name
        dest = Path.cwd() / Path(name).parts[-1]
    if branch is None and repo == DEFAULT_REPO:
        branch = DEFAULT_PROJECTS_BRANCH
    if branch is not None:
        # An explicit (or default-repo) branch has to exist in the repo
        if not git_repo_branch_exists(repo, branch):
            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
    else:
        # No branch known: take the first of the usual default branches
        branch = next(
            (
                candidate
                for candidate in DEFAULT_BRANCHES
                if git_repo_branch_exists(repo, candidate)
            ),
            None,
        )
        if branch is None:
            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
            msg.fail(
                "No branch provided and attempted default "
                f"branches {default_branches_msg} do not exist.",
                exits=1,
            )
    assert isinstance(branch, str)
    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
 def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    branch: str = about.__projects_branch__,
    sparse_checkout: bool = False,
 ) -> None:
    """Check out one subdirectory of a template repository as a new project.
    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    branch (str): The branch to clone from
    sparse_checkout (bool): Passed on to the Git checkout as its 'sparse' setting.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # Shorter repo name for messages: drop the GitHub host prefix, if any
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    try:
        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
    except subprocess.CalledProcessError:
        msg.fail(
            f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')",
            exits=1,
        )
    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
    if (project_dir / PROJECT_FILE).exists():
        msg.good(f"Your project is now ready!")
        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
    else:
        msg.warn(f"No {PROJECT_FILE} found in directory")
 def check_clone(name: str, dest: Path, repo: str) -> None:
    """Validate the preconditions for cloning a project: Git has to be
    available, and the destination must not exist yet while its parent
    directory must.
    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    missing_git = (
        f"Cloning spaCy project templates requires Git and the 'git' command. "
        f"To clone a project without Git, copy the files from the '{name}' "
        f"directory in the {repo} to {dest} manually."
    )
    get_git_version(error=missing_git)
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # The clone has to create the directory itself
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    parent_dir = dest.parent
    if not parent_dir.exists():
        # Parents are never created implicitly
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {parent_dir}. "
            f"Create the necessary folder(s) first before continuing.",
            exits=1,
        )
--- a/spacy/cli/project/document.py
+++ b/spacy/cli/project/document.py
@ -1 +1,115 @@
-from weasel.cli.document import *
+from pathlib import Path
 from wasabi import MarkdownRenderer, msg
 from ...util import working_dir
 from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
 DOCS_URL = "https://spacy.io"
 INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
 project, as well as the available commands and workflows. For details, see the
 [spaCy projects documentation]({DOCS_URL}/usage/projects)."""
 INTRO_COMMANDS = f"""The following commands are defined by the project. They
 can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
 Commands are only re-run if their inputs have changed."""
 INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
 can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
 and will run the specified commands in order. Commands are only re-run if their
 inputs have changed."""
 INTRO_ASSETS = f"""The following assets are defined by the project. They can
 be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
 in the project directory."""
 # These markers are added to the Markdown and can be used to update the file in
 # place if it already exists. Only the auto-generated part will be replaced.
 MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
 MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
 # If this marker is used in an existing README, it's ignored and not replaced
 MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
@project_cli.command("document")
 def project_document_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
    # fmt: on
 ):
    """
    Auto-generate a README.md for a project. If the content is saved to a file,
    hidden markers are added so you can add custom content before or after the
    auto-generated section and only the auto-generated docs will be replaced
    when you re-run the command.
    DOCS: https://spacy.io/api/cli#project-document
    """
    # NOTE(review): the docstring above presumably doubles as the CLI help
    # text -- confirm before rewording it.
    # Thin CLI entry point: forwards the parsed arguments unchanged to
    # project_document(), which does the actual rendering and writing.
    project_document(project_dir, output_file, no_emoji=no_emoji)
 def project_document(
    project_dir: Path, output_file: Path, *, no_emoji: bool = False
 ) -> None:
    """Render Markdown documentation for a project and print or save it.
    The generated text is wrapped in MARKER_START / MARKER_END. If the output
    file already exists and contains both markers (end after start), only the
    region between them is replaced and all surrounding text is kept. If the
    file contains MARKER_IGNORE it is left untouched.
    project_dir (Path): The project directory containing the project file.
    output_file (Path): Path to write the Markdown to, or "-" for stdout.
    no_emoji (bool): Passed on to the MarkdownRenderer.
    """
    is_stdout = str(output_file) == "-"
    config = load_project_config(project_dir)
    md = MarkdownRenderer(no_emoji=no_emoji)
    md.add(MARKER_START)
    title = config.get("title")
    description = config.get("description")
    md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
    if description:
        md.add(description)
    md.add(md.title(2, PROJECT_FILE, "📋"))
    md.add(INTRO_PROJECT)
    # Commands
    cmds = config.get("commands", [])
    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
    if data:
        md.add(md.title(3, "Commands", "⏯"))
        md.add(INTRO_COMMANDS)
        md.add(md.table(data, ["Command", "Description"]))
    # Workflows
    wfs = config.get("workflows", {}).items()
    data = [(md.code(n), " &rarr; ".join(md.code(w) for w in stp)) for n, stp in wfs]
    if data:
        md.add(md.title(3, "Workflows", "⏭"))
        md.add(INTRO_WORKFLOWS)
        md.add(md.table(data, ["Workflow", "Steps"]))
    # Assets
    assets = config.get("assets", [])
    data = []
    for a in assets:
        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
        dest_path = a["dest"]
        dest = md.code(dest_path)
        if source == "Local":
            # Only link assets if they're in the repo
            with working_dir(project_dir) as p:
                if (p / dest_path).exists():
                    dest = md.link(dest, dest_path)
        data.append((dest, source, a.get("description", "")))
    if data:
        md.add(md.title(3, "Assets", "🗂"))
        md.add(INTRO_ASSETS)
        md.add(md.table(data, ["File", "Source", "Description"]))
    md.add(MARKER_END)
    # Output result
    if is_stdout:
        print(md.text)
    else:
        content = md.text
        if output_file.exists():
            with output_file.open("r", encoding="utf8") as f:
                existing = f.read()
            if MARKER_IGNORE in existing:
                msg.warn("Found ignore marker in existing file: skipping", output_file)
                return
            # Locate the auto-generated region. The end marker is only searched
            # for *after* the start marker, so a stray or repeated marker can
            # neither duplicate nor silently drop the user's own text.
            start = existing.find(MARKER_START)
            end = -1
            if start != -1:
                end = existing.find(MARKER_END, start + len(MARKER_START))
            if start != -1 and end != -1:
                msg.info("Found existing file: only replacing auto-generated docs")
                before = existing[:start]
                after = existing[end + len(MARKER_END) :]
                content = f"{before}{content}{after}"
            else:
                msg.warn("Replacing existing file")
        with output_file.open("w", encoding="utf8") as f:
            f.write(content)
        msg.good("Saved project documentation", output_file)
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@ -1 +1,220 @@
-from weasel.cli.dvc import *
+"""This module contains helpers and subcommands for integrating spaCy projects
 with Data Version Control (DVC). https://dvc.org"""
 import subprocess
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional
 from wasabi import msg
 from ...util import (
    SimpleFrozenList,
    join_command,
    run_command,
    split_command,
    working_dir,
 )
 from .._util import (
    COMMAND,
    NAME,
    PROJECT_FILE,
    Arg,
    Opt,
    get_hash,
    load_project_config,
    project_cli,
 )
 # File name of the DVC pipeline config inside the project directory; this is
 # the file update_dvc_config() checks, deletes and regenerates.
 DVC_CONFIG = "dvc.yaml"
 # Directory whose presence marks an initialized DVC project.
 DVC_DIR = ".dvc"
 # Name under which the sub-command is registered on project_cli.
 UPDATE_COMMAND = "dvc"
 # Header comment written into the generated dvc.yaml, right after the line
 # holding the config hash.
 DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
 # edited your {PROJECT_FILE}, you can regenerate this file by running:
 # {COMMAND} project {UPDATE_COMMAND}"""
@project_cli.command(UPDATE_COMMAND)
 def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
 ):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml
    changed.
    DOCS: https://spacy.io/api/cli#project-dvc
    """
    # Thin CLI entry point: all work happens in project_update_dvc(); the
    # parsed arguments are forwarded unchanged.
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
 def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
 ) -> None:
    """Regenerate the DVC config for a project and report the outcome.
    Loads the project config, delegates to update_dvc_config() and prints
    whether the DVC config was rewritten or left as it was.
    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    quiet (bool): Print less info.
    force (bool): Force update DVC config.
    """
    project_config = load_project_config(project_dir)
    was_updated = update_dvc_config(
        project_dir,
        project_config,
        workflow,
        verbose=verbose,
        quiet=quiet,
        force=force,
    )
    # The same usage hint is shown in both outcomes.
    hint = "To execute the workflow with DVC, run: dvc repro"
    if not was_updated:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", hint)
        return
    msg.good(f"Updated DVC config from {PROJECT_FILE}", hint)
 def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
 ) -> bool:
    """Regenerate the project's dvc.yaml by re-running "dvc run --no-exec"
    for every command of one workflow. The first line of the generated file
    stores the hash of the config dict; when that hash still matches and
    force is not set, nothing is done.
    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    workflow (Optional[str]): Name of the workflow to convert. Falls back to
        the first workflow in the config if not set.
    verbose (bool): Whether to print additional info (via DVC).
    quiet (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    available = list(workflows.keys())
    check_workflows(available, workflow)
    if not workflow:
        workflow = available[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # The first line of a generated file is "# <hash>": compare it with
        # the hash of the current config to decide whether to regenerate.
        with dvc_config_path.open("r", encoding="utf8") as f:
            first_line = f.readline()
        ref_hash = first_line.strip().replace("# ", "")
        if not force and ref_hash == config_hash:
            return False  # Nothing has changed in project.yml, don't need to update
        dvc_config_path.unlink()
    commands_by_name = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    # Flags that apply to every generated DVC command
    global_flags = []
    if verbose:
        global_flags.append("--verbose")
    if quiet:
        global_flags.append("--quiet")
    dvc_commands = []
    for name in workflows[workflow]:
        command = commands_by_name[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not (deps or outputs or outputs_no_cache):
            # Commands without deps and outputs are left out of the pipeline
            continue
        # Default to the working dir as the project path since dvc.yaml is
        # auto-generated and we don't want arbitrary paths in there
        full_cmd = ["run", *global_flags, "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            full_cmd.append("--always-changed")
        # One "<flag> <path>" pair per dependency / output / uncached output
        for flag, targets in (("-d", deps), ("-o", outputs), ("-O", outputs_no_cache)):
            for target in targets:
                full_cmd.extend((flag, target))
        full_cmd.extend(["python", "-m", NAME, "project", "run", name])
        dvc_commands.append(join_command(full_cmd))
    if not dvc_commands:
        # If we don't check for this, then there will be an error when reading the
        # config, since DVC wouldn't create it.
        msg.fail(
            "No usable commands for DVC found. This can happen if none of your "
            "commands have dependencies or outputs.",
            exits=1,
        )
    with working_dir(path):
        for dvc_args in dvc_commands:
            run_command("dvc " + dvc_args)
    # Prepend the hash line and the explanatory header to the generated file
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True
 def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Check that a DVC config can be generated for the requested workflow.
    Fails (with exit code 1) if no workflows exist at all or if the named
    workflow is unknown, and warns when no workflow was named so that the
    first one will be used.
    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        no_workflows = (
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands."
        )
        msg.fail(no_workflows, exits=1)
    if workflow is not None and workflow not in workflows:
        unknown_workflow = (
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}"
        )
        msg.fail(unknown_workflow, exits=1)
    if not workflow:
        fallback_notice = (
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )
        msg.warn(fallback_notice)
 def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project. Each failed check prints an
    error with installation / initialization hints and exits with code 1.
    project_dir (Path): The project directory expected to contain DVC_DIR.
    """
    try:
        # Only checks that the executable can be launched; its output is
        # discarded and its return code is not inspected.
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )
    # Use the module-level constant instead of repeating the literal ".dvc"
    if not (project_dir / DVC_DIR).exists():
        msg.fail(
            "Project not initialized as a DVC project",
            "To initialize a DVC project, you can run 'dvc init' in the project "
            "directory. For more details, see the documentation: "
            "https://dvc.org/doc/command-reference/init",
            exits=1,
        )
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@ -1 +1,67 @@
-from weasel.cli.pull import *
+from pathlib import Path
 from wasabi import msg
 from .._util import Arg, load_project_config, logger, project_cli
 from .remote_storage import RemoteStorage, get_command_hash
 from .run import update_lockfile
@project_cli.command("pull")
 def project_pull_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
 ):
    """Retrieve available precomputed outputs from a remote storage.
    You can alias remotes in your project.yml by mapping them to storage paths.
    A storage can be anything that the smart-open library can upload to, e.g.
    AWS, Google Cloud Storage, SSH, local directories etc.
    DOCS: https://spacy.io/api/cli#project-pull
    """
    # project_pull() is a generator: iterating it performs the downloads.
    for url, output_path in project_pull(project_dir, remote):
        # A url of None means nothing was pulled for this output, so only
        # actual downloads are reported.
        if url is not None:
            msg.good(f"Pulled {output_path} from {url}")
 def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
    """Pull the outputs of the project's commands from a remote storage.
    This is a generator; nothing is pulled until it is iterated.
    project_dir (Path): The project directory.
    remote (str): Name of a remote defined under "remotes" in the project
        config, or the storage location itself.
    verbose (bool): Accepted but not used in this function body.
    YIELDS (tuple): (url, output_path) for every output of every processed
        command, where url is whatever storage.pull() returned (may be None).
    """
    # TODO: We don't have tests for this :(. It would take a bit of mockery to
    # set up. I guess see if it breaks first?
    config = load_project_config(project_dir)
    # Resolve a remote alias to its configured storage location, if defined
    if remote in config.get("remotes", {}):
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    commands = list(config.get("commands", []))
    # We use a while loop here because we don't know how the commands
    # will be ordered. A command might need dependencies from one that's later
    # in the list.
    while commands:
        # Iterate over a copy, since an entry is popped from `commands` below
        for i, cmd in enumerate(list(commands)):
            logger.debug("CMD: %s.", cmd["name"])
            deps = [project_dir / dep for dep in cmd.get("deps", [])]
            if all(dep.exists() for dep in deps):
                # Site and env hashes are passed as empty strings, the same
                # way project_push() computes the hash when uploading.
                cmd_hash = get_command_hash("", "", deps, cmd["script"])
                for output_path in cmd.get("outputs", []):
                    url = storage.pull(output_path, command_hash=cmd_hash)
                    logger.debug(
                        "URL: %s for %s with command hash %s",
                        url,
                        output_path,
                        cmd_hash,
                    )
                    yield url, output_path
                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
                # Only update the lockfile if every output is now present
                if all(loc.exists() for loc in out_locs):
                    update_lockfile(project_dir, cmd)
                # We remove the command from the list here, and break, so that
                # we iterate over the loop again.
                commands.pop(i)
                break
            else:
                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
        else:
            # If we didn't break the for loop, break the while loop.
            break
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@ -1 +1,69 @@
-from weasel.cli.push import *
+from pathlib import Path
 from wasabi import msg
 from .._util import Arg, load_project_config, logger, project_cli
 from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
@project_cli.command("push")
 def project_push_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
 ):
    """Persist outputs to a remote storage. You can alias remotes in your
    project.yml by mapping them to storage paths. A storage can be anything that
    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
    local directories etc.
    DOCS: https://spacy.io/api/cli#project-push
    """
    # project_push() is a generator: iterating it performs the uploads and
    # yields one (output_path, url) pair per handled output.
    for output_path, url in project_push(project_dir, remote):
        if url is None:
            msg.info(f"Skipping {output_path}")
        else:
            msg.good(f"Pushed {output_path} to {url}")
 def project_push(project_dir: Path, remote: str):
    """Upload the existing, non-empty outputs of all commands to a remote
    storage. Remotes can be aliased in the project.yml by mapping names to
    storage paths. Commands with a missing dependency are skipped.
    project_dir (Path): The project directory.
    remote (str): Name of a configured remote, or the storage path itself.
    YIELDS (tuple): (output_path, url) for every output that was pushed.
    """
    config = load_project_config(project_dir)
    remotes = config.get("remotes", {})
    if remote in remotes:
        remote = remotes[remote]
    storage = RemoteStorage(project_dir, remote)
    for cmd in config.get("commands", []):
        logger.debug("CMD: %s", cmd["name"])
        deps = [project_dir / dep for dep in cmd.get("deps", [])]
        if not all(dep.exists() for dep in deps):
            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
            continue
        # Site and env hashes are intentionally passed as empty strings
        cmd_hash = get_command_hash("", "", deps, cmd["script"])
        logger.debug("CMD_HASH: %s", cmd_hash)
        for output_path in cmd.get("outputs", []):
            output_loc = project_dir / output_path
            if not (output_loc.exists() and _is_not_empty_dir(output_loc)):
                # Nothing to upload for missing outputs or empty directories
                continue
            url = storage.push(
                output_path,
                command_hash=cmd_hash,
                content_hash=get_content_hash(output_loc),
            )
            logger.debug(
                "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
            )
            yield output_path, url
 def _is_not_empty_dir(loc: Path):
    if not loc.is_dir():
        return True
    elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
        return True
    else:
        return False
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@ -1 +1,212 @@
-from weasel.cli.remote_storage import *
+import hashlib
 import os
 import site
 import tarfile
 import urllib.parse
 from pathlib import Path
 from typing import TYPE_CHECKING, Dict, List, Optional
 from wasabi import msg
 from ... import about
 from ...errors import Errors
 from ...git_info import GIT_VERSION
 from ...util import ENV_VARS, check_bool_env_var, get_minor_version
 from .._util import (
    download_file,
    ensure_pathy,
    get_checksum,
    get_hash,
    make_tempdir,
    upload_file,
 )
 if TYPE_CHECKING:
    from pathy import FluidPath  # noqa: F401
 class RemoteStorage:
    """Push and pull outputs to and from a remote file storage.
    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
    ssh, etc.
    """
    def __init__(self, project_root: Path, url: str, *, compression="gz"):
        """Create the storage wrapper.
        project_root (Path): Local project directory that paths are relative to.
        url (str): Location of the remote storage, converted via ensure_pathy.
        compression (str): Compression suffix for the tarfile mode (e.g. "gz"
            gives "w:gz" / "r:gz"). A falsy value means no compression.
        """
        self.root = project_root
        self.url = ensure_pathy(url)
        self.compression = compression
    def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
        """Compress a file or directory within a project and upload it to a remote
        storage. If an object exists at the full URL, nothing is done.
        Within the remote storage, files are addressed by their project path
        (url encoded) and two user-supplied hashes, representing their creation
        context and their file contents. If the URL already exists, the data is
        not uploaded. Paths are archived and compressed prior to upload.
        path (Path): Path of the file or directory, relative to the project root.
        command_hash (str): Hash representing the creation context.
        content_hash (str): Hash representing the file contents.
        RETURNS (FluidPath): The remote URL of the (new or existing) archive.
        RAISES (IOError): If the local path does not exist.
        """
        loc = self.root / path
        if not loc.exists():
            raise IOError(f"Cannot push {loc}: does not exist.")
        url = self.make_url(path, command_hash, content_hash)
        if url.exists():
            return url
        tmp: Path
        with make_tempdir() as tmp:
            tar_loc = tmp / self.encode_name(str(path))
            mode_string = f"w:{self.compression}" if self.compression else "w"
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                # Store the member under its project-relative name so that
                # pull() can extract it relative to the project root.
                tar_file.add(str(loc), arcname=str(path))
            upload_file(tar_loc, url)
        return url
    def pull(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["FluidPath"]:
        """Retrieve a file from the remote cache. If the file already exists,
        nothing is done.
        If the command_hash and/or content_hash are specified, only matching
        results are considered.
        path (Path): Path of the output, relative to the project root.
        command_hash (Optional[str]): Creation hash to match, if any.
        content_hash (Optional[str]): Content hash to match, if any.
        RETURNS (Optional[FluidPath]): The URL that was pulled, or None if the
            destination already exists or no matching remote file was found.
        RAISES (ValueError): If the archive contains a member that would be
            extracted outside of the project root.
        """
        dest = self.root / path
        if dest.exists():
            return None
        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
        if url is None:
            return url
        else:
            # Make sure the destination exists
            if not dest.parent.exists():
                dest.parent.mkdir(parents=True)
            tmp: Path
            with make_tempdir() as tmp:
                tar_loc = tmp / url.parts[-1]
                download_file(url, tar_loc)
                mode_string = f"r:{self.compression}" if self.compression else "r"
                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                    # This requires that the path is added correctly, relative
                    # to root. This is how we set things up in push()
                    self._safe_extract(tar_file, self.root)
        return url
    @staticmethod
    def _is_within_directory(directory, target) -> bool:
        """Check whether target is directory itself or located below it.
        The comparison is done on whole path components (os.path.commonpath).
        A plain string-prefix test would wrongly accept a sibling such as
        "/proj-evil" for the directory "/proj".
        """
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        try:
            common = os.path.commonpath([abs_directory, abs_target])
        except ValueError:
            # Raised when the paths cannot be compared (e.g. different
            # drives on Windows): the target is certainly not inside.
            return False
        return common == abs_directory
    @staticmethod
    def _safe_extract(tar, path) -> None:
        """Extract all members of an open tar file into path, refusing
        archives with members that would end up outside of it
        (CVE-2007-4559, directory traversal vulnerability).
        """
        for member in tar.getmembers():
            member_path = os.path.join(path, member.name)
            if not RemoteStorage._is_within_directory(path, member_path):
                raise ValueError(Errors.E852)
        tar.extractall(path)
    def find(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["FluidPath"]:
        """Find the best matching version of a file within the storage,
        or `None` if no match can be found. If both the creation and content hash
        are specified, only exact matches will be returned. Otherwise, the most
        recent matching file is preferred.
        path (Path): Path of the output, relative to the project root.
        command_hash (Optional[str]): Creation hash to match, if any.
        content_hash (Optional[str]): Content hash to match, if any.
        RETURNS (Optional[FluidPath]): The matching remote URL, if any.
        """
        name = self.encode_name(str(path))
        urls = []
        if command_hash is not None and content_hash is not None:
            url = self.url / name / command_hash / content_hash
            urls = [url] if url.exists() else []
        elif command_hash is not None:
            if (self.url / name / command_hash).exists():
                urls = list((self.url / name / command_hash).iterdir())
        else:
            if (self.url / name).exists():
                # Layout is <name>/<command_hash>/<content_hash>: collect the
                # entries of every command-hash directory.
                for sub_dir in (self.url / name).iterdir():
                    urls.extend(sub_dir.iterdir())
                if content_hash is not None:
                    urls = [url for url in urls if url.parts[-1] == content_hash]
        if len(urls) >= 2:
            try:
                # Sort ascending so that the last entry is the most recent one
                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
            except Exception:
                msg.warn(
                    "Unable to sort remote files by last modified. The file(s) "
                    "pulled from the cache may not be the most recent."
                )
        return urls[-1] if urls else None
    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
        """Construct a URL from a subpath, a creation hash and a content hash."""
        return self.url / self.encode_name(str(path)) / command_hash / content_hash
    def encode_name(self, name: str) -> str:
        """Encode a subpath into a URL-safe name."""
        return urllib.parse.quote_plus(name)
 def get_content_hash(loc: Path) -> str:
    """Get the content hash of a file or directory by delegating to
    get_checksum.
    loc (Path): The location to hash.
    RETURNS (str): The checksum returned by get_checksum.
    """
    return get_checksum(loc)
 def get_command_hash(
    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
 ) -> str:
    """Create a hash representing the execution of a command. The hash is the
    MD5 of the concatenation of: the spaCy version (the git version if the
    PROJECT_USE_GIT_VERSION env var is set, else the minor version), the
    given site and environment hashes, the checksums of the sorted
    dependencies and the command itself.
    site_hash (str): Hash of the installed packages, as supplied by the caller.
    env_hash (str): Hash of the relevant environment variables.
    deps (List[Path]): Paths of the command's dependencies.
    cmd (List[str]): The command's script lines.
    RETURNS (str): The hex digest of the hash.
    """
    use_git_version = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
    if use_git_version:
        spacy_v = GIT_VERSION
    else:
        spacy_v = str(get_minor_version(about.__version__) or "")
    parts = [spacy_v, site_hash, env_hash]
    # Sort the deps so that the hash doesn't depend on their listed order
    parts.extend(get_checksum(dep) for dep in sorted(deps))
    parts.extend(cmd)
    digest = hashlib.md5("".join(parts).encode("utf8"))
    return digest.hexdigest()
 def get_site_hash():
    """Hash the current Python environment's site-packages contents, including
    the name and version of the libraries. The names are taken from the
    "*.dist-info" directories found in the site-packages directories.
    RETURNS (str): Hex digest (MD5) of the sorted, concatenated package names.
    """
    # Copy so that the list returned by the site module is never mutated
    site_dirs = list(site.getsitepackages())
    if site.ENABLE_USER_SITE:
        # getusersitepackages() returns a single path string, so it has to
        # be appended: extend() would add its individual characters.
        site_dirs.append(site.getusersitepackages())
    suffix = ".dist-info"
    packages = set()
    for site_dir in map(Path, site_dirs):
        # Listed site directories aren't guaranteed to exist (e.g. the user
        # site directory), so skip the ones that are missing.
        if not site_dir.is_dir():
            continue
        for subpath in site_dir.iterdir():
            if subpath.name.endswith(suffix):
                packages.add(subpath.name[: -len(suffix)])
    package_bytes = "".join(sorted(packages)).encode("utf8")
    # hashlib has no "md5sum"; the constructor is called md5
    return hashlib.md5(package_bytes).hexdigest()
 def get_env_hash(env: Dict[str, str]) -> str:
    """Hash the environment variables that will be passed into the commands.
    A value starting with "$" is treated as a reference to the current
    os.environ ($ENV_VAR means os.environ[ENV_VAR]) and is replaced by that
    variable's value, or by an empty string if it isn't set.
    env (Dict[str, str]): The variable names mapped to values or references.
    RETURNS (str): The hash of the resolved variables.
    """
    resolved = {
        key: os.environ.get(value[1:], "") if value.startswith("$") else value
        for key, value in env.items()
    }
    return get_hash(resolved)
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@ -1 +1,379 @@
-from weasel.cli.run import *
+import os.path
 import sys
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
 import srsly
 import typer
 from wasabi import msg
 from wasabi.util import locale_escape
 from ... import about
 from ...git_info import GIT_VERSION
 from ...util import (
    ENV_VARS,
    SimpleFrozenDict,
    SimpleFrozenList,
    check_bool_env_var,
    is_cwd,
    is_minor_version_match,
    join_command,
    run_command,
    split_command,
    working_dir,
 )
 from .._util import (
    COMMAND,
    PROJECT_FILE,
    PROJECT_LOCK,
    Arg,
    Opt,
    get_checksum,
    get_hash,
    load_project_config,
    parse_config_overrides,
    project_cli,
 )
@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 )
 def project_run_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
 ):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.
    DOCS: https://spacy.io/api/cli#project-run
    """
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
        overrides = parse_config_overrides(ctx.args)
        project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
 def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
    skip_requirements_check: bool = False,
 ) -> None:
    """Run a named command or workflow defined in the project.yml. A workflow
    is run by calling this function once per command it lists, in order. A
    single command is executed from within the project directory if
    check_rerun reports a change or force is set; after a non-dry execution
    the lockfile entry for the command is updated.
    project_dir (Path): Path to project directory.
    subcommand (str): Name of command or workflow to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    skip_requirements_check (bool): Whether to skip the requirements check.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {}
    for cmd_config in config.get("commands", []):
        commands[cmd_config["name"]] = cmd_config
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
    req_path = project_dir / "requirements.txt"
    # The check is skipped on request, when the project config disables it,
    # or when the project has no requirements.txt.
    check_requirements = (
        not skip_requirements_check
        and config.get("check_requirements", True)
        and os.path.exists(req_path)
    )
    if check_requirements:
        with req_path.open() as requirements_file:
            _check_requirements([line.strip() for line in requirements_file])
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for step in workflows[subcommand]:
            # Requirements were already handled above for the whole workflow,
            # so the individual steps skip the check.
            project_run(
                project_dir,
                step,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
                skip_requirements_check=True,
            )
        return
    cmd = commands[subcommand]
    # A missing dependency is fatal for a real run but only reported during
    # a dry run.
    err_exits = None if dry else 1
    for dep in cmd.get("deps", []):
        if (project_dir / dep).exists():
            continue
        err = f"Missing dependency specified by command '{subcommand}': {dep}"
        err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
        msg.fail(err, err_help, exits=err_exits)
    check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
    with working_dir(project_dir) as current_dir:
        msg.divider(subcommand)
        rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
        if not (rerun or force):
            msg.info(f"Skipping '{cmd['name']}': nothing changed")
            return
        run_commands(cmd["script"], dry=dry, capture=capture)
        if not dry:
            update_lockfile(current_dir, cmd)
 def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Print a CLI-style help text built from the project.yml.
    project_dir (Path): The project directory.
    subcommand (Optional[str]): Name of a command or workflow, or None. With a
        name, the usage and help for that command (or the list of steps of
        that workflow) is printed. Without one, the project title and tables
        of all available commands and workflows are printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    # The project location is only shown in usage lines if it isn't the
    # current working directory.
    project_loc = "" if is_cwd(project_dir) else project_dir
    if not subcommand:
        # Top-level help: title plus overview tables.
        print("")
        title = config.get("title")
        if title:
            print(f"{locale_escape(title)}\n")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
            command_rows = [(cmd["name"], cmd.get("help", "")) for cmd in config_commands]
            msg.table(command_rows)
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
            workflow_rows = [(name, " -> ".join(steps)) for name, steps in workflows.items()]
            msg.table(workflow_rows)
        return
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
    print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
    if subcommand in commands:
        help_text = commands[subcommand].get("help")
        if help_text:
            print(f"\n{help_text}\n")
    elif subcommand in workflows:
        steps = workflows[subcommand]
        print(f"\nWorkflow consisting of {len(steps)} commands:")
        steps_data = []
        for position, step in enumerate(steps, start=1):
            steps_data.append((f"{position}. {step}", commands[step].get("help", "")))
        msg.table(steps_data)
        help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
        print(f"For command details, run: {help_cmd}")
 def run_commands(
    commands: Iterable[str] = SimpleFrozenList(),
    silent: bool = False,
    dry: bool = False,
    capture: bool = False,
 ) -> None:
    """Run a sequence of string commands, one after the other.
    commands (List[str]): The string commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
    for raw_command in commands:
        command = split_command(raw_command)
        executable = command[0] if len(command) else None
        # Not sure if this is needed or a good idea. Motivation: users often
        # reference "python" or "pip" in their config, and we want those to
        # resolve to the interpreter spaCy itself is running under (and the
        # pip of that same environment), not to some other Python/pip. This
        # also keeps things portable if one user writes "python3" and another
        # user without that shortcut re-runs the command.
        if executable in ("python", "python3"):
            command[0] = sys.executable
        elif executable in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {join_command(command)}")
        if dry:
            continue
        run_command(command, capture=capture)
 def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
 ) -> None:
    """Make sure a subcommand names a defined command or workflow. Otherwise a
    failure message listing the available names is reported via msg.fail
    with exits=1.
    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand.
    """
    if not (commands or workflows):
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand in commands or subcommand in workflows:
        return
    hints = []
    # "assets" is a separate CLI command that is easily mistaken for a
    # project command.
    if subcommand in ["assets", "asset"]:
        hints.append("Did you mean to run: python -m spacy project assets?")
    if commands:
        hints.append(f"Available commands: {', '.join(commands)}")
    if workflows:
        hints.append(f"Available workflows: {', '.join(workflows)}")
    msg.fail(
        f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
        ". ".join(hints),
        exits=1,
    )
 def check_rerun(
    project_dir: Path,
    command: Dict[str, Any],
    *,
    check_spacy_version: bool = True,
    check_spacy_commit: bool = False,
 ) -> bool:
    """Decide whether a command has to be run again, based on its settings and
    on what the lockfile recorded for it.
    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    check_spacy_version (bool): Re-run if the spaCy version stored in the
        lockfile is not a minor version match for the current one.
    check_spacy_commit (bool): Re-run if the spaCy git version stored in the
        lockfile differs from the current one.
    RETURNS (bool): Whether to re-run the command.
    """
    name = command["name"]
    # Commands flagged with no_skip are never skipped
    if command.get("no_skip", False):
        return True
    lock_path = project_dir / PROJECT_LOCK
    # No lockfile at all means nothing has been recorded yet
    if not lock_path.exists():
        return True
    data = srsly.read_yaml(lock_path)
    # The lockfile exists but knows nothing about this command
    if name not in data:
        return True
    entry = data[name]
    # Commands without outputs are always run (otherwise they'd always be skipped)
    if not entry.get("outs", []):
        return True
    # A changed spaCy version or commit hash forces a re-run
    spacy_v = entry.get("spacy_version")
    commit = entry.get("spacy_git_version")
    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
        return True
    if check_spacy_commit and commit != GIT_VERSION:
        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
        return True
    # Compare the recorded entry with the entry the command would produce
    # right now. Identical hashes mean that inputs/outputs, file hashes and
    # scripts are all unchanged, so the command can be skipped. The version
    # fields are left out because they were already handled above.
    exclude = ["spacy_version", "spacy_git_version"]
    current_hash = get_hash(get_lock_entry(project_dir, command), exclude=exclude)
    recorded_hash = get_hash(entry, exclude=exclude)
    return current_hash != recorded_hash
 def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
    """Record a command in the lockfile after it was run. A missing lockfile
    is created first; the entry for the command (its script and
    dependencies/outputs) is then added or replaced and the file is written
    back.
    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if lock_path.exists():
        data = srsly.read_yaml(lock_path)
    else:
        # Create the (empty) lockfile on disk before building the entry
        srsly.write_yaml(lock_path, {})
        data = {}
    data[command["name"]] = get_lock_entry(project_dir, command)
    srsly.write_yaml(lock_path, data)
 def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
    """Build the lockfile entry for a command. The entry holds the CLI call,
    the script (command steps), the dependencies and outputs with their paths
    and file hashes (where available) and the current spaCy version and git
    version. The format is based on the dvc.lock files, to keep things
    consistent.
    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    # Collect the file info for each kind of tracked path, in a fixed order
    tracked = {
        key: get_fileinfo(project_dir, command.get(key, []))
        for key in ("deps", "outputs", "outputs_no_cache")
    }
    entry: Dict[str, Any] = {}
    entry["cmd"] = f"{COMMAND} run {command['name']}"
    entry["script"] = command["script"]
    entry["deps"] = tracked["deps"]
    # Cached and non-cached outputs share a single list in the lockfile
    entry["outs"] = [*tracked["outputs"], *tracked["outputs_no_cache"]]
    entry["spacy_version"] = about.__version__
    entry["spacy_git_version"] = GIT_VERSION
    return entry
 def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
    """Describe a list of paths (dependencies, outputs) for the lockfile. Each
    path is listed together with the checksum of the file it points to.
    project_dir (Path): The current project directory.
    paths (List[str]): The file paths, relative to the project directory.
    RETURNS (List[Dict[str, Optional[str]]]): One {"path", "md5"} dict per
        path, in input order. The "md5" value is None if the path doesn't
        exist.
    """
    # Pair every path as given with its location inside the project
    located = ((path, project_dir / path) for path in paths)
    return [
        {"path": path, "md5": get_checksum(target) if target.exists() else None}
        for path, target in located
    ]
 def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
    """Check that the given requirements are installed and don't conflict with
    the installed versions. Problems are reported as warnings; a requirement
    that can't be checked at all only produces a warning of its own.
    requirements (List[str]): List of requirements.
    RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
        exist.
    """
    import pkg_resources
    missing_reports: List[str] = []
    conflict_reports: List[str] = []
    for req in requirements:
        try:
            pkg_resources.require(req)
        except pkg_resources.DistributionNotFound as not_found:
            missing_reports.append(not_found.report())
        except pkg_resources.VersionConflict as conflict:
            conflict_reports.append(conflict.report())
        except Exception:
            # Anything else means the line could not be checked at all
            msg.warn(
                f"Unable to check requirement: {req} "
                "Checks are currently limited to requirement specifiers "
                "(PEP 508)"
            )
    if missing_reports or conflict_reports:
        msg.warn(
            title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
            "correctly and you installed all requirements specified in your project's requirements.txt: "
        )
        # Missing packages are listed before version conflicts
        for report in [*missing_reports, *conflict_reports]:
            msg.text(report)
    return bool(missing_reports), bool(conflict_reports)
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -130,7 +130,7 @@ grad_factor = 1.0
 {% if "span_finder" in components -%}
 [components.span_finder]
 factory = "span_finder"
-max_length = 25
+max_length = null
 min_length = null
 scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
 spans_key = "sc"
@ -271,9 +271,8 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 length = 262144
 ngram_size = 1
 no_output_layer = false
@ -309,9 +308,8 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
 length = 262144
 ngram_size = 1
 no_output_layer = false
@ -421,7 +419,7 @@ width = ${components.tok2vec.model.encode.width}
 {% if "span_finder" in components %}
 [components.span_finder]
 factory = "span_finder"
-max_length = 25
+max_length = null
 min_length = null
 scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
 spans_key = "sc"
@ -544,15 +542,14 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 length = 262144
 ngram_size = 1
 no_output_layer = false
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@ -573,17 +570,15 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
 length = 262144
 ngram_size = 1
 no_output_layer = false
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
 length = 262144
 ngram_size = 1
 no_output_layer = false
 {%- endif %}
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -47,8 +47,7 @@ def train_cli(
    DOCS: https://spacy.io/api/cli#train
    """
-    if verbose:
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        util.logger.setLevel(logging.DEBUG)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -26,9 +26,6 @@ batch_size = 1000
 [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
 [nlp.vectors]
@vectors = "spacy.Vectors.v1"
 # The pipeline components and their models
 [components]
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@ -1,3 +1,4 @@
 import itertools
 import uuid
 from typing import Any, Dict, List, Optional, Tuple, Union
@ -142,25 +143,7 @@ class SpanRenderer:
        spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
        title (str / None): Document title set in Doc.user_data['title'].
        """
-        per_token_info = self._assemble_per_token_info(tokens, spans)
+        per_token_info = []
        markup = self._render_markup(per_token_info)
        markup = TPL_SPANS.format(content=markup, dir=self.direction)
        if title:
            markup = TPL_TITLE.format(title=title) + markup
        return markup
    @staticmethod
    def _assemble_per_token_info(
        tokens: List[str], spans: List[Dict[str, Any]]
    ) -> List[Dict[str, List[Dict[str, Any]]]]:
        """Assembles token info used to generate markup in render_spans().
        tokens (List[str]): Tokens in text.
        spans (List[Dict[str, Any]]): Spans in text.
        RETURNS (List[Dict[str, List[Dict, str, Any]]]): Per token info needed to render HTML markup for given tokens
            and spans.
        """
        per_token_info: List[Dict[str, List[Dict[str, Any]]]] = []
        # we must sort so that we can correctly describe when spans need to "stack"
        # which is determined by their start token, then span length (longer spans on top),
        # then break any remaining ties with the span label
@ -172,22 +155,21 @@ class SpanRenderer:
                s["label"],
            ),
        )
        for s in spans:
            # this is the vertical 'slot' that the span will be rendered in
            # vertical_position = span_label_offset + (offset_step * (slot - 1))
            s["render_slot"] = 0
        for idx, token in enumerate(tokens):
            # Identify if a token belongs to a Span (and which) and if it's a
            # start token of said Span. We'll use this for the final HTML render
            token_markup: Dict[str, Any] = {}
            token_markup["text"] = token
-            intersecting_spans: List[Dict[str, Any]] = []
+            concurrent_spans = 0
            entities = []
            for span in spans:
                ent = {}
                if span["start_token"] <= idx < span["end_token"]:
                    concurrent_spans += 1
                    span_start = idx == span["start_token"]
                    ent["label"] = span["label"]
                    ent["is_start"] = span_start
@ -195,12 +177,7 @@ class SpanRenderer:
                        # When the span starts, we need to know how many other
                        # spans are on the 'span stack' and will be rendered.
                        # This value becomes the vertical render slot for this entire span
-                        span["render_slot"] = (
+                        span["render_slot"] = concurrent_spans
                            intersecting_spans[-1]["render_slot"]
                            if len(intersecting_spans)
                            else 0
                        ) + 1
                    intersecting_spans.append(span)
                    ent["render_slot"] = span["render_slot"]
                    kb_id = span.get("kb_id", "")
                    kb_url = span.get("kb_url", "#")
@ -217,8 +194,11 @@ class SpanRenderer:
                    span["render_slot"] = 0
            token_markup["entities"] = entities
            per_token_info.append(token_markup)
-
+        markup = self._render_markup(per_token_info)
-        return per_token_info
+        markup = TPL_SPANS.format(content=markup, dir=self.direction)
        if title:
            markup = TPL_TITLE.format(title=title) + markup
        return markup
    def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
        """Render the markup from per-token information"""
@ -238,7 +218,7 @@ class SpanRenderer:
                    + (self.offset_step * (len(entities) - 1))
                )
                markup += self.span_template.format(
-                    text=escape_html(token["text"]),
+                    text=token["text"],
                    span_slices=slices,
                    span_starts=starts,
                    total_height=total_height,
@ -334,8 +314,6 @@ class DependencyRenderer:
                self.lang = settings.get("lang", DEFAULT_LANG)
            render_id = f"{id_prefix}-{i}"
            svg = self.render_svg(render_id, p["words"], p["arcs"])
            if p.get("title"):
                svg = TPL_TITLE.format(title=p.get("title")) + svg
            rendered.append(svg)
        if page:
            content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])
@ -588,7 +566,7 @@ class EntityRenderer:
            for i, fragment in enumerate(fragments):
                markup += escape_html(fragment)
                if len(fragments) > 1 and i != len(fragments) - 1:
-                    markup += "<br>"
+                    markup += "</br>"
            if self.ents is None or label.upper() in self.ents:
                color = self.colors.get(label.upper(), self.default_color)
                ent_settings = {
@ -606,7 +584,7 @@ class EntityRenderer:
        for i, fragment in enumerate(fragments):
            markup += escape_html(fragment)
            if len(fragments) > 1 and i != len(fragments) - 1:
-                markup += "<br>"
+                markup += "</br>"
        markup = TPL_ENTS.format(content=markup, dir=self.direction)
        if title:
            markup = TPL_TITLE.format(title=title) + markup
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -219,8 +219,6 @@ class Warnings(metaclass=ErrorsWithCodes):
    W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
            "key attribute for vectors, configure it through Vectors(attr=) or "
            "'spacy init vectors --attr'")
    W126 = ("These keys are unsupported: {unsupported}")
    W127 = ("Not all `Language.pipe` worker processes completed successfully")
 class Errors(metaclass=ErrorsWithCodes):
@ -228,6 +226,7 @@ class Errors(metaclass=ErrorsWithCodes):
    E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
            "This usually happens when spaCy calls `nlp.{method}` with a custom "
            "component name that's not registered on the current language class. "
            "If you're using a Transformer, make sure to install 'spacy-transformers'. "
            "If you're using a custom component, make sure you've added the "
            "decorator `@Language.component` (for function components) or "
            "`@Language.factory` (for class components).\n\nAvailable "
@ -554,12 +553,12 @@ class Errors(metaclass=ErrorsWithCodes):
            "during training, make sure to include it in 'annotating components'")
    # New errors added in v3.x
    E849 = ("The vocab only supports {method} for vectors of type "
            "spacy.vectors.Vectors, not {vectors_type}.")
    E850 = ("The PretrainVectors objective currently only supports default or "
            "floret vectors, not {mode} vectors.")
    E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
            "but found value of '{val}'.")
    E852 = ("The tar file pulled from the remote attempted an unsafe path "
            "traversal.")
    E853 = ("Unsupported component factory name '{name}'. The character '.' is "
            "not permitted in factory names.")
    E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
@ -984,10 +983,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "predicted docs when training {component}.")
    E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
             "but only callbacks with one or three parameters are supported")
    E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
             "reduction. Please enable one of `use_reduce_first`, "
             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 # Deprecated model shortcuts, only used in errors and warnings
--- a/spacy/kb/init.py
+++ b/spacy/kb/init.py
@ -1,11 +1,3 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
 __all__ = [
    "Candidate",
    "KnowledgeBase",
    "InMemoryLookupKB",
    "get_candidates",
    "get_candidates_batch",
 ]
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@ -4,8 +4,7 @@ from ..typedefs cimport hash_t
 from .kb cimport KnowledgeBase
-# Object used by the Entity Linker that summarizes one entity-alias candidate
+# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
 # combination.
 cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@ -1,4 +1,4 @@
-# cython: infer_types=True
+# cython: infer_types=True, profile=True
 from typing import Iterable
@ -8,24 +8,15 @@ from ..tokens import Span
 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or
+    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
-    may not be resolved to a specific `entity` from a Knowledge Base. This
+    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
-    will be used as input for the entity linking algorithm which will
+    algorithm which will disambiguate the various candidates to the correct one.
    disambiguate the various candidates to the correct one.
    Each candidate (alias, entity) pair is assigned a certain prior probability.
    DOCS: https://spacy.io/api/kb/#candidate-init
    """
-    def __init__(
+    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
        self,
        KnowledgeBase kb,
        entity_hash,
        entity_freq,
        entity_vector,
        alias_hash,
        prior_prob
    ):
        self.kb = kb
        self.entity_hash = entity_hash
        self.entity_freq = entity_freq
@ -68,8 +59,7 @@ cdef class Candidate:
 def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
-    Return candidate entities for a given mention and fetching appropriate
+    Return candidate entities for a given mention and fetching appropriate entries from the index.
    entries from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention for which to identify candidates.
    RETURNS (Iterable[Candidate]): Identified candidates.
@ -77,12 +67,9 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    return kb.get_candidates(mention)
-def get_candidates_batch(
+def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
        kb: KnowledgeBase, mentions: Iterable[Span]
 ) -> Iterable[Iterable[Candidate]]:
    """
-    Return candidate entities for the given mentions and fetching appropriate entries
+    Return candidate entities for the given mentions and fetching appropriate entries from the index.
    from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Iterable[Span]): Entity mentions for which to identify candidates.
    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@ -1,4 +1,4 @@
-# cython: infer_types=True
+# cython: infer_types=True, profile=True
 from pathlib import Path
 from typing import Iterable, Tuple, Union
@ -12,9 +12,8 @@ from .candidate import Candidate
 cdef class KnowledgeBase:
-    """A `KnowledgeBase` instance stores unique identifiers for entities and
+    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
-    their textual aliases, to support entity linking of named entities to
+    to support entity linking of named entities to real-world concepts.
    real-world concepts.
    This is an abstract class and requires its operations to be implemented.
    DOCS: https://spacy.io/api/kb
@ -32,13 +31,10 @@ cdef class KnowledgeBase:
        self.entity_vector_length = entity_vector_length
        self.mem = Pool()
-    def get_candidates_batch(
+    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
        self, mentions: Iterable[Span]
    ) -> Iterable[Iterable[Candidate]]:
        """
-        Return candidate entities for specified texts. Each candidate defines
+        Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
-        the entity, the original alias, and the prior probability of that
+        and the prior probability of that alias resolving to that entity.
        alias resolving to that entity.
        If no candidate is found for a given text, an empty list is returned.
        mentions (Iterable[Span]): Mentions for which to get candidates.
        RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
@ -47,17 +43,14 @@ cdef class KnowledgeBase:
    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        """
-        Return candidate entities for specified text. Each candidate defines
+        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
        the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If the no candidate is found for a given text, an empty list is returned.
        mention (Span): Mention for which to get candidates.
        RETURNS (Iterable[Candidate]): Identified candidates.
        """
        raise NotImplementedError(
-            Errors.E1045.format(
+            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
                parent="KnowledgeBase", method="get_candidates", name=self.__name__
            )
        )
    def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
@ -75,9 +68,7 @@ cdef class KnowledgeBase:
        RETURNS (Iterable[float]): Vector for specified entity.
        """
        raise NotImplementedError(
-            Errors.E1045.format(
+            Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
                parent="KnowledgeBase", method="get_vector", name=self.__name__
            )
        )
    def to_bytes(self, **kwargs) -> bytes:
@ -85,9 +76,7 @@ cdef class KnowledgeBase:
        RETURNS (bytes): Current state as binary string.
        """
        raise NotImplementedError(
-            Errors.E1045.format(
+            Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
                parent="KnowledgeBase", method="to_bytes", name=self.__name__
            )
        )
    def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
@ -96,35 +85,25 @@ cdef class KnowledgeBase:
        exclude (Tuple[str]): Properties to exclude when restoring KB.
        """
        raise NotImplementedError(
-            Errors.E1045.format(
+            Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
                parent="KnowledgeBase", method="from_bytes", name=self.__name__
            )
        )
-    def to_disk(
+    def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
            self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """
        Write KnowledgeBase content to disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        """
        raise NotImplementedError(
-            Errors.E1045.format(
+            Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
                parent="KnowledgeBase", method="to_disk", name=self.__name__
            )
        )
-    def from_disk(
+    def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
            self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """
        Load KnowledgeBase content from disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        """
        raise NotImplementedError(
-            Errors.E1045.format(
+            Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
                parent="KnowledgeBase", method="from_disk", name=self.__name__
            )
        )
--- a/spacy/kb/kb_in_memory.pxd
+++ b/spacy/kb/kb_in_memory.pxd
@ -55,28 +55,23 @@ cdef class InMemoryLookupKB(KnowledgeBase):
    # optional data, we can let users configure a DB as the backend for this.
    cdef object _features_table
    cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
        """Add an entity vector to the vectors table."""
        cdef int64_t new_index = self._vectors_table.size()
        self._vectors_table.push_back(entity_vector)
        return new_index
-    cdef inline int64_t c_add_entity(
+
-        self,
+    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
-        hash_t entity_hash,
+                                     int32_t vector_index, int feats_row) nogil:
        float freq,
        int32_t vector_index,
        int feats_row
    ) nogil:
        """Add an entry to the vector of entries.
-        After calling this method, make sure to update also the _entry_index
+        After calling this method, make sure to update also the _entry_index using the return value"""
        using the return value"""
        # This is what we'll map the entity hash key to. It's where the entry will sit
        # in the vector of entries, so we can get it later.
        cdef int64_t new_index = self._entries.size()
-        # Avoid struct initializer to enable nogil, cf.
+        # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
        # https://github.com/cython/cython/issues/1642
        cdef KBEntryC entry
        entry.entity_hash = entity_hash
        entry.vector_index = vector_index
@ -86,17 +81,11 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        self._entries.push_back(entry)
        return new_index
-    cdef inline int64_t c_add_aliases(
+    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
-        self,
+        """Connect a mention to a list of potential entities with their prior probabilities .
-        hash_t alias_hash,
+        After calling this method, make sure to update also the _alias_index using the return value"""
-        vector[int64_t] entry_indices,
+        # This is what we'll map the alias hash key to. It's where the alias will be defined
-        vector[float] probs
+        # in the vector of aliases.
    ) nogil:
        """Connect a mention to a list of potential entities with their prior
        probabilities. After calling this method, make sure to update also the
        _alias_index using the return value"""
        # This is what we'll map the alias hash key to. It's where the alias will be
        # defined in the vector of aliases.
        cdef int64_t new_index = self._aliases_table.size()
        # Avoid struct initializer to enable nogil
@ -109,9 +98,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
    cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
        """
-        Initializing the vectors and making sure the first element of each vector is a
+        Initializing the vectors and making sure the first element of each vector is a dummy,
-        dummy, because the PreshMap maps pointing to indices in these vectors can not
+        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
        contain 0 as value.
        cf. https://github.com/explosion/preshed/issues/17
        """
        cdef int32_t dummy_value = 0
@ -142,18 +130,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 cdef class Writer:
    cdef FILE* _fp
-    cdef int write_header(
+    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
        self, int64_t nr_entries, int64_t entity_vector_length
    ) except -1
    cdef int write_vector_element(self, float element) except -1
-    cdef int write_entry(
+    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
        self, hash_t entry_hash, float entry_freq, int32_t vector_index
    ) except -1
    cdef int write_alias_length(self, int64_t alias_length) except -1
-    cdef int write_alias_header(
+    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
        self, hash_t alias_hash, int64_t candidate_length
    ) except -1
    cdef int write_alias(self, int64_t entry_index, float prob) except -1
    cdef int _write(self, void* value, size_t size) except -1
@ -161,18 +143,12 @@ cdef class Writer:
 cdef class Reader:
    cdef FILE* _fp
-    cdef int read_header(
+    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
        self, int64_t* nr_entries, int64_t* entity_vector_length
    ) except -1
    cdef int read_vector_element(self, float* element) except -1
-    cdef int read_entry(
+    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
        self, hash_t* entity_hash, float* freq, int32_t* vector_index
    ) except -1
    cdef int read_alias_length(self, int64_t* alias_length) except -1
-    cdef int read_alias_header(
+    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
        self, hash_t* alias_hash, int64_t* candidate_length
    ) except -1
    cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
    cdef int _read(self, void* value, size_t size) except -1
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@ -1,5 +1,5 @@
-# cython: infer_types=True
+# cython: infer_types=True, profile=True
-from typing import Any, Callable, Dict, Iterable
+from typing import Any, Callable, Dict, Iterable, Union
 import srsly
@ -27,9 +27,8 @@ from .candidate import Candidate as Candidate
 cdef class InMemoryLookupKB(KnowledgeBase):
-    """An `InMemoryLookupKB` instance stores unique identifiers for entities
+    """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
-    and their textual aliases, to support entity linking of named entities to
+    to support entity linking of named entities to real-world concepts.
    real-world concepts.
    DOCS: https://spacy.io/api/inmemorylookupkb
    """
@ -72,8 +71,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
    def add_entity(self, str entity, float freq, vector[float] entity_vector):
        """
-        Add an entity to the KB, optionally specifying its log probability
+        Add an entity to the KB, optionally specifying its log probability based on corpus frequency
        based on corpus frequency.
        Return the hash of the entity ID/name at the end.
        """
        cdef hash_t entity_hash = self.vocab.strings.add(entity)
@ -85,20 +83,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        # Raise an error if the provided entity vector is not of the correct length
        if len(entity_vector) != self.entity_vector_length:
-            raise ValueError(
+            raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
                Errors.E141.format(
                    found=len(entity_vector), required=self.entity_vector_length
                )
            )
        vector_index = self.c_add_vector(entity_vector=entity_vector)
-        new_index = self.c_add_entity(
+        new_index = self.c_add_entity(entity_hash=entity_hash,
            entity_hash=entity_hash,
                                      freq=freq,
                                      vector_index=vector_index,
-            feats_row=-1
+                                      feats_row=-1)  # Features table currently not implemented
        )  # Features table currently not implemented
        self._entry_index[entity_hash] = new_index
        return entity_hash
@ -123,12 +115,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
            else:
                entity_vector = vector_list[i]
                if len(entity_vector) != self.entity_vector_length:
-                    raise ValueError(
+                    raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
                        Errors.E141.format(
                            found=len(entity_vector),
                            required=self.entity_vector_length
                        )
                    )
                entry.entity_hash = entity_hash
                entry.freq = freq_list[i]
@ -162,15 +149,11 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        previous_alias_nr = self.get_size_aliases()
        # Throw an error if the length of entities and probabilities are not the same
        if not len(entities) == len(probabilities):
-            raise ValueError(
+            raise ValueError(Errors.E132.format(alias=alias,
                Errors.E132.format(
                    alias=alias,
                                                entities_length=len(entities),
-                    probabilities_length=len(probabilities))
+                                                probabilities_length=len(probabilities)))
            )
-        # Throw an error if the probabilities sum up to more than 1 (allow for
+        # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
        # some rounding errors)
        prob_sum = sum(probabilities)
        if prob_sum > 1.00001:
            raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
@ -187,47 +170,40 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        for entity, prob in zip(entities, probabilities):
            entity_hash = self.vocab.strings[entity]
-            if entity_hash not in self._entry_index:
+            if not entity_hash in self._entry_index:
                raise ValueError(Errors.E134.format(entity=entity))
            entry_index = <int64_t>self._entry_index.get(entity_hash)
            entry_indices.push_back(int(entry_index))
            probs.push_back(float(prob))
-        new_index = self.c_add_aliases(
+        new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
            alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
        )
        self._alias_index[alias_hash] = new_index
        if previous_alias_nr + 1 != self.get_size_aliases():
            raise RuntimeError(Errors.E891.format(alias=alias))
        return alias_hash
-    def append_alias(
+    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
        self, str alias, str entity, float prior_prob, ignore_warnings=False
    ):
        """
-        For an alias already existing in the KB, extend its potential entities
+        For an alias already existing in the KB, extend its potential entities with one more.
        with one more.
        Throw a warning if either the alias or the entity is unknown,
        or when the combination is already previously recorded.
        Throw an error if this entity+prior prob would exceed the sum of 1.
-        For efficiency, it's best to use the method `add_alias` as much as
+        For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
        possible instead of this one.
        """
        # Check if the alias exists in the KB
        cdef hash_t alias_hash = self.vocab.strings[alias]
-        if alias_hash not in self._alias_index:
+        if not alias_hash in self._alias_index:
            raise ValueError(Errors.E176.format(alias=alias))
        # Check if the entity exists in the KB
        cdef hash_t entity_hash = self.vocab.strings[entity]
-        if entity_hash not in self._entry_index:
+        if not entity_hash in self._entry_index:
            raise ValueError(Errors.E134.format(entity=entity))
        entry_index = <int64_t>self._entry_index.get(entity_hash)
-        # Throw an error if the prior probabilities (including the new one)
+        # Throw an error if the prior probabilities (including the new one) sum up to more than 1
        # sum up to more than 1
        alias_index = <int64_t>self._alias_index.get(alias_hash)
        alias_entry = self._aliases_table[alias_index]
        current_sum = sum([p for p in alias_entry.probs])
@ -260,13 +236,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
        """
-        Return candidate entities for an alias. Each candidate defines the
+        Return candidate entities for an alias. Each candidate defines the entity, the original alias,
-        entity, the original alias, and the prior probability of that alias
+        and the prior probability of that alias resolving to that entity.
        resolving to that entity.
        If the alias is not known in the KB, an empty list is returned.
        """
        cdef hash_t alias_hash = self.vocab.strings[alias]
-        if alias_hash not in self._alias_index:
+        if not alias_hash in self._alias_index:
            return []
        alias_index = <int64_t>self._alias_index.get(alias_hash)
        alias_entry = self._aliases_table[alias_index]
@ -274,14 +249,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        return [Candidate(kb=self,
                          entity_hash=self._entries[entry_index].entity_hash,
                          entity_freq=self._entries[entry_index].freq,
-                          entity_vector=self._vectors_table[
+                          entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
                              self._entries[entry_index].vector_index
                          ],
                          alias_hash=alias_hash,
                          prior_prob=prior_prob)
-                for (entry_index, prior_prob) in zip(
+                for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                    alias_entry.entry_indices, alias_entry.probs
                )
                if entry_index != 0]
    def get_vector(self, str entity):
@ -295,9 +266,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        return self._vectors_table[self._entries[entry_index].vector_index]
    def get_prior_prob(self, str entity, str alias):
-        """ Return the prior probability of a given alias being linked to a
+        """ Return the prior probability of a given alias being linked to a given entity,
-        given entity, or return 0.0 when this combination is not known in the
+        or return 0.0 when this combination is not known in the knowledge base"""
        knowledge base."""
        cdef hash_t alias_hash = self.vocab.strings[alias]
        cdef hash_t entity_hash = self.vocab.strings[entity]
@ -308,9 +278,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        entry_index = self._entry_index[entity_hash]
        alias_entry = self._aliases_table[alias_index]
-        for (entry_index, prior_prob) in zip(
+        for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
            alias_entry.entry_indices, alias_entry.probs
        ):
            if self._entries[entry_index].entity_hash == entity_hash:
                return prior_prob
@ -320,19 +288,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        """Serialize the current state to a binary string.
        """
        def serialize_header():
-            header = (
+            header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
                self.get_size_entities(),
                self.get_size_aliases(),
                self.entity_vector_length
            )
            return srsly.json_dumps(header)
        def serialize_entries():
            i = 1
            tuples = []
-            for entry_hash, entry_index in sorted(
+            for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
                self._entry_index.items(), key=lambda x: x[1]
            ):
                entry = self._entries[entry_index]
                assert entry.entity_hash == entry_hash
                assert entry_index == i
@ -345,9 +307,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
            headers = []
            indices_lists = []
            probs_lists = []
-            for alias_hash, alias_index in sorted(
+            for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
                self._alias_index.items(), key=lambda x: x[1]
            ):
                alias = self._aliases_table[alias_index]
                assert alias_index == i
                candidate_length = len(alias.entry_indices)
@ -405,7 +365,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
            indices = srsly.json_loads(all_data[1])
            probs = srsly.json_loads(all_data[2])
            for header, indices, probs in zip(headers, indices, probs):
-                alias_hash, _candidate_length = header
+                alias_hash, candidate_length = header
                alias.entry_indices = indices
                alias.probs = probs
                self._aliases_table[i] = alias
@ -454,14 +414,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
                writer.write_vector_element(element)
            i = i+1
-        # dumping the entry records in the order in which they are in the
+        # dumping the entry records in the order in which they are in the _entries vector.
-        # _entries vector.
+        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
        # index 0 is a dummy object not stored in the _entry_index and can
        # be ignored.
        i = 1
-        for entry_hash, entry_index in sorted(
+        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
            self._entry_index.items(), key=lambda x: x[1]
        ):
            entry = self._entries[entry_index]
            assert entry.entity_hash == entry_hash
            assert entry_index == i
@ -473,9 +429,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        # dumping the aliases in the order in which they are in the _alias_index vector.
        # index 0 is a dummy object not stored in the _aliases_table and can be ignored.
        i = 1
-        for alias_hash, alias_index in sorted(
+        for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
                self._alias_index.items(), key=lambda x: x[1]
        ):
            alias = self._aliases_table[alias_index]
            assert alias_index == i
@ -581,8 +535,7 @@ cdef class Writer:
    def __init__(self, path):
        assert isinstance(path, Path)
        content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') \
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
            if type(content) == str else content
        self._fp = fopen(<char*>bytes_loc, 'wb')
        if not self._fp:
            raise IOError(Errors.E146.format(path=path))
@ -592,18 +545,14 @@ cdef class Writer:
        cdef size_t status = fclose(self._fp)
        assert status == 0
-    cdef int write_header(
+    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
        self, int64_t nr_entries, int64_t entity_vector_length
    ) except -1:
        self._write(&nr_entries, sizeof(nr_entries))
        self._write(&entity_vector_length, sizeof(entity_vector_length))
    cdef int write_vector_element(self, float element) except -1:
        self._write(&element, sizeof(element))
-    cdef int write_entry(
+    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
        self, hash_t entry_hash, float entry_freq, int32_t vector_index
    ) except -1:
        self._write(&entry_hash, sizeof(entry_hash))
        self._write(&entry_freq, sizeof(entry_freq))
        self._write(&vector_index, sizeof(vector_index))
@ -612,9 +561,7 @@ cdef class Writer:
    cdef int write_alias_length(self, int64_t alias_length) except -1:
        self._write(&alias_length, sizeof(alias_length))
-    cdef int write_alias_header(
+    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
        self, hash_t alias_hash, int64_t candidate_length
    ) except -1:
        self._write(&alias_hash, sizeof(alias_hash))
        self._write(&candidate_length, sizeof(candidate_length))
@ -630,19 +577,16 @@ cdef class Writer:
 cdef class Reader:
    def __init__(self, path):
        content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') \
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
            if type(content) == str else content
        self._fp = fopen(<char*>bytes_loc, 'rb')
        if not self._fp:
            PyErr_SetFromErrno(IOError)
-        fseek(self._fp, 0, 0)  # this can be 0 if there is no header
+        status = fseek(self._fp, 0, 0)  # this can be 0 if there is no header
    def __dealloc__(self):
        fclose(self._fp)
-    cdef int read_header(
+    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
        self, int64_t* nr_entries, int64_t* entity_vector_length
    ) except -1:
        status = self._read(nr_entries, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
@ -662,9 +606,7 @@ cdef class Reader:
                return 0  # end of file
            raise IOError(Errors.E145.format(param="vector element"))
-    cdef int read_entry(
+    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
        self, hash_t* entity_hash, float* freq, int32_t* vector_index
    ) except -1:
        status = self._read(entity_hash, sizeof(hash_t))
        if status < 1:
            if feof(self._fp):
@ -695,9 +637,7 @@ cdef class Reader:
                return 0  # end of file
            raise IOError(Errors.E145.format(param="alias length"))
-    cdef int read_alias_header(
+    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
        self, hash_t* alias_hash, int64_t* candidate_length
    ) except -1:
        status = self._read(alias_hash, sizeof(hash_t))
        if status < 1:
            if feof(self._fp):
--- a/spacy/lang/bo/init.py
+++ b/spacy/lang/bo/init.py
@ -1,16 +0,0 @@
 from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 # Defaults for the language class below: pairs the stop-word collection with
 # the lexical attribute getters, both imported from sibling modules.
 class TibetanDefaults(BaseDefaults):
    # Stop words imported from .stop_words.
    stop_words = STOP_WORDS
    # Lexical attribute getters imported from .lex_attrs.
    lex_attr_getters = LEX_ATTRS
 # Language subclass whose `lang` code is "bo" and whose defaults are
 # TibetanDefaults.
 class Tibetan(Language):
    Defaults = TibetanDefaults
    lang = "bo"
 __all__ = ["Tibetan"]
--- a/spacy/lang/bo/examples.py
+++ b/spacy/lang/bo/examples.py
@ -1,16 +0,0 @@
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.bo.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 sentences = [
    "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
    "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
    "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
    "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
    "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
    "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
 ]
--- a/spacy/lang/bo/lex_attrs.py
+++ b/spacy/lang/bo/lex_attrs.py
@ -1,65 +0,0 @@
 from ...attrs import LIKE_NUM
 # reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals
 # Number words that like_num() accepts. Matching is an exact string
 # comparison, so every entry must contain only the word itself (no stray
 # whitespace) or it can never match.
 _num_words = [
    "ཀླད་ཀོར་",
    "གཅིག་",
    "གཉིས་",
    "གསུམ་",
    "བཞི་",
    "ལྔ་",
    "དྲུག་",
    "བདུན་",
    "བརྒྱད་",
    "དགུ་",
    "བཅུ་",
    "བཅུ་གཅིག་",
    "བཅུ་གཉིས་",
    "བཅུ་གསུམ་",
    "བཅུ་བཞི་",
    "བཅུ་ལྔ་",
    "བཅུ་དྲུག་",
    "བཅུ་བདུན་",
    # NOTE(review): the entry for eight above is spelled "བརྒྱད་" (with བ and a
    # trailing tsheg); this entry uses པ and has no trailing tsheg — confirm.
    "བཅུ་པརྒྱད",
    "བཅུ་དགུ་",
    "ཉི་ཤུ་",
    "སུམ་ཅུ",
    "བཞི་བཅུ",
    "ལྔ་བཅུ",
    "དྲུག་ཅུ",
    "བདུན་ཅུ",
    "བརྒྱད་ཅུ",
    "དགུ་བཅུ",
    "བརྒྱ་",
    "སྟོང་",
    "ཁྲི་",
    "ས་ཡ་",
    # Previously this literal began with a tab character, which made the
    # entry unmatchable by the exact-membership test in like_num().
    "བྱེ་བ་",
    "དུང་ཕྱུར་",
    "ཐེར་འབུམ་",
    "ཐེར་འབུམ་ཆེན་པོ་",
    "ཁྲག་ཁྲིག་",
    "ཁྲག་ཁྲིག་ཆེན་པོ་",
 ]
 def like_num(text):
    """
    Check if text resembles a number.

    text (str): The string to test.
    RETURNS (bool): True if, after dropping one leading sign character
        ("+", "-", "±" or "~") and removing every "," and ".", the text
        consists only of digits, is a digits/digits fraction with exactly
        one "/", or is an entry of `_num_words`; False otherwise.
    """
    # Drop a single leading sign/approximation marker.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Thousands separators and decimal points do not affect the verdict.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Simple fractions: exactly one slash with digits on both sides.
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # Exact match against the number-word list.
    if text in _num_words:
        return True
    return False
 LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/bo/stop_words.py
+++ b/spacy/lang/bo/stop_words.py
@ -1,198 +0,0 @@
 # Source: https://zenodo.org/records/10148636
 STOP_WORDS = set(
    """
 འི་
 །
 དུ་
 གིས་
 སོགས་
 ཏེ
 གི་
 རྣམས་
 ནི
 ཀུན་
 ཡི་
 འདི
 ཀྱི་
 སྙེད་
 པས་
 གཞན་
 ཀྱིས་
 ཡི
 ལ
 ནི་
 དང་
 སོགས
 ཅིང་
 ར
 དུ
 མི་
 སུ་
 བཅས་
 ཡོངས་
 ལས
 ཙམ་
 གྱིས་
 དེ་
 ཡང་
 མཐའ་དག་
 ཏུ་
 ཉིད་
 ས
 ཏེ་
 གྱི་
 སྤྱི
 དེ
 ཀ་
 ཡིན་
 ཞིང་
 འདི་
 རུང་
 རང་
 ཞིག་
 སྟེ
 སྟེ་
 ན་རེ
 ངམ
 ཤིང་
 དག་
 ཏོ
 རེ་
 འང་
 ཀྱང་
 ལགས་པ
 ཚུ
 དོ
 ཡིན་པ
 རེ
 ན་རེ་
 ཨེ་
 ཚང་མ
 ཐམས་ཅད་
 དམ་
 འོ་
 ཅིག་
 གྱིན་
 ཡིན
 ན
 ཁོ་ན་
 འམ་
 ཀྱིན་
 ལོ
 ཀྱིས
 བས་
 ལགས་
 ཤིག
 གིས
 ཀི་
 སྣ་ཚོགས་
 རྣམས
 སྙེད་པ
 ཡིས་
 གྱི
 གི
 བམ་
 ཤིག་
 རེ་རེ་
 ནམ
 མིན་
 ནམ་
 ངམ་
 རུ་
 འགའ་
 ཀུན
 ཤས་
 ཏུ
 ཡིས
 གིན་
 གམ་
 འོ
 ཡིན་པ་
 མིན
 ལགས
 གྱིས
 ཅང་
 འགའ
 སམ་
 ཞིག
 འང
 ལས་ཆེ་
 འཕྲལ་
 བར་
 རུ
 དང
 ཡ
 འག
 སམ
 ཀ
 ཅུང་ཟད་
 ཅིག
 ཉིད
 དུ་མ
 མ
 ཡིན་བ
 འམ
 མམ
 དམ
 དག
 ཁོ་ན
 ཀྱི
 ལམ
 ཕྱི་
 ནང་
 ཙམ
 ནོ་
 སོ་
 རམ་
 བོ་
 ཨང་
 ཕྱི
 ཏོ་
 ཚོ
 ལ་ལ་
 ཚོ་
 ཅིང
 མ་གི་
 གེ
 གོ
 ཡིན་ལུགས་
 རོ་
 བོ
 ལགས་པ་
 པས
 རབ་
 འི
 རམ
 བས
 གཞན
 སྙེད་པ་
 འབའ་
 མཾ་
 པོ
 ག་
 ག
 གམ
 སྤྱི་
 བམ
 མོ་
 ཙམ་པ་
 ཤ་སྟག་
 མམ་
 རེ་རེ
 སྙེད
 ཏམ་
 ངོ
 གྲང་
 ཏ་རེ
 ཏམ
 ཁ་
 ངེ་
 ཅོག་
 རིལ་
 ཉུང་ཤས་
 གིང་
 ཚ་
 ཀྱང
 """.split()
 )
--- a/spacy/lang/en/lex_attrs.py
+++ b/spacy/lang/en/lex_attrs.py
@ -6,8 +6,7 @@ _num_words = [
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
    "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
-    "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion",
+    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
    "septillion", "octillion", "nonillion", "decillion", "gajillion", "bazillion"
 ]
 _ordinal_words = [
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
@ -15,8 +14,7 @@ _ordinal_words = [
    "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
    "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
    "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
-    "trillionth", "quadrillionth", "quintillionth", "sextillionth", "septillionth",
+    "trillionth", "quadrillionth", "gajillionth", "bazillionth"
    "octillionth", "nonillionth", "decillionth", "gajillionth", "bazillionth"
 ]
 # fmt: on
--- a/spacy/lang/es/lemmatizer.py
+++ b/spacy/lang/es/lemmatizer.py
@ -163,7 +163,7 @@ class SpanishLemmatizer(Lemmatizer):
        for old, new in self.lookups.get_table("lemma_rules").get("det", []):
            if word == old:
                return [new]
-        # If none of the specific rules apply, search in the common rules for
+        # If none of the specific rules apply, search in the common rules for
        # determiners and pronouns that follow a unique pattern for
        # lemmatization. If the word is in the list, return the corresponding
        # lemma.
@ -291,7 +291,7 @@ class SpanishLemmatizer(Lemmatizer):
        for old, new in self.lookups.get_table("lemma_rules").get("pron", []):
            if word == old:
                return [new]
-        # If none of the specific rules apply, search in the common rules for
+        # If none of the specific rules apply, search in the common rules for
        # determiners and pronouns that follow a unique pattern for
        # lemmatization. If the word is in the list, return the corresponding
        # lemma.
--- a/spacy/lang/fo/init.py
+++ b/spacy/lang/fo/init.py
@ -1,18 +0,0 @@
 from ...language import BaseDefaults, Language
 from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 # Defaults for the language class below: the tokenizer exception table from
 # .tokenizer_exceptions plus the prefix/infix/suffix rules imported from
 # ..punctuation.
 class FaroeseDefaults(BaseDefaults):
    # Affix rules, listed in prefix / infix / suffix order.
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    # Exception table imported from .tokenizer_exceptions.
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 # Language subclass whose `lang` code is "fo" and whose defaults are
 # FaroeseDefaults.
 class Faroese(Language):
    Defaults = FaroeseDefaults
    lang = "fo"
 __all__ = ["Faroese"]
--- a/spacy/lang/fo/tokenizer_exceptions.py
+++ b/spacy/lang/fo/tokenizer_exceptions.py
@ -1,90 +0,0 @@
 from ...symbols import ORTH
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 _exc = {}
 # Register every abbreviation below as a single-token exception, once as
 # written and once with its first letter capitalised.
 for orth in [
    "apr.",
    "aug.",
    "avgr.",
    "árg.",
    "ávís.",
    "beinl.",
    "blkv.",
    "blaðkv.",
    "blm.",
    "blaðm.",
    "bls.",
    "blstj.",
    "blaðstj.",
    "des.",
    "eint.",
    "febr.",
    "fyrrv.",
    "góðk.",
    "h.m.",
    "innt.",
    "jan.",
    "kl.",
    "m.a.",
    "mðr.",
    "mió.",
    "nr.",
    "nto.",
    "nov.",
    "nút.",
    "o.a.",
    "o.a.m.",
    "o.a.tíl.",
    "o.fl.",
    "ff.",
    "o.m.a.",
    "o.o.",
    "o.s.fr.",
    "o.tíl.",
    "o.ø.",
    "okt.",
    "omf.",
    "pst.",
    "ritstj.",
    "sbr.",
    "sms.",
    "smst.",
    "smb.",
    "sb.",
    "sbrt.",
    "sp.",
    "sept.",
    "spf.",
    "spsk.",
    "t.e.",
    "t.s.",
    "t.s.s.",
    "tlf.",
    "tel.",
    "tsk.",
    "t.o.v.",
    "t.d.",
    "uml.",
    "ums.",
    "uppl.",
    "upprfr.",
    "uppr.",
    "útg.",
    "útl.",
    "útr.",
    "vanl.",
    "v.",
    "v.h.",
    "v.ø.o.",
    "viðm.",
    "viðv.",
    "vm.",
    "v.m.",
 ]:
    capitalized = orth.capitalize()
    # Same insertion order as before: the form as written, then capitalised.
    _exc.update({orth: [{ORTH: orth}], capitalized: [{ORTH: capitalized}]})
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
--- a/spacy/lang/gd/init.py
+++ b/spacy/lang/gd/init.py
@ -1,18 +0,0 @@
 from typing import Optional
 from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class ScottishDefaults(BaseDefaults):
    """Defaults for the ``gd`` language class: its stop-word list and its
    tokenizer exceptions."""
    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 class Scottish(Language):
    """Language subclass identified by the code ``gd``."""
    Defaults = ScottishDefaults
    lang = "gd"
 __all__ = ["Scottish"]
--- a/spacy/lang/gd/stop_words.py
+++ b/spacy/lang/gd/stop_words.py
@ -1,388 +0,0 @@
 # One stop word (or multi-word expression such as "a chionn") per line.
 # A trailing "# ..." on a line is an annotation giving the full form of a
 # contraction; it lives inside the string literal, so it has to be stripped
 # explicitly or the whole "word # gloss" text would become a stop word.
 # Blank lines (including the first and last line of the literal) are dropped
 # so that the empty string is never a member of the set.
 STOP_WORDS = {
    entry
    for entry in (
        line.partition("#")[0].strip()
        for line in """
 'ad
 'ar
 'd # iad
 'g # ag
 'ga
 'gam
 'gan
 'gar
 'gur
 'm # am
 'n # an
 'n seo
 'na
 'nad
 'nam
 'nan
 'nar
 'nuair
 'nur
 's
 'sa
 'san
 'sann
 'se
 'sna
 a
 a'
 a'd # agad
 a'm # agam
 a-chèile
 a-seo
 a-sin
 a-siud
 a chionn
 a chionn 's
 a chèile
 a chéile
 a dh'
 a h-uile
 a seo
 ac' # aca
 aca
 aca-san
 acasan
 ach
 ag
 agad
 agad-sa
 agads'
 agadsa
 agaibh
 agaibhse
 againn
 againne
 agam
 agam-sa
 agams'
 agamsa
 agus
 aice
 aice-se
 aicese
 aig
 aig' # aige
 aige
 aige-san
 aigesan
 air
 air-san
 air neo
 airsan
 am
 an
 an seo
 an sin
 an siud
 an uair
 ann
 ann a
 ann a'
 ann a shin
 ann am
 ann an
 annad
 annam
 annam-s'
 annamsa
 anns
 anns an
 annta
 aon
 ar
 as
 asad
 asda
 asta
 b'
 bho
 bhon
 bhuaidhe # bhuaithe
 bhuainn
 bhuaipe
 bhuaithe
 bhuapa
 bhur
 brì
 bu
 c'à
 car son
 carson
 cha
 chan
 chionn
 choir
 chon
 chun
 chèile
 chéile
 chòir
 cia mheud
 ciamar
 co-dhiubh
 cuide
 cuin
 cuin'
 cuine
 cà
 cà'
 càil
 càit
 càit'
 càite
 cò
 cò mheud
 có
 d'
 da
 de
 dh'
 dha
 dhaibh
 dhaibh-san
 dhaibhsan
 dhan
 dhasan
 dhe
 dhen
 dheth
 dhi
 dhiom
 dhiot
 dhith
 dhiubh
 dhomh
 dhomh-s'
 dhomhsa
 dhu'sa # dhut-sa
 dhuibh
 dhuibhse
 dhuinn
 dhuinne
 dhuit
 dhut
 dhutsa
 dhut-sa
 dhà
 dhà-san
 dhàsan
 dhòmhsa
 diubh
 do
 docha
 don
 dà
 dè
 dè mar
 dé
 dé mar
 dòch'
 dòcha
 e
 eadar
 eatarra
 eatorra
 eile
 esan
 fa
 far
 feud
 fhad
 fheudar
 fhearr
 fhein
 fheudar
 fheàrr
 fhèin
 fhéin
 fhìn
 fo
 fodha
 fodhainn
 foipe
 fon
 fèin
 ga
 gach
 gam
 gan
 ge brith
 ged
 gu
 gu dè
 gu ruige
 gun
 gur
 gus
 i
 iad
 iadsan
 innte
 is
 ise
 le
 leam
 leam-sa
 leamsa
 leat
 leat-sa
 leatha
 leatsa
 leibh
 leis
 leis-san
 leoth'
 leotha
 leotha-san
 linn
 m'
 m'a
 ma
 mac
 man
 mar
 mas
 mathaid
 mi
 mis'
 mise
 mo
 mu
 mu 'n
 mun
 mur
 mura
 mus
 na
 na b'
 na bu
 na iad
 nach
 nad
 nam
 nan
 nar
 nas
 neo
 no
 nuair
 o
 o'n
 oir
 oirbh
 oirbh-se
 oirnn
 oirnne
 oirre
 on
 orm
 orm-sa
 ormsa
 orra
 orra-san
 orrasan
 ort
 os
 r'
 ri
 ribh
 rinn
 ris
 rithe
 rithe-se
 rium
 rium-sa
 riums'
 riumsa
 riut
 riuth'
 riutha
 riuthasan
 ro
 ro'n
 roimh
 roimhe
 romhainn
 romham
 romhpa
 ron
 ruibh
 ruinn
 ruinne
 sa
 san
 sann
 se
 seach
 seo
 seothach
 shin
 sibh
 sibh-se
 sibhse
 sin
 sineach
 sinn
 sinne
 siod
 siodach
 siud
 siudach
 sna # ann an
 sè
 t'
 tarsaing
 tarsainn
 tarsuinn
 thar
 thoigh
 thro
 thu
 thuc'
 thuca
 thugad
 thugaibh
 thugainn
 thugam
 thugamsa
 thuice
 thuige
 thus'
 thusa
 timcheall
 toigh
 toil
 tro
 tro' # troimh
 troimh
 troimhe
 tron
 tu
 tusa
 uair
 ud
 ugaibh
 ugam-s'
 ugam-sa
 uice
 uige
 uige-san
 umad
 unnta # ann an
 ur
 urrainn
 à
 às
 àsan
 á
 ás
 è
 ì
 ò
 ó
 """.split(
            "\n"
        )
    )
    if entry
 }
--- a/spacy/lang/gd/tokenizer_exceptions.py
+++ b/spacy/lang/gd/tokenizer_exceptions.py
--- a/spacy/lang/grc/punctuation.py
+++ b/spacy/lang/grc/punctuation.py
@ -15,7 +15,6 @@ _prefixes = (
    [
        "†",
        "⸏",
        "〈",
    ]
    + LIST_PUNCT
    + LIST_ELLIPSES
@ -32,7 +31,6 @@ _suffixes = (
    + [
        "†",
        "⸎",
        "〉",
        r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
    ]
 )
--- a/spacy/lang/hr/lemma_lookup_license.txt
+++ b/spacy/lang/hr/lemma_lookup_license.txt
@ -1,5 +1,5 @@
 The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
-Reldi-tagger is licensed under the Apache 2.0 licence.
+Reldi-tagger is licensed under the Apache 2.0 licence.
@InProceedings{ljubesic16-new,
  author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
--- a/spacy/lang/ht/init.py
+++ b/spacy/lang/ht/init.py
@ -1,52 +0,0 @@
 from typing import Callable, Optional
 from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import HaitianCreoleLemmatizer
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 class HaitianCreoleDefaults(BaseDefaults):
    """Defaults for Haitian Creole, wiring together the language data
    imported from the sibling modules of this package."""
    # Tokenization rules.
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    # Lexical data.
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
    tag_map = TAG_MAP
    # Syntax helpers (noun chunks).
    syntax_iterators = SYNTAX_ITERATORS
 class HaitianCreole(Language):
    """Language subclass for Haitian Creole, identified by the code ``ht``."""
    Defaults = HaitianCreoleDefaults
    lang = "ht"
@HaitianCreole.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={
        "model": None,
        "mode": "rule",
        "overwrite": False,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
    default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
    scorer: Optional[Callable],
 ):
    return HaitianCreoleLemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )
 __all__ = ["HaitianCreole"]
--- a/spacy/lang/ht/examples.py
+++ b/spacy/lang/ht/examples.py
@ -1,18 +0,0 @@
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.ht.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 sentences = [
    "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
    "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
    "San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
    "Lond se yon gwo vil nan Wayòm Ini",
    "Kote ou ye?",
    "Kilès ki prezidan Lafrans?",
    "Ki kapital Etazini?",
    "Kile Barack Obama te fèt?",
 ]
--- a/spacy/lang/ht/lemmatizer.py
+++ b/spacy/lang/ht/lemmatizer.py
@ -1,51 +0,0 @@
 from typing import List, Tuple
 from ...pipeline import Lemmatizer
 from ...tokens import Token
 from ...lookups import Lookups
 class HaitianCreoleLemmatizer(Lemmatizer):
    """
    Minimal Haitian Creole lemmatizer.
    In "rule" mode the lemma is currently just the lower-cased token text:
    no rule or lookup table is consulted by the code in this class yet.
    """
    def is_base_form(self, token: Token) -> bool:
        """Return True if `token` is a noun, verb, adjective or adverb that
        carries no morphological features, or only the unmarked one for its
        part of speech (singular noun, infinitive verb, positive adjective).
        Any other token yields False.
        """
        morph = token.morph.to_dict()
        upos = token.pos_.lower()
        # Only open-class words are candidates.
        if upos not in {"noun", "verb", "adj", "adv"}:
            return False
        # Consider unmarked forms to be base
        if not morph:
            return True
        if upos == "noun" and morph.get("Number") == "Sing":
            return True
        if upos == "verb" and morph.get("VerbForm") == "Inf":
            return True
        if upos == "adj" and morph.get("Degree") == "Pos":
            return True
        return False
    def rule_lemmatize(self, token: Token) -> List[str]:
        """Return the candidate lemmas for `token` as a one-element list
        holding the lower-cased token text. Results are memoised in
        ``self.cache`` under the key ``(token.orth, token.pos)``.
        """
        cache_key = (token.orth, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        # fallback rule: just return lowercased form
        forms = [token.text.lower()]
        self.cache[cache_key] = forms
        return forms
    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return ``(required, optional)`` lookup table names for `mode`.
        "rule" mode requires four lemma tables and no optional ones; any
        other mode defers to the parent class.
        """
        if mode == "rule":
            # NOTE(review): rule_lemmatize above never reads these tables;
            # confirm they are still meant to be required.
            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
            return (required, [])
        return super().get_lookups_config(mode)
--- a/spacy/lang/ht/lex_attrs.py
+++ b/spacy/lang/ht/lex_attrs.py
@ -1,78 +0,0 @@
 from ...attrs import LIKE_NUM, NORM
 # Cardinal numbers in Creole
 _num_words = set(
    """
 zewo youn en de twa kat senk sis sèt uit nèf dis
 onz douz trèz katoz kenz sèz disèt dizwit diznèf
 vent trant karant sinkant swasant swasann-dis
 san mil milyon milya
 """.split()
 )
 # Ordinal numbers in Creole (some are French-influenced, some simplified)
 _ordinal_words = set(
    """
 premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
 onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
 ventyèm trantyèm karantyèm sinkantyèm swasantyèm
 swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
 """.split()
 )
 NORM_MAP = {
    "'m": "mwen",
    "'w": "ou",
    "'l": "li",
    "'n": "nou",
    "'y": "yo",
    "’m": "mwen",
    "’w": "ou",
    "’l": "li",
    "’n": "nou",
    "’y": "yo",
    "m": "mwen",
    "n": "nou",
    "l": "li",
    "y": "yo",
    "w": "ou",
    "t": "te",
    "k": "ki",
    "p": "pa",
    "M": "Mwen",
    "N": "Nou",
    "L": "Li",
    "Y": "Yo",
    "W": "Ou",
    "T": "Te",
    "K": "Ki",
    "P": "Pa",
 }
 def like_num(text):
    """Return True if `text` looks like a number.

    After trimming whitespace, lower-casing, dropping one leading sign
    (+, -, ±, ~) and removing "," and ".", the text counts as a number when
    it is all digits, a digit/digit fraction, a known cardinal or ordinal
    word, or digits followed by the ordinal ending "yèm".
    """
    candidate = text.strip().lower()
    if candidate.startswith(("+", "-", "±", "~")):
        candidate = candidate[1:]
    candidate = candidate.replace(",", "").replace(".", "")
    if candidate.isdigit():
        return True
    # Exactly one slash with digits on both sides, e.g. "3/4".
    numerator, slash, denominator = candidate.partition("/")
    if slash and "/" not in denominator:
        if numerator.isdigit() and denominator.isdigit():
            return True
    if candidate in _num_words or candidate in _ordinal_words:
        return True
    # Handle things like "3yèm", "10yèm", "25yèm", etc.
    return candidate.endswith("yèm") and candidate[:-3].isdigit()
 def norm_custom(text):
    """Return the NORM for `text`: its NORM_MAP entry when the exact text is
    a key there, otherwise the lower-cased text."""
    if text in NORM_MAP:
        return NORM_MAP[text]
    return text.lower()
 LEX_ATTRS = {
    LIKE_NUM: like_num,
    NORM: norm_custom,
 }
--- a/spacy/lang/ht/punctuation.py
+++ b/spacy/lang/ht/punctuation.py
@ -1,43 +0,0 @@
 from ..char_classes import (
    ALPHA,
    ALPHA_LOWER,
    ALPHA_UPPER,
    CONCAT_QUOTES,
    HYPHENS,
    LIST_PUNCT,
    LIST_QUOTES,
    LIST_ELLIPSES,
    LIST_ICONS,
    merge_chars,
 )
 ELISION = "'’".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()
 # Prefix patterns: the shared punctuation and quote lists, plus one
 # Haitian Creole pattern for elided one-letter forms.
 TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
    # One of the letters in _prefixes_elision followed by an apostrophe from
    # ELISION, only when a letter comes next (e.g. the "m'" of "m'ap").
    # NOTE(review): presumably merge_chars turns the space-separated letters
    # into a regex alternation -- confirm against char_classes.
    r"(?:({pe})[{el}])(?=[{a}])".format(
        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
    )
 ]
 # Suffix patterns: the shared punctuation, quote and ellipsis lists, plus
 # Haitian Creole specific patterns. Every pattern containing a {placeholder}
 # must be passed through .format(); otherwise the braces stay in the regex
 # as literal characters.
 TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
    r"(?<=[0-9])%",  # numbers like 10%
    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
    # Fixed: this pattern was missing .format(a=ALPHA), so the lookbehind
    # only accepted "{", "a", "}" or a digit instead of any letter or digit.
    r"(?<=[{a}0-9])\)".format(a=ALPHA),  # right parenthesis after letter/number
    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
 ]
 # Infix patterns: the shared ellipsis and icon lists, plus the patterns below.
 TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
    # Arithmetic operator between a digit and a digit or minus sign.
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # Period between a lower-case letter/quote and an upper-case letter/quote.
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    # Comma directly between two letters.
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # Hyphen between a letter/digit and a letter.
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    # Zero-width match after "letter + apostrophe" when a letter follows.
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
 ]
--- a/spacy/lang/ht/stop_words.py
+++ b/spacy/lang/ht/stop_words.py
@ -1,50 +0,0 @@
 # Whitespace-separated Haitian Creole stop words; repeats in the text below
 # are harmless because the result is a set.
 STOP_WORDS = set(
    """
 a ak an ankò ant apre ap atò avan avanlè
 byen bò byenke
 chak
 de depi deja deja
 e en epi èske
 fò fòk
 gen genyen
 ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman
 la l laa le lè li lye lò
 m m' mwen
 nan nap nou n'
 ou oumenm
 pa paske pami pandan pito pou pral preske pwiske
 se selman si sou sòt
 ta tap tankou te toujou tou tan tout toutotan twòp tèl
 w w' wi wè
 y y' yo yon yonn
 non o oh eh
 sa san si swa si
 men mèsi oswa osinon
 """
 .split()
 )
 # Elided one-letter forms, added once per apostrophe character (straight,
 # right curly, left curly).
 contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
 for word in contractions:
    for apostrophe in ("'", "’", "‘"):
        STOP_WORDS.add(word.replace("'", apostrophe))
--- a/spacy/lang/ht/syntax_iterators.py
+++ b/spacy/lang/ht/syntax_iterators.py
@ -1,74 +0,0 @@
 from typing import Iterator, Tuple, Union
 from ...errors import Errors
 from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Yield base noun phrases of a dependency-parsed Doc or Span as
    (start, end, label) triples, where `end` is exclusive and `label` is
    the string-store id of "NP".
    Raises ValueError (Errors.E029) when the doc has no "DEP" annotation.
    """
    doc = doclike.doc
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    intern = doc.vocab.strings.add
    # Dependency labels whose noun/pronoun head starts a chunk.
    np_deps = {
        intern(label)
        for label in ["nsubj", "obj", "obl", "nmod", "appos", "ROOT"]
    }
    # Right-hand modifiers that extend a chunk.
    np_mods = {
        intern(mod) for mod in ["compound", "flat", "flat:name", "fixed"]
    }
    conj_label = intern("conj")
    np_label = intern("NP")
    adp_pos = intern("ADP")
    cc_pos = intern("CCONJ")
    prev_end = -1
    for word in doclike:
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Never emit a chunk that overlaps the previous one.
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            right_end = word
            # Stretch to the right over listed modifiers and noun children.
            for child in word.rights:
                if child.dep in np_mods or child.pos == NOUN:
                    right_end = child.right_edge
            start = word.left_edge.i
            # A leading adposition is left out of the chunk.
            if word.left_edge.pos == adp_pos:
                start += 1
            prev_end = right_end.i
            yield start, right_end.i + 1, np_label
        elif word.dep == conj_label:
            # Climb leftwards through the conjunction chain to its first head.
            head = word.head
            while head.dep == conj_label and head.head.i < head.i:
                head = head.head
            if head.dep in np_deps:
                start = word.left_edge.i
                # A leading coordinating conjunction is left out of the chunk.
                if word.left_edge.pos == cc_pos:
                    start += 1
                prev_end = word.i
                yield start, word.i + 1, np_label
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
--- a/spacy/lang/ht/tag_map.py
+++ b/spacy/lang/ht/tag_map.py
@ -1,21 +0,0 @@
 from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
 # Identity tag map: each tag name maps to {"pos": <symbol of the same name>}.
 TAG_MAP = {
    tag: {"pos": symbol}
    for tag, symbol in (
        ("NOUN", NOUN),
        ("VERB", VERB),
        ("AUX", AUX),
        ("ADJ", ADJ),
        ("ADV", ADV),
        ("PRON", PRON),
        ("DET", DET),
        ("ADP", ADP),
        ("SCONJ", SCONJ),
        ("CCONJ", CCONJ),
        ("PART", PART),
        ("INTJ", INTJ),
        ("NUM", NUM),
        ("PROPN", PROPN),
        ("PUNCT", PUNCT),
        ("SYM", SYM),
        ("X", X),
    )
 }
--- a/spacy/lang/ht/tokenizer_exceptions.py
+++ b/spacy/lang/ht/tokenizer_exceptions.py
@ -1,121 +0,0 @@
 from spacy.symbols import ORTH, NORM
 def make_variants(base, first_norm, second_orth, second_norm):
    """Build tokenizer exceptions for an apostrophe contraction.

    `base` (e.g. "m'ap") is split into the part before the first straight
    apostrophe plus that apostrophe, followed by `second_orth`. Two entries
    are returned: one keyed by `base` and one keyed by `base.capitalize()`,
    in which the first token and its NORM are capitalised as well.
    """
    head = base.split("'")[0]
    tail = {ORTH: second_orth, NORM: second_norm}
    variants = {}
    variants[base] = [{ORTH: head + "'", NORM: first_norm}, dict(tail)]
    variants[base.capitalize()] = [
        {ORTH: head.capitalize() + "'", NORM: first_norm.capitalize()},
        dict(tail),
    ]
    return variants
 TOKENIZER_EXCEPTIONS = {
    "Dr.": [{ORTH: "Dr."}]
 }
 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te"))
 TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral"))
 TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", "ki", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
 # Non-apostrophe contractions (with capitalized variants)
 # Each row is (first token, its NORM, second token, its NORM). For every row
 # two exceptions are generated: the lower-case spelling and a variant whose
 # first token and first NORM are capitalised. The key is the two tokens
 # written together, e.g. "map" / "Map".
 TOKENIZER_EXCEPTIONS.update(
    {
        first + second: [
            {ORTH: first, NORM: first_norm},
            {ORTH: second, NORM: second_norm},
        ]
        for lower, lower_norm, second, second_norm in (
            ("m", "mwen", "ap", "ap"),
            ("le", "le", "m", "mwen"),
            ("le", "le", "w", "ou"),
            ("n", "nou", "ap", "ap"),
            ("l", "li", "ap", "ap"),
            ("y", "yo", "ap", "ap"),
            ("m", "mwen", "te", "te"),
            ("m", "mwen", "pral", "pral"),
            ("w", "ou", "ap", "ap"),
            ("k", "ki", "ap", "ap"),
            ("t", "te", "ap", "ap"),
        )
        for first, first_norm in (
            (lower, lower_norm),
            (lower.capitalize(), lower_norm.capitalize()),
        )
    }
 )
--- a/spacy/lang/ja/init.py
+++ b/spacy/lang/ja/init.py
@ -32,6 +32,7 @@ split_mode = null
 """
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
 def create_tokenizer(split_mode: Optional[str] = None):
    def japanese_tokenizer_factory(nlp):
        return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
--- a/spacy/lang/kmr/init.py
+++ b/spacy/lang/kmr/init.py
@ -1,16 +0,0 @@
 from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 class KurmanjiDefaults(BaseDefaults):
    """Defaults for the ``kmr`` language class: its lexical attribute
    getters and its stop-word list."""
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
 class Kurmanji(Language):
    """Language subclass for Kurmanji, identified by the code ``kmr``."""
    Defaults = KurmanjiDefaults
    lang = "kmr"
 __all__ = ["Kurmanji"]
--- a/spacy/lang/kmr/examples.py
+++ b/spacy/lang/kmr/examples.py
@ -1,17 +0,0 @@
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.kmr.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 sentences = [
    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
 ]
--- a/spacy/lang/kmr/lex_attrs.py
+++ b/spacy/lang/kmr/lex_attrs.py
@ -1,138 +0,0 @@
 from ...attrs import LIKE_NUM
 _num_words = [
    "sifir",
    "yek",
    "du",
    "sê",
    "çar",
    "pênc",
    "şeş",
    "heft",
    "heşt",
    "neh",
    "deh",
    "yazde",
    "dazde",
    "sêzde",
    "çarde",
    "pazde",
    "şazde",
    "hevde",
    "hejde",
    "nozde",
    "bîst",
    "sî",
    "çil",
    "pêncî",
    "şêst",
    "heftê",
    "heştê",
    "nod",
    "sed",
    "hezar",
    "milyon",
    "milyar",
 ]
 _ordinal_words = [
    "yekem",
    "yekemîn",
    "duyem",
    "duyemîn",
    "sêyem",
    "sêyemîn",
    "çarem",
    "çaremîn",
    "pêncem",
    "pêncemîn",
    "şeşem",
    "şeşemîn",
    "heftem",
    "heftemîn",
    "heştem",
    "heştemîn",
    "nehem",
    "nehemîn",
    "dehem",
    "dehemîn",
    "yazdehem",
    "yazdehemîn",
    "dazdehem",
    "dazdehemîn",
    "sêzdehem",
    "sêzdehemîn",
    "çardehem",
    "çardehemîn",
    "pazdehem",
    "pazdehemîn",
    "şanzdehem",
    "şanzdehemîn",
    "hevdehem",
    "hevdehemîn",
    "hejdehem",
    "hejdehemîn",
    "nozdehem",
    "nozdehemîn",
    "bîstem",
    "bîstemîn",
    "sîyem",
    "sîyemîn",
    "çilem",
    "çilemîn",
    "pêncîyem",
    "pênciyemîn",
    "şêstem",
    "şêstemîn",
    "heftêyem",
    "heftêyemîn",
    "heştêyem",
    "heştêyemîn",
    "notem",
    "notemîn",
    "sedem",
    "sedemîn",
    "hezarem",
    "hezaremîn",
    "milyonem",
    "milyonemîn",
    "milyarem",
    "milyaremîn",
 ]
 def like_num(text):
    """Return True if `text` looks like a number.

    One leading sign (+, -, ±, ~) is dropped and "," / "." are removed. The
    rest counts as a number when it is all digits, a digit/digit fraction,
    a known cardinal or ordinal word (case-insensitive), or digits followed
    by an ordinal ending recognised by is_digit().
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Exactly one slash with digits on both sides, e.g. "1/2".
    parts = text.split("/")
    if len(parts) == 2 and all(part.isdigit() for part in parts):
        return True
    lowered = text.lower()
    return (
        lowered in _num_words
        # Check ordinal number
        or lowered in _ordinal_words
        or is_digit(lowered)
    )
 def is_digit(text):
    """Return True if `text` is a run of digits followed by one of the
    endings "em", "yem", "emîn" or "yemîn" (e.g. "3yem")."""
    return any(
        text.endswith(ending) and text[: -len(ending)].isdigit()
        for ending in ("em", "yem", "emîn", "yemîn")
    )
 LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/kmr/stop_words.py
+++ b/spacy/lang/kmr/stop_words.py
@ -1,44 +0,0 @@
 STOP_WORDS = set(
    """
 û
 li
 bi
 di
 da
 de
 ji
 ku
 ew
 ez
 tu
 em
 hûn
 ew
 ev
 min
 te
 wî
 wê
 me
 we
 wan
 vê
 vî
 va
 çi
 kî
 kê
 çawa
 çima
 kengî
 li ku
 çend
 çiqas
 her
 hin
 gelek
 hemû
 kes
 tişt
 """.split()
 )
--- a/spacy/lang/ko/init.py
+++ b/spacy/lang/ko/init.py
@ -20,6 +20,7 @@ DEFAULT_CONFIG = """
 """
@registry.tokenizers("spacy.ko.KoreanTokenizer")
 def create_tokenizer():
    def korean_tokenizer_factory(nlp):
        return KoreanTokenizer(nlp.vocab)
--- a/spacy/lang/mk/init.py
+++ b/spacy/lang/mk/init.py
@ -24,6 +24,12 @@ class MacedonianDefaults(BaseDefaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        """Return a MacedonianLemmatizer built on `lookups`, or on a fresh
        empty Lookups() when none is given. `nlp` is accepted but unused."""
        return MacedonianLemmatizer(Lookups() if lookups is None else lookups)
 class Macedonian(Language):
    lang = "mk"
--- a/spacy/lang/nn/init.py
+++ b/spacy/lang/nn/init.py
@ -1,20 +0,0 @@
 from ...language import BaseDefaults, Language
 from ..nb import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class NorwegianNynorskDefaults(BaseDefaults):
    """Defaults for Norwegian Nynorsk: its own punctuation rules and
    tokenizer exceptions, with the syntax iterators imported from ``..nb``."""
    # Tokenization rules defined in this package.
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    # Reused from the ``nb`` language package.
    syntax_iterators = SYNTAX_ITERATORS
 class NorwegianNynorsk(Language):
    """Language subclass for Norwegian Nynorsk, identified by the code ``nn``."""
    Defaults = NorwegianNynorskDefaults
    lang = "nn"
 __all__ = ["NorwegianNynorsk"]
--- a/spacy/lang/nn/examples.py
+++ b/spacy/lang/nn/examples.py
@ -1,15 +0,0 @@
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.nn.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 # sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
 sentences = [
    "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
    "Det er ein meir enn i same periode i fjor.",
    "Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
    "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
 ]
--- a/spacy/lang/nn/punctuation.py
+++ b/spacy/lang/nn/punctuation.py
@ -1,74 +0,0 @@
 from ..char_classes import (
    ALPHA,
    ALPHA_LOWER,
    ALPHA_UPPER,
    CONCAT_QUOTES,
    CURRENCY,
    LIST_CURRENCY,
    LIST_ELLIPSES,
    LIST_ICONS,
    LIST_PUNCT,
    LIST_QUOTES,
    PUNCT,
    UNITS,
 )
 from ..punctuation import TOKENIZER_SUFFIXES
 # Quote characters without the ASCII apostrophe.
 _quotes = CONCAT_QUOTES.replace("'", "")
 # Shared punctuation list without "#".
 _list_punct = [x for x in LIST_PUNCT if x != "#"]
 # Shared icon list without a bare "°" entry ...
 _list_icons = [x for x in LIST_ICONS if x != "°"]
 # ... and with the literal text \u00B0 removed from the remaining entries.
 _list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
 # Shared quote list without the escaped-apostrophe entry (backslash + ').
 _list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
 _prefixes = (
    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
    + _list_punct
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_CURRENCY
    + LIST_ICONS
 )
 _infixes = (
    LIST_ELLIPSES
    + _list_icons
    + [
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
    ]
 )
 # Suffix patterns: the shared lists (with the filtered quote and icon
 # variants defined above), dashes, and the language-specific regexes below.
 _suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + _list_quotes
    + _list_icons
    + ["—", "–"]
    + [
        # "+" directly after a digit.
        r"(?<=[0-9])\+",
        # Period after a degree sign followed by one of F/f/C/c/K/k.
        r"(?<=°[FfCcKk])\.",
        # Currency symbol or unit directly after a digit.
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        # Period after a lower-case letter, one of % ² - +, punctuation
        # or a quote character.
        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
        ),
        # Period after two upper-case letters.
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
    # Apostrophe, unless it follows one of s, S, x, X, z, Z.
    + [r"(?<=[^sSxXzZ])'"]
 )
 # Append the shared suffix patterns imported from ..punctuation, leaving out
 # the 's-style entries and the escaped apostrophe.
 _suffixes += [
    suffix
    for suffix in TOKENIZER_SUFFIXES
    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
 ]
 TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
 TOKENIZER_SUFFIXES = _suffixes
--- a/spacy/lang/nn/tokenizer_exceptions.py
+++ b/spacy/lang/nn/tokenizer_exceptions.py
@ -1,228 +0,0 @@
 from ...symbols import NORM, ORTH
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 _exc = {}
 for exc_data in [
    {ORTH: "jan.", NORM: "januar"},
    {ORTH: "feb.", NORM: "februar"},
    {ORTH: "mar.", NORM: "mars"},
    {ORTH: "apr.", NORM: "april"},
    {ORTH: "jun.", NORM: "juni"},
    # note: "jul." is in the simple list below without a NORM exception
    {ORTH: "aug.", NORM: "august"},
    {ORTH: "sep.", NORM: "september"},
    {ORTH: "okt.", NORM: "oktober"},
    {ORTH: "nov.", NORM: "november"},
    {ORTH: "des.", NORM: "desember"},
 ]:
    _exc[exc_data[ORTH]] = [exc_data]
 # Abbreviations kept as a single token without a NORM.
 # "sep." is deliberately NOT listed here: it is already registered by the
 # month loop with NORM "september", and listing it again would overwrite
 # that entry with a NORM-less one.
 for orth in [
    "Ap.",
    "Aq.",
    "Ca.",
    "Chr.",
    "Co.",
    "Dr.",
    "F.eks.",
    "Fr.p.",
    "Frp.",
    "Grl.",
    "Kr.",
    "Kr.F.",
    "Kr.F.s",
    "Mr.",
    "Mrs.",
    "Pb.",
    "Pr.",
    "Sp.",
    "St.",
    "a.m.",
    "ad.",
    "adm.dir.",
    "adr.",
    "b.c.",
    "bl.a.",
    "bla.",
    "bm.",
    "bnr.",
    "bto.",
    "c.c.",
    "ca.",
    "cand.mag.",
    "co.",
    "d.d.",
    "d.m.",
    "d.y.",
    "dept.",
    "dr.",
    "dr.med.",
    "dr.philos.",
    "dr.psychol.",
    "dss.",
    "dvs.",
    "e.Kr.",
    "e.l.",
    "eg.",
    "eig.",
    "ekskl.",
    "el.",
    "et.",
    "etc.",
    "etg.",
    "ev.",
    "evt.",
    "f.",
    "f.Kr.",
    "f.eks.",
    "f.o.m.",
    "fhv.",
    "fk.",
    "foreg.",
    "fork.",
    "fv.",
    "fvt.",
    "g.",
    "gl.",
    "gno.",
    "gnr.",
    "grl.",
    "gt.",
    "h.r.adv.",
    "hhv.",
    "hoh.",
    "hr.",
    "ifb.",
    "ifm.",
    "iht.",
    "inkl.",
    "istf.",
    "jf.",
    "jr.",
    "jul.",
    "juris.",
    "kfr.",
    "kgl.",
    "kgl.res.",
    "kl.",
    "komm.",
    "kr.",
    "kst.",
    "lat.",
    "lø.",
    "m.a.",
    "m.a.o.",
    "m.fl.",
    "m.m.",
    "m.v.",
    "ma.",
    "mag.art.",
    "md.",
    "mfl.",
    "mht.",
    "mill.",
    "min.",
    "mnd.",
    "moh.",
    "mrd.",
    "muh.",
    "mv.",
    "mva.",
    "n.å.",
    "ndf.",
    "nr.",
    "nto.",
    "nyno.",
    "o.a.",
    "o.l.",
    "obl.",
    "off.",
    "ofl.",
    "on.",
    "op.",
    "org.",
    "osv.",
    "ovf.",
    "p.",
    "p.a.",
    "p.g.a.",
    "p.m.",
    "p.t.",
    "pga.",
    "ph.d.",
    "pkt.",
    "pr.",
    "pst.",
    "pt.",
    "red.anm.",
    "ref.",
    "res.",
    "res.kap.",
    "resp.",
    "rv.",
    "s.",
    "s.d.",
    "s.k.",
    "s.u.",
    "s.å.",
    "sen.",
    "siviling.",
    "sms.",
    "snr.",
    "spm.",
    "sr.",
    "sst.",
    "st.",
    "st.meld.",
    "st.prp.",
    "stip.",
    "stk.",
    "stud.",
    "sv.",
    "såk.",
    "sø.",
    "t.d.",
    "t.h.",
    "t.o.m.",
    "t.v.",
    "temp.",
    "ti.",
    "tils.",
    "tilsv.",
    "tl;dr",
    "tlf.",
    "to.",
    "ult.",
    "utg.",
    "v.",
    "vedk.",
    "vedr.",
    "vg.",
    "vgs.",
    "vha.",
    "vit.ass.",
    "vn.",
    "vol.",
    "vs.",
    "vsa.",
    "§§",
    "©NTB",
    "årg.",
    "årh.",
 ]:
    _exc[orth] = [{ORTH: orth}]
 # Dates
 for h in range(1, 31 + 1):
    for period in ["."]:
        _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
 _custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
 _exc.update(_custom_base_exc)
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
--- a/spacy/lang/th/init.py
+++ b/spacy/lang/th/init.py
@ -13,6 +13,7 @@ DEFAULT_CONFIG = """
 """
@registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
    def thai_tokenizer_factory(nlp):
        return ThaiTokenizer(nlp.vocab)
--- a/spacy/lang/tr/examples.py
+++ b/spacy/lang/tr/examples.py
@ -15,7 +15,4 @@ sentences = [
    "Türkiye'nin başkenti neresi?",
    "Bakanlar Kurulu 180 günlük eylem planını açıkladı.",
    "Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.",
    "Cemal Sureya kimdir?",
    "Bunlari Biliyor muydunuz?",
    "Altinoluk Turkiye haritasinin neresinde yer alir?",
 ]
--- a/spacy/lang/vi/init.py
+++ b/spacy/lang/vi/init.py
@ -22,6 +22,7 @@ use_pyvi = true
 """
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
 def create_vietnamese_tokenizer(use_pyvi: bool = True):
    def vietnamese_tokenizer_factory(nlp):
        return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
--- a/spacy/lang/zh/init.py
+++ b/spacy/lang/zh/init.py
@ -46,6 +46,7 @@ class Segmenter(str, Enum):
        return list(cls.__members__.keys())
@registry.tokenizers("spacy.zh.ChineseTokenizer")
 def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
    def chinese_tokenizer_factory(nlp):
        return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
--- a/spacy/language.py
+++ b/spacy/language.py
@ -5,7 +5,7 @@ import multiprocessing as mp
 import random
 import traceback
 import warnings
-from contextlib import ExitStack, contextmanager
+from contextlib import contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
 from itertools import chain, cycle
@ -30,11 +30,8 @@ from typing import (
    overload,
 )
 import numpy
 import srsly
 from cymem.cymem import Pool
 from thinc.api import Config, CupyOps, Optimizer, get_current_ops
 from thinc.util import convert_recursive
 from . import about, ty, util
 from .compat import Literal
@ -68,7 +65,6 @@ from .util import (
    registry,
    warn_if_jupyter_cupy,
 )
 from .vectors import BaseVectors
 from .vocab import Vocab, create_vocab
 PipeCallable = Callable[[Doc], Doc]
@ -104,6 +100,7 @@ class BaseDefaults:
    writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
@registry.tokenizers("spacy.Tokenizer.v1")
 def create_tokenizer() -> Callable[["Language"], Tokenizer]:
    """Registered function to create a tokenizer. Returns a factory that takes
    the nlp object and returns a Tokenizer instance using the language defaults.
@ -129,6 +126,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
    return tokenizer_factory
@registry.misc("spacy.LookupsDataLoader.v1")
 def load_lookups_data(lang, tables):
    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
    lookups = load_lookups(lang=lang, tables=tables)
@ -141,7 +139,7 @@ class Language:
    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
-    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'.
+    lang (str): IETF language code, such as 'en'.
    DOCS: https://spacy.io/api/language
    """
@ -160,7 +158,6 @@ class Language:
        max_length: int = 10**6,
        meta: Dict[str, Any] = {},
        create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
        create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
        batch_size: int = 1000,
        **kwargs,
    ) -> None:
@ -183,9 +180,6 @@ class Language:
        DOCS: https://spacy.io/api/language#init
        """
        from .pipeline.factories import register_factories
        register_factories()
        # We're only calling this to import all factories provided via entry
        # points. The factory decorator applied to these functions takes care
        # of the rest.
@ -204,10 +198,6 @@ class Language:
        if vocab is True:
            vectors_name = meta.get("vectors", {}).get("name")
            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
            if not create_vectors:
                vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
                create_vectors = registry.resolve(vectors_cfg)["vectors"]
            vocab.vectors = create_vectors(vocab)
        else:
            if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@ -1215,7 +1205,7 @@ class Language:
                    examples,
                ):
                    eg.predicted = doc
-        return _replace_numpy_floats(losses)
+        return losses
    def rehearse(
        self,
@ -1466,7 +1456,7 @@ class Language:
        results = scorer.score(examples, per_component=per_component)
        n_words = sum(len(eg.predicted) for eg in examples)
        results["speed"] = n_words / (end_time - start_time)
-        return _replace_numpy_floats(results)
+        return results
    def create_optimizer(self):
        """Create an optimizer, usually using the [training.optimizer] config."""
@ -1687,12 +1677,6 @@ class Language:
        for proc in procs:
            proc.start()
        # Close writing-end of channels. This is needed to avoid that reading
        # from the channel blocks indefinitely when the worker closes the
        # channel.
        for tx in bytedocs_send_ch:
            tx.close()
        # Cycle channels not to break the order of docs.
        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
        byte_tuples = chain.from_iterable(
@ -1715,27 +1699,8 @@ class Language:
                    # tell `sender` that one batch was consumed.
                    sender.step()
        finally:
            # If we are stopping in an orderly fashion, the workers' queues
            # are empty. Put the sentinel in their queues to signal that work
            # is done, so that they can exit gracefully.
            for q in texts_q:
                q.put(_WORK_DONE_SENTINEL)
                q.close()
            # Otherwise, we are stopping because the error handler raised an
            # exception. The sentinel will be last to go out of the queue.
            # To avoid doing unnecessary work or hanging on platforms that
            # block on sending (Windows), we'll close our end of the channel.
            # This signals to the worker that it can exit the next time it
            # attempts to send data down the channel.
            for r in bytedocs_recv_ch:
                r.close()
            for proc in procs:
-                proc.join()
+                proc.terminate()
            if not all(proc.exitcode == 0 for proc in procs):
                warnings.warn(Warnings.W127)
    def _link_components(self) -> None:
        """Register 'listeners' within pipeline components, to allow them to
@ -1800,10 +1765,6 @@ class Language:
            ).merge(config)
        if "nlp" not in config:
            raise ValueError(Errors.E985.format(config=config))
        # fill in [nlp.vectors] if not present (as a narrower alternative to
        # auto-filling [nlp] from the default config)
        if "vectors" not in config["nlp"]:
            config["nlp"]["vectors"] = {"@vectors": "spacy.Vectors.v1"}
        config_lang = config["nlp"].get("lang")
        if config_lang is not None and config_lang != cls.lang:
            raise ValueError(
@ -1835,7 +1796,6 @@ class Language:
            filled["nlp"], validate=validate, schema=ConfigSchemaNlp
        )
        create_tokenizer = resolved_nlp["tokenizer"]
        create_vectors = resolved_nlp["vectors"]
        before_creation = resolved_nlp["before_creation"]
        after_creation = resolved_nlp["after_creation"]
        after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
@ -1856,12 +1816,7 @@ class Language:
        # inside stuff like the spacy train function. If we loaded them here,
        # then we would load them twice at runtime: once when we make from config,
        # and then again when we load from disk.
-        nlp = lang_cls(
+        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
            vocab=vocab,
            create_tokenizer=create_tokenizer,
            create_vectors=create_vectors,
            meta=meta,
        )
        if after_creation is not None:
            nlp = after_creation(nlp)
            if not isinstance(nlp, cls):
@ -1871,6 +1826,7 @@ class Language:
        # Later we replace the component config with the raw config again.
        interpolated = filled.interpolate() if not filled.is_interpolated else filled
        pipeline = interpolated.get("components", {})
        sourced = util.get_sourced_components(interpolated)
        # If components are loaded from a source (existing models), we cache
        # them here so they're only loaded once
        source_nlps = {}
@ -2003,7 +1959,7 @@ class Language:
        useful when training a pipeline with components sourced from an existing
        pipeline: if multiple components (e.g. tagger, parser, NER) listen to
        the same tok2vec component, but some of them are frozen and not updated,
-        their performance may degrade significantly as the tok2vec component is
+        their performance may degrade significally as the tok2vec component is
        updated with new data. To prevent this, listeners can be replaced with
        a standalone tok2vec layer that is owned by the component and doesn't
        change if the component isn't updated.
@ -2095,38 +2051,6 @@ class Language:
                util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
                tok2vec.remove_listener(listener, pipe_name)
    @contextmanager
    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
        """Begin a block where all resources allocated during the block will
        be freed at the end of it. If a resource was created within the
        memory zone block, accessing it outside the block is invalid.
        Behaviour of this invalid access is undefined. Memory zones should
        not be nested.

        The memory zone is helpful for services that need to process large
        volumes of text with a defined memory budget.

        The zone is entered on the vocab unconditionally, and additionally on
        the tokenizer and on every pipeline component that exposes a
        `memory_zone` attribute. All of them receive the same pool.

        mem (Optional[Pool]): Pool to use for the zone. If None, a new Pool
            is created.
        YIELDS (Pool): The pool that was passed to every entered zone.

        Example
        -------
        >>> with nlp.memory_zone():
        ...     for doc in nlp.pipe(texts):
        ...         process_my_doc(doc)
        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
        """
        if mem is None:
            mem = Pool()
        # The ExitStack allows programmatic nested context managers.
        # We don't know how many we need, so it would be awkward to have
        # them as nested blocks.
        with ExitStack() as stack:
            # `contexts` collects the values returned by each entered zone; it
            # is not read again in this method. Exiting the zones is handled
            # by `stack` when this `with` block ends.
            contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
            # The tokenizer and the components are duck-typed: anything
            # without a `memory_zone` attribute is simply left out.
            if hasattr(self.tokenizer, "memory_zone"):
                contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
            for _, pipe in self.pipeline:
                if hasattr(pipe, "memory_zone"):
                    contexts.append(stack.enter_context(pipe.memory_zone(mem)))
            # Control returns to the caller's `with` body here; the stack is
            # unwound once that body finishes or raises.
            yield mem
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
@ -2144,9 +2068,7 @@ class Language:
        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(  # type: ignore[union-attr]
            p, exclude=["vocab"]
        )
-        serializers["meta.json"] = lambda p: srsly.write_json(
+        serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
            p, _replace_numpy_floats(self.meta)
        )
        serializers["config.cfg"] = lambda p: self.config.to_disk(p)
        for name, proc in self._components:
            if name in exclude:
@ -2260,9 +2182,7 @@ class Language:
        serializers: Dict[str, Callable[[], bytes]] = {}
        serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
        serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])  # type: ignore[union-attr]
-        serializers["meta.json"] = lambda: srsly.json_dumps(
+        serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
            _replace_numpy_floats(self.meta)
        )
        serializers["config.cfg"] = lambda: self.config.to_bytes()
        for name, proc in self._components:
            if name in exclude:
@ -2313,12 +2233,6 @@ class Language:
        return self
 def _replace_numpy_floats(meta_dict: dict) -> dict:
    """Return a version of `meta_dict` in which numpy floating-point values
    have been converted to built-in Python floats.

    Elsewhere in this file the result is handed to srsly's JSON writers and
    returned from training/evaluation methods.

    meta_dict (dict): The dictionary to convert. A shallow copy
        (`dict(meta_dict)`) is what gets passed on, so the top-level mapping
        of the caller's object is not the one handed to `convert_recursive`.
    RETURNS (dict): Whatever `convert_recursive` returns for that copy.
    """
    # First lambda: selects values that are numpy floating scalars.
    # Second lambda: the replacement applied to each selected value.
    # NOTE(review): how deeply nested containers are walked, and whether they
    # are modified in place, is up to thinc.util.convert_recursive — confirm.
    return convert_recursive(
        lambda v: isinstance(v, numpy.floating), lambda v: float(v), dict(meta_dict)
    )
@dataclass
 class FactoryMeta:
    """Dataclass containing information about a component and its defaults
@ -2394,13 +2308,6 @@ def _apply_pipes(
    while True:
        try:
            texts_with_ctx = receiver.get()
            # Stop working if we encounter the end-of-work sentinel.
            if isinstance(texts_with_ctx, _WorkDoneSentinel):
                sender.close()
                receiver.close()
                return
            docs = (
                ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
            )
@ -2409,23 +2316,11 @@ def _apply_pipes(
            # Connection does not accept unpicklable objects, so send list.
            byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
            padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs))
-            data: Sequence[Tuple[Optional[bytes], Optional[Any], Optional[bytes]]] = (
+            sender.send(byte_docs + padding)  # type: ignore[operator]
                byte_docs + padding  # type: ignore[operator]
            )
        except Exception:
            error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
            padding = [(None, None, None)] * (len(texts_with_ctx) - 1)
-            data = error_msg + padding
+            sender.send(error_msg + padding)
        try:
            sender.send(data)
        except BrokenPipeError:
            # Parent has closed the pipe prematurely. This happens when a
            # worker encounters an error and the error handler is set to
            # stop processing.
            sender.close()
            receiver.close()
            return
 class _Sender:
@ -2455,10 +2350,3 @@ class _Sender:
        if self.count >= self.chunk_size:
            self.count = 0
            self.send()
 # Marker type with no state of its own. The worker loop in this file checks
 # received items with isinstance(..., _WorkDoneSentinel) and, on a match,
 # closes its sender and receiver and returns.
 class _WorkDoneSentinel:
    pass
 # Shared instance that the parent process puts on each worker's text queue
 # to signal that no more work will follow.
 _WORK_DONE_SENTINEL = _WorkDoneSentinel()
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -35,7 +35,7 @@ cdef class Lexeme:
        return self
    @staticmethod
-    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) noexcept nogil:
+    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
        if name < (sizeof(flags_t) * 8):
            Lexeme.c_set_flag(lex, name, value)
        elif name == ID:
@ -54,7 +54,7 @@ cdef class Lexeme:
            lex.lang = value
    @staticmethod
-    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) noexcept nogil:
+    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        if feat_name < (sizeof(flags_t) * 8):
            if Lexeme.c_check_flag(lex, feat_name):
                return 1
@ -82,7 +82,7 @@ cdef class Lexeme:
            return 0
    @staticmethod
-    cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) noexcept nogil:
+    cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
        cdef flags_t one = 1
        if lexeme.flags & (one << flag_id):
            return True
@ -90,7 +90,7 @@ cdef class Lexeme:
            return False
    @staticmethod
-    cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) noexcept nogil:
+    cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) nogil:
        cdef flags_t one = 1
        if value:
            lex.flags |= one << flag_id
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -1,7 +1,7 @@
 # cython: embedsignature=True
 # cython: profile=False
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
 from cython.view cimport array as cvarray
 from libc.string cimport memset
 np.import_array()
@ -70,7 +70,7 @@ cdef class Lexeme:
        if isinstance(other, Lexeme):
            a = self.orth
            b = other.orth
-        elif isinstance(other, int):
+        elif isinstance(other, long):
            a = self.orth
            b = other
        elif isinstance(other, str):
@ -104,7 +104,7 @@ cdef class Lexeme:
            # skip PROB, e.g. from lexemes.jsonl
            if isinstance(value, float):
                continue
-            elif isinstance(value, int):
+            elif isinstance(value, (int, long)):
                 Lexeme.set_struct_attr(self.c, attr, value)
            else:
                Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
@ -137,11 +137,9 @@ cdef class Lexeme:
        if hasattr(other, "orth"):
            if self.c.orth == other.orth:
                return 1.0
-        elif (
+        elif hasattr(other, "__len__") and len(other) == 1 \
-            hasattr(other, "__len__") and len(other) == 1
+        and hasattr(other[0], "orth"):
-            and hasattr(other[0], "orth")
+            if self.c.orth == other[0].orth:
            and self.c.orth == other[0].orth
        ):
                return 1.0
        if self.vector_norm == 0 or other.vector_norm == 0:
            warnings.warn(Warnings.W008.format(obj="Lexeme"))
@ -164,44 +162,41 @@ cdef class Lexeme:
        vector = self.vector
        return numpy.sqrt((vector**2).sum())
-    @property
+    property vector:
    def vector(self):
        """A real-valued meaning representation.
        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the lexeme's semantics.
        """
        def __get__(self):
            cdef int length = self.vocab.vectors_length
            if length == 0:
                raise ValueError(Errors.E010)
            return self.vocab.get_vector(self.c.orth)
-    @vector.setter
+        def __set__(self, vector):
    def vector(self, vector):
            if len(vector) != self.vocab.vectors_length:
                raise ValueError(Errors.E073.format(new_length=len(vector),
                                                    length=self.vocab.vectors_length))
            self.vocab.set_vector(self.c.orth, vector)
-    @property
+    property rank:
    def rank(self):
        """RETURNS (str): Sequential ID of the lexeme's lexical type, used
            to index into tables, e.g. for word vectors."""
        def __get__(self):
            return self.c.id
-    @rank.setter
+        def __set__(self, value):
    def rank(self, value):
            self.c.id = value
-    @property
+    property sentiment:
    def sentiment(self):
        """RETURNS (float): A scalar value indicating the positivity or
            negativity of the lexeme."""
        def __get__(self):
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
            return sentiment_table.get(self.c.orth, 0.0)
-    @sentiment.setter
+        def __set__(self, float x):
    def sentiment(self, float x):
            if "lexeme_sentiment" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_sentiment")
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
@ -219,166 +214,151 @@ cdef class Lexeme:
        """RETURNS (str): The original verbatim text of the lexeme."""
        return self.orth_
-    @property
+    property lower:
    def lower(self):
        """RETURNS (uint64): Lowercase form of the lexeme."""
        def __get__(self):
            return self.c.lower
-    @lower.setter
+        def __set__(self, attr_t x):
    def lower(self, attr_t x):
            self.c.lower = x
-    @property
+    property norm:
    def norm(self):
        """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
            lexeme text.
        """
        def __get__(self):
            return self.c.norm
-    @norm.setter
+        def __set__(self, attr_t x):
    def norm(self, attr_t x):
            if "lexeme_norm" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_norm")
            norm_table = self.vocab.lookups.get_table("lexeme_norm")
            norm_table[self.c.orth] = self.vocab.strings[x]
            self.c.norm = x
-    @property
+    property shape:
    def shape(self):
        """RETURNS (uint64): Transform of the word's string, to show
            orthographic features.
        """
        def __get__(self):
            return self.c.shape
-    @shape.setter
+        def __set__(self, attr_t x):
    def shape(self, attr_t x):
            self.c.shape = x
-    @property
+    property prefix:
    def prefix(self):
        """RETURNS (uint64): Length-N substring from the start of the word.
            Defaults to `N=1`.
        """
        def __get__(self):
            return self.c.prefix
-    @prefix.setter
+        def __set__(self, attr_t x):
    def prefix(self, attr_t x):
            self.c.prefix = x
-    @property
+    property suffix:
    def suffix(self):
        """RETURNS (uint64): Length-N substring from the end of the word.
            Defaults to `N=3`.
        """
        def __get__(self):
            return self.c.suffix
-    @suffix.setter
+        def __set__(self, attr_t x):
    def suffix(self, attr_t x):
            self.c.suffix = x
-    @property
+    property cluster:
    def cluster(self):
        """RETURNS (int): Brown cluster ID."""
        def __get__(self):
            cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
            return cluster_table.get(self.c.orth, 0)
-    @cluster.setter
+        def __set__(self, int x):
    def cluster(self, int x):
            cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
            cluster_table[self.c.orth] = x
-    @property
+    property lang:
    def lang(self):
        """RETURNS (uint64): Language of the parent vocabulary."""
        def __get__(self):
            return self.c.lang
-    @lang.setter
+        def __set__(self, attr_t x):
    def lang(self, attr_t x):
            self.c.lang = x
-    @property
+    property prob:
    def prob(self):
        """RETURNS (float): Smoothed log probability estimate of the lexeme's
            type."""
        def __get__(self):
            prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
            settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
            default_oov_prob = settings_table.get("oov_prob", -20.0)
            return prob_table.get(self.c.orth, default_oov_prob)
-    @prob.setter
+        def __set__(self, float x):
    def prob(self, float x):
            prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
            prob_table[self.c.orth] = x
-    @property
+    property lower_:
    def lower_(self):
        """RETURNS (str): Lowercase form of the word."""
        def __get__(self):
            return self.vocab.strings[self.c.lower]
-    @lower_.setter
+        def __set__(self, str x):
    def lower_(self, str x):
            self.c.lower = self.vocab.strings.add(x)
-    @property
+    property norm_:
    def norm_(self):
        """RETURNS (str): The lexeme's norm, i.e. a normalised form of the
            lexeme text.
        """
        def __get__(self):
            return self.vocab.strings[self.c.norm]
-    @norm_.setter
+        def __set__(self, str x):
    def norm_(self, str x):
            self.norm = self.vocab.strings.add(x)
-    @property
+    property shape_:
    def shape_(self):
        """RETURNS (str): Transform of the word's string, to show
            orthographic features.
        """
        def __get__(self):
            return self.vocab.strings[self.c.shape]
-    @shape_.setter
+        def __set__(self, str x):
    def shape_(self, str x):
            self.c.shape = self.vocab.strings.add(x)
-    @property
+    property prefix_:
    def prefix_(self):
        """RETURNS (str): Length-N substring from the start of the word.
            Defaults to `N=1`.
        """
        def __get__(self):
            return self.vocab.strings[self.c.prefix]
-    @prefix_.setter
+        def __set__(self, str x):
    def prefix_(self, str x):
            self.c.prefix = self.vocab.strings.add(x)
-    @property
+    property suffix_:
    def suffix_(self):
        """RETURNS (str): Length-N substring from the end of the word.
            Defaults to `N=3`.
        """
        def __get__(self):
            return self.vocab.strings[self.c.suffix]
-    @suffix_.setter
+        def __set__(self, str x):
    def suffix_(self, str x):
            self.c.suffix = self.vocab.strings.add(x)
-    @property
+    property lang_:
    def lang_(self):
        """RETURNS (str): Language of the parent vocabulary."""
        def __get__(self):
            return self.vocab.strings[self.c.lang]
-    @lang_.setter
+        def __set__(self, str x):
    def lang_(self, str x):
            self.c.lang = self.vocab.strings.add(x)
-    @property
+    property flags:
    def flags(self):
        """RETURNS (uint64): Container of the lexeme's binary flags."""
        def __get__(self):
            return self.c.flags
-    @flags.setter
+        def __set__(self, flags_t x):
    def flags(self, flags_t x):
            self.c.flags = x
    @property
@ -386,171 +366,154 @@ cdef class Lexeme:
        """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
        return self.orth not in self.vocab.vectors
-    @property
+    property is_stop:
    def is_stop(self):
        """RETURNS (bool): Whether the lexeme is a stop word."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_STOP)
-    @is_stop.setter
+        def __set__(self, bint x):
    def is_stop(self, bint x):
            Lexeme.c_set_flag(self.c, IS_STOP, x)
-    @property
+    property is_alpha:
    def is_alpha(self):
        """RETURNS (bool): Whether the lexeme consists of alphabetic
            characters. Equivalent to `lexeme.text.isalpha()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_ALPHA)
-    @is_alpha.setter
+        def __set__(self, bint x):
    def is_alpha(self, bint x):
            Lexeme.c_set_flag(self.c, IS_ALPHA, x)
-    @property
+    property is_ascii:
    def is_ascii(self):
        """RETURNS (bool): Whether the lexeme consists of ASCII characters.
            Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_ASCII)
-    @is_ascii.setter
+        def __set__(self, bint x):
    def is_ascii(self, bint x):
            Lexeme.c_set_flag(self.c, IS_ASCII, x)
-    @property
+    property is_digit:
    def is_digit(self):
        """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
            to `lexeme.text.isdigit()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_DIGIT)
-    @is_digit.setter
+        def __set__(self, bint x):
    def is_digit(self, bint x):
            Lexeme.c_set_flag(self.c, IS_DIGIT, x)
-    @property
+    property is_lower:
    def is_lower(self):
        """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
            `lexeme.text.islower()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_LOWER)
-    @is_lower.setter
+        def __set__(self, bint x):
    def is_lower(self, bint x):
            Lexeme.c_set_flag(self.c, IS_LOWER, x)
-    @property
+    property is_upper:
    def is_upper(self):
        """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
            `lexeme.text.isupper()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_UPPER)
-    @is_upper.setter
+        def __set__(self, bint x):
    def is_upper(self, bint x):
            Lexeme.c_set_flag(self.c, IS_UPPER, x)
-    @property
+    property is_title:
    def is_title(self):
        """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
            `lexeme.text.istitle()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_TITLE)
-    @is_title.setter
+        def __set__(self, bint x):
    def is_title(self, bint x):
            Lexeme.c_set_flag(self.c, IS_TITLE, x)
-    @property
+    property is_punct:
    def is_punct(self):
        """RETURNS (bool): Whether the lexeme is punctuation."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_PUNCT)
-    @is_punct.setter
+        def __set__(self, bint x):
    def is_punct(self, bint x):
            Lexeme.c_set_flag(self.c, IS_PUNCT, x)
-    @property
+    property is_space:
    def is_space(self):
        """RETURNS (bool): Whether the lexeme consist of whitespace characters.
            Equivalent to `lexeme.text.isspace()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_SPACE)
-    @is_space.setter
+        def __set__(self, bint x):
    def is_space(self, bint x):
            Lexeme.c_set_flag(self.c, IS_SPACE, x)
-    @property
+    property is_bracket:
    def is_bracket(self):
        """RETURNS (bool): Whether the lexeme is a bracket."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_BRACKET)
-    @is_bracket.setter
+        def __set__(self, bint x):
    def is_bracket(self, bint x):
            Lexeme.c_set_flag(self.c, IS_BRACKET, x)
-    @property
+    property is_quote:
    def is_quote(self):
        """RETURNS (bool): Whether the lexeme is a quotation mark."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_QUOTE)
-    @is_quote.setter
+        def __set__(self, bint x):
    def is_quote(self, bint x):
            Lexeme.c_set_flag(self.c, IS_QUOTE, x)
-    @property
+    property is_left_punct:
    def is_left_punct(self):
        """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
-    @is_left_punct.setter
+        def __set__(self, bint x):
    def is_left_punct(self, bint x):
            Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
-    @property
+    property is_right_punct:
    def is_right_punct(self):
        """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
-    @is_right_punct.setter
+        def __set__(self, bint x):
    def is_right_punct(self, bint x):
            Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
-    @property
+    property is_currency:
    def is_currency(self):
        """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_CURRENCY)
-    @is_currency.setter
+        def __set__(self, bint x):
    def is_currency(self, bint x):
            Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
-    @property
+    property like_url:
    def like_url(self):
        """RETURNS (bool): Whether the lexeme resembles a URL."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_URL)
-    @like_url.setter
+        def __set__(self, bint x):
    def like_url(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_URL, x)
-    @property
+    property like_num:
    def like_num(self):
        """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
            "10", "ten", etc.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_NUM)
-    @like_num.setter
+        def __set__(self, bint x):
    def like_num(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_NUM, x)
-    @property
+    property like_email:
    def like_email(self):
        """RETURNS (bool): Whether the lexeme resembles an email address."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
-    @like_email.setter
+        def __set__(self, bint x):
    def like_email(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
--- a/spacy/matcher/init.py
+++ b/spacy/matcher/init.py
@ -3,4 +3,4 @@ from .levenshtein import levenshtein
 from .matcher import Matcher
 from .phrasematcher import PhraseMatcher
-__all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"]
+__all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"]
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@ -1,4 +1,4 @@
-# cython: infer_types=True
+# cython: infer_types=True, profile=True
 import warnings
 from collections import defaultdict
 from itertools import product
@ -108,7 +108,7 @@ cdef class DependencyMatcher:
        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
-        return self.has_key(key)  # no-cython-lint: W601
+        return self.has_key(key)
    def _validate_input(self, pattern, key):
        idx = 0
@ -129,7 +129,6 @@ cdef class DependencyMatcher:
            else:
                required_keys = {"RIGHT_ID", "RIGHT_ATTRS", "REL_OP", "LEFT_ID"}
                relation_keys = set(relation.keys())
                # Identify required keys that have not been specified
                missing = required_keys - relation_keys
                if missing:
                    missing_txt = ", ".join(list(missing))
@ -137,13 +136,6 @@ cdef class DependencyMatcher:
                        required=required_keys,
                        missing=missing_txt
                    ))
                # Identify additional, unsupported keys
                unsupported = relation_keys - required_keys
                if unsupported:
                    unsupported_txt = ", ".join(list(unsupported))
                    warnings.warn(Warnings.W126.format(
                        unsupported=unsupported_txt
                    ))
                if (
                    relation["RIGHT_ID"] in visited_nodes
                    or relation["LEFT_ID"] not in visited_nodes
@ -272,7 +264,7 @@ cdef class DependencyMatcher:
    def remove(self, key):
        key = self._normalize_key(key)
-        if key not in self._patterns:
+        if not key in self._patterns:
            raise ValueError(Errors.E175.format(key=key))
        self._patterns.pop(key)
        self._raw_patterns.pop(key)
--- a/spacy/matcher/levenshtein.pyx
+++ b/spacy/matcher/levenshtein.pyx
@ -1,4 +1,4 @@
-# cython: binding=True, infer_types=True, language_level=3
+# cython: profile=True, binding=True, infer_types=True
 from cpython.object cimport PyObject
 from libc.stdint cimport int64_t
@ -27,5 +27,6 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
@registry.misc("spacy.levenshtein_compare.v1")
 def make_levenshtein_compare():
    return levenshtein_compare
--- a/Show More
+++ b/Show More
		`@ -1 +0,0 @@`
			`custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]`