Merge branch 'develop' into feature/pymorphy-lemmatizer-diacritics

2026-01-09 02:01:22 +03:00 · 2023-08-01 16:16:53 +02:00 · 2023-08-01 16:16:53 +02:00 · 135a28a89d
commit 135a28a89d
parent b1e47b50b9 0fe43f40f1
596 changed files with 10498 additions and 5162 deletions
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@ -1,118 +0,0 @@
-parameters:
-  python_version: ''
-  architecture: 'x64'
-  num_build_jobs: 2
-
-steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: ${{ parameters.python_version }}
-      architecture: ${{ parameters.architecture }}
-      allowUnstable: true
-
-  - bash: |
-      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
-    displayName: 'Set variables'
-
-  - script: |
-      python -m pip install -U build pip setuptools
-      python -m pip install -U -r requirements.txt
-    displayName: "Install dependencies"
-
-  - script: |
-      python -m build --sdist
-    displayName: "Build sdist"
-
-  - script: |
-      python -m mypy spacy
-    displayName: 'Run mypy'
-    condition: ne(variables['python_version'], '3.6')
-
-  - task: DeleteFiles@1
-    inputs:
-      contents: "spacy"
-    displayName: "Delete source directory"
-
-  - task: DeleteFiles@1
-    inputs:
-      contents: "*.egg-info"
-    displayName: "Delete egg-info directory"
-
-  - script: |
-      python -m pip freeze > installed.txt
-      python -m pip uninstall -y -r installed.txt
-    displayName: "Uninstall all packages"
-
-  - bash: |
-      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
-    displayName: "Install from sdist"
-
-  - script: |
-      python -W error -c "import spacy"
-    displayName: "Test import"
-
-  - script: |
-      python -m spacy download ca_core_news_sm
-      python -m spacy download ca_core_news_md
-      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.9')
-
-  - script: |
-      python -W error -m spacy info ca_core_news_sm | grep -q download_url
-    displayName: 'Test download_url in info CLI'
-    condition: eq(variables['python_version'], '3.9')
-
-  - script: |
-      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-    displayName: 'Test no warnings on load (#11713)'
-    condition: eq(variables['python_version'], '3.9')
-
-  - script: |
-      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
-    displayName: 'Test convert CLI'
-    condition: eq(variables['python_version'], '3.9')
-
-  - script: |
-      python -m spacy init config -p ner -l ca ner.cfg
-      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
-    displayName: 'Test debug config CLI'
-    condition: eq(variables['python_version'], '3.9')
-
-  - script: |
-      # will have errors due to sparse data, check for summary in output
-      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
-    displayName: 'Test debug data CLI'
-    condition: eq(variables['python_version'], '3.9')
-
-  - script: |
-      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
-    displayName: 'Test train CLI'
-    condition: eq(variables['python_version'], '3.9')
-
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-    displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.9')
-
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-    displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.9')
-
-  - script: |
-      python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      python -m pytest --pyargs spacy -W error
-    displayName: "Run CPU tests"
-
-  - script: |
-      python -m pip install 'spacy[apple]'
-      python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -37,10 +37,20 @@ jobs:
        run: |
          python -m pip install black -c requirements.txt
          python -m black spacy --check
+      - name: isort
+        run: |
+          python -m pip install isort -c requirements.txt
+          python -m isort spacy --check
      - name: flake8
        run: |
          python -m pip install flake8==5.0.4
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+      - name: cython-lint
+        run: |
+          python -m pip install cython-lint -c requirements.txt
+          # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
+          cython-lint spacy --ignore E501,W291,E266
+
  tests:
    name: Test
    needs: Validate
@ -107,22 +117,22 @@ jobs:
      - name: Test import
        run: python -W error -c "import spacy"

-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+#      - name: "Test download CLI"
+#        run: |
+#          python -m spacy download ca_core_news_sm
+#          python -m spacy download ca_core_news_md
+#          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test download_url in info CLI"
+#        run: |
+#          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test no warnings on load (#11713)"
+#        run: |
+#          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+#        if: matrix.python_version == '3.9'

      - name: "Test convert CLI"
        run: |
@ -146,17 +156,17 @@ jobs:
          python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
        if: matrix.python_version == '3.9'

-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+#      - name: "Test assemble CLI"
+#        run: |
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+#          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test assemble CLI vectors warning"
+#        run: |
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+#          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+#        if: matrix.python_version == '3.9'

      - name: "Install test requirements"
        run: |
@ -165,6 +175,7 @@ jobs:
      - name: "Run CPU tests"
        run: |
          python -m pytest --pyargs spacy -W error
+        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"

      - name: "Run CPU tests with thinc-apple-ops"
        run: |
--- a/4
+++ b/4
@ -1,11 +1,11 @@
 SHELL := /bin/bash

 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
+override SPACY_EXTRAS = spacy-lookups-data==1.0.3
 endif

 ifndef PYVER
-override PYVER = 3.6
+override PYVER = 3.8
 endif

 VENV := ./env$(PYVER)
--- a/README.md
+++ b/README.md
@ -35,19 +35,20 @@ open-source software, released under the [MIT license](https://github.com/explos

 ## 📖 Documentation

-| Documentation                                                                                                                                                                                                             |                                                                                                                                                                                                                                                                                                                              |
-| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| ⭐️ **[spaCy 101]**                                                                                                                                                                                                       | New to spaCy? Here's everything you need to know!                                                                                                                                                                                                                                                                            |
-| 📚 **[Usage Guides]**                                                                                                                                                                                                     | How to use spaCy and its features.                                                                                                                                                                                                                                                                                           |
-| 🚀 **[New in v3.0]**                                                                                                                                                                                                      | New features, backwards incompatibilities and migration guide.                                                                                                                                                                                                                                                               |
-| 🪐 **[Project Templates]**                                                                                                                                                                                                | End-to-end workflows you can clone, modify and run.                                                                                                                                                                                                                                                                          |
-| 🎛 **[API Reference]**                                                                                                                                                                                                     | The detailed reference for spaCy's API.                                                                                                                                                                                                                                                                                      |
-| 📦 **[Models]**                                                                                                                                                                                                           | Download trained pipelines for spaCy.                                                                                                                                                                                                                                                                                        |
-| 🌌 **[Universe]**                                                                                                                                                                                                         | Plugins, extensions, demos and books from the spaCy ecosystem.                                                                                                                                                                                                                                                               |
-| 👩‍🏫 **[Online Course]**                                                                                                                                                                                                    | Learn spaCy in this free and interactive online course.                                                                                                                                                                                                                                                                      |
-| 📺 **[Videos]**                                                                                                                                                                                                           | Our YouTube channel with video tutorials, talks and more.                                                                                                                                                                                                                                                                    |
-| 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                 |
-| 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                        |
+| Documentation                 |                                                                        |
+| ----------------------------- | ---------------------------------------------------------------------- |
+| ⭐️ **[spaCy 101]**           | New to spaCy? Here's everything you need to know!                      |
+| 📚 **[Usage Guides]**         | How to use spaCy and its features.                                     |
+| 🚀 **[New in v3.0]**          | New features, backwards incompatibilities and migration guide.         |
+| 🪐 **[Project Templates]**    | End-to-end workflows you can clone, modify and run.                    |
+| 🎛 **[API Reference]**         | The detailed reference for spaCy's API.                                |
+| 📦 **[Models]**               | Download trained pipelines for spaCy.                                  |
+| 🌌 **[Universe]**             | Plugins, extensions, demos and books from the spaCy ecosystem.         |
+| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
+| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
+| 🛠 **[Changelog]** | Changes and version history. |
+| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
 | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |

@ -57,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md

-
 ## 💬 Where to ask questions

 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -1,120 +0,0 @@
-trigger:
-  batch: true
-  branches:
-    include:
-      - "*"
-    exclude:
-      - "spacy.io"
-      - "nightly.spacy.io"
-      - "v2.spacy.io"
-  paths:
-    exclude:
-      - "website/*"
-      - "*.md"
-      - "*.mdx"
-      - ".github/workflows/*"
-pr:
-  paths:
-    exclude:
-      - "*.md"
-      - "*.mdx"
-      - "website/docs/*"
-      - "website/src/*"
-      - "website/meta/*.tsx"
-      - "website/meta/*.mjs"
-      - "website/meta/languages.json"
-      - "website/meta/site.json"
-      - "website/meta/sidebars.json"
-      - "website/meta/type-annotations.json"
-      - "website/pages/*"
-      - ".github/workflows/*"
-
-jobs:
-  # Check formatting and linting. Perform basic checks for most important errors
-  # (syntax etc.) Uses the config defined in setup.cfg and overwrites the
-  # selected codes.
-  - job: "Validate"
-    pool:
-      vmImage: "ubuntu-latest"
-    steps:
-      - task: UsePythonVersion@0
-        inputs:
-          versionSpec: "3.7"
-      - script: |
-          pip install black -c requirements.txt
-          python -m black spacy --check
-        displayName: "black"
-      - script: |
-          pip install flake8==5.0.4
-          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
-        displayName: "flake8"
-      - script: |
-          python .github/validate_universe_json.py website/meta/universe.json
-        displayName: 'Validate website/meta/universe.json'
-
-  - job: "Test"
-    dependsOn: "Validate"
-    strategy:
-      matrix:
-        # We're only running one platform per Python version to speed up builds
-        Python36Linux:
-          imageName: "ubuntu-20.04"
-          python.version: "3.6"
-        #        Python36Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.6"
-        #        Python36Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.6"
-        #        Python37Linux:
-        #          imageName: "ubuntu-20.04"
-        #          python.version: "3.7"
-        Python37Windows:
-          imageName: "windows-latest"
-          python.version: "3.7"
-        #        Python37Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.7"
-        #        Python38Linux:
-        #          imageName: "ubuntu-latest"
-        #          python.version: "3.8"
-        #        Python38Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.8"
-        Python38Mac:
-          imageName: "macos-latest"
-          python.version: "3.8"
-        Python39Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.9"
-        #        Python39Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.9"
-        #        Python39Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.9"
-        #        Python310Linux:
-        #          imageName: "ubuntu-latest"
-        #          python.version: "3.10"
-        Python310Windows:
-          imageName: "windows-latest"
-          python.version: "3.10"
-        #        Python310Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.10"
-        Python311Linux:
-          imageName: 'ubuntu-latest'
-          python.version: '3.11'
-        Python311Windows:
-          imageName: 'windows-latest'
-          python.version: '3.11'
-        Python311Mac:
-          imageName: 'macos-latest'
-          python.version: '3.11'
-      maxParallel: 4
-    pool:
-      vmImage: $(imageName)
-    steps:
-      - template: .github/azure-steps.yml
-        parameters:
-          python_version: '$(python.version)'
--- a/build-constraints.txt
+++ b/build-constraints.txt
@ -3,7 +3,4 @@ numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
 numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
 numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
-numpy==1.19.3; python_version=='3.9'
-numpy==1.21.3; python_version=='3.10'
-numpy==1.23.2; python_version=='3.11'
-numpy; python_version>='3.12'
+numpy>=1.25.0; python_version>='3.9'
--- a/extra/DEVELOPER_DOCS/Listeners.md
+++ b/extra/DEVELOPER_DOCS/Listeners.md
@ -1,14 +1,17 @@
 # Listeners

-1. [Overview](#1-overview)
-2. [Initialization](#2-initialization)
-   - [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
-   - [B. Shape inference](#2b-shape-inference)
-3. [Internal communication](#3-internal-communication)
-   - [A. During prediction](#3a-during-prediction)
-   - [B. During training](#3b-during-training)
-   - [C. Frozen components](#3c-frozen-components)
-4. [Replacing listener with standalone](#4-replacing-listener-with-standalone)
+- [1. Overview](#1-overview)
+- [2. Initialization](#2-initialization)
+  - [2A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
+  - [2B. Shape inference](#2b-shape-inference)
+- [3. Internal communication](#3-internal-communication)
+  - [3A. During prediction](#3a-during-prediction)
+  - [3B. During training](#3b-during-training)
+    - [Training with multiple listeners](#training-with-multiple-listeners)
+  - [3C. Frozen components](#3c-frozen-components)
+    - [The Tok2Vec or Transformer is frozen](#the-tok2vec-or-transformer-is-frozen)
+    - [The upstream component is frozen](#the-upstream-component-is-frozen)
+- [4. Replacing listener with standalone](#4-replacing-listener-with-standalone)

 ## 1. Overview

@ -62,7 +65,7 @@ of this `find_listener()` method will specifically identify sublayers of a model

 If it's a Transformer-based pipeline, a
 [`transformer` component](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py)
-has a similar implementation but its `find_listener()` function will specifically look for `TransformerListener` 
+has a similar implementation but its `find_listener()` function will specifically look for `TransformerListener`
 sublayers of downstream components.

 ### 2B. Shape inference
@ -154,7 +157,7 @@ as a tagger or a parser. This used to be impossible before 3.1, but has become s
 embedding component in the [`annotating_components`](https://spacy.io/usage/training#annotating-components)
 list of the config. This works like any other "annotating component" because it relies on the `Doc` attributes.

-However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related 
+However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related
 listener isn't frozen, then a `W086` warning is shown and further training of the pipeline will likely end with `E954`.

 #### The upstream component is frozen
@ -216,5 +219,17 @@ new_model = tok2vec_model.attrs["replace_listener"](new_model)
 ```

 The new config and model are then properly stored on the `nlp` object.
-Note that this functionality (running the replacement for a transformer listener) was broken prior to 
+Note that this functionality (running the replacement for a transformer listener) was broken prior to
 `spacy-transformers` 1.0.5.
+
+In spaCy 3.7, `Language.replace_listeners` was updated to pass the following additional arguments to the `replace_listener` callback:
+the listener to be replaced and the `tok2vec`/`transformer` pipe from which the new model was copied. To maintain backwards-compatibility,
+the method only passes these extra arguments for callbacks that support them:
+
+```
+def replace_listener_pre_37(copied_tok2vec_model):
+  ...
+
+def replace_listener_post_37(copied_tok2vec_model, replaced_listener, tok2vec_pipe):
+  ...
+```
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,6 +6,10 @@ requires = [
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=8.1.8,<8.2.0",
-    "numpy>=1.15.0",
+    "numpy>=1.15.0; python_version < '3.9'",
+    "numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
--- a/requirements.txt
+++ b/requirements.txt
@ -9,11 +9,13 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
+typer>=0.3.0,<0.10.0
 pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
+weasel>=0.1.0,<0.2.0
 # Third party dependencies
-numpy>=1.15.0
+numpy>=1.15.0; python_version < "3.9"
+numpy>=1.19.0; python_version >= "3.9"
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
@ -38,3 +40,5 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+cython-lint>=0.15.0; python_version >= "3.7"
+isort>=5.0,<6.0
--- a/setup.cfg
+++ b/setup.cfg
@ -32,8 +32,13 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.6
+# NOTE: This section is superseded by pyproject.toml and will be removed in
+# spaCy v4
 setup_requires =
    cython>=0.25,<3.0
+    # The newest supported pip for python 3.6 has bugs related to markers in
+    # this section, so this does not contain the same constraints as
+    # pyproject.toml
    numpy>=1.15.0
    # We also need our Cython packages here to compile against
    cymem>=2.0.2,<2.1.0
@ -51,12 +56,14 @@ install_requires =
    wasabi>=0.9.1,<1.2.0
    srsly>=2.4.3,<3.0.0
    catalogue>=2.0.6,<2.1.0
+    weasel>=0.1.0,<0.2.0
    # Third-party dependencies
-    typer>=0.3.0,<0.8.0
+    typer>=0.3.0,<0.10.0
    pathy>=0.10.0
    smart-open>=5.2.1,<7.0.0
    tqdm>=4.38.0,<5.0.0
-    numpy>=1.15.0
+    numpy>=1.15.0; python_version < "3.9"
+    numpy>=1.19.0; python_version >= "3.9"
    requests>=2.13.0,<3.0.0
    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
    jinja2
@ -75,8 +82,6 @@ lookups =
    spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
    spacy_transformers>=1.1.2,<1.3.0
-ray =
-    spacy_ray>=0.1.0,<1.0.0
 cuda =
    cupy>=5.0.0b4,<13.0.0
 cuda80 =
--- a/spacy/init.py
+++ b/spacy/init.py
@ -1,6 +1,6 @@
-from typing import Union, Iterable, Dict, Any
-from pathlib import Path
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Union

 # set library-specific custom warning handling before doing anything else
 from .errors import setup_default_warnings
@ -8,20 +8,17 @@ from .errors import setup_default_warnings
 setup_default_warnings()  # noqa: E402

 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
-from thinc.api import Config
+from thinc.api import Config, prefer_gpu, require_cpu, require_gpu  # noqa: F401

 from . import pipeline  # noqa: F401
-from .cli.info import info  # noqa: F401
-from .glossary import explain  # noqa: F401
-from .about import __version__  # noqa: F401
-from .util import registry, logger  # noqa: F401
-
-from .errors import Errors
-from .language import Language
-from .vocab import Vocab
 from . import util
-
+from .about import __version__  # noqa: F401
+from .cli.info import info  # noqa: F401
+from .errors import Errors
+from .glossary import explain  # noqa: F401
+from .language import Language
+from .util import logger, registry  # noqa: F401
+from .vocab import Vocab

 if sys.maxunicode == 65535:
    raise SystemError(Errors.E130)
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,7 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.5.0"
+__version__ = "3.7.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__projects__ = "https://github.com/explosion/projects"
-__projects_branch__ = "v3"
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -1,6 +1,7 @@
 # Reserve 64 values for flag features
 from . cimport symbols

+
 cdef enum attr_id_t:
    NULL_ATTR
    IS_ALPHA
@ -95,4 +96,4 @@ cdef enum attr_id_t:
    ENT_ID = symbols.ENT_ID

    IDX
-    SENT_END
+    SENT_END
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -117,7 +117,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
        if "pos" in stringy_attrs:
            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
        if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")
+            morphs = stringy_attrs.pop("morph")  # no-cython-lint
        if "number" in stringy_attrs:
            stringy_attrs.pop("number")
        if "tenspect" in stringy_attrs:
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@ -1,35 +1,28 @@
 from wasabi import msg

 from ._util import app, setup_cli  # noqa: F401
+from .apply import apply  # noqa: F401
+from .assemble import assemble_cli  # noqa: F401

 # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
 # are registered automatically and won't have to be imported here.
 from .benchmark_speed import benchmark_speed_cli  # noqa: F401
+from .convert import convert  # noqa: F401
+from .debug_config import debug_config  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
 from .download import download  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
+from .find_threshold import find_threshold  # noqa: F401
 from .info import info  # noqa: F401
+from .init_config import fill_config, init_config  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .package import package  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
 from .train import train_cli  # noqa: F401
-from .assemble import assemble_cli  # noqa: F401
-from .pretrain import pretrain  # noqa: F401
-from .debug_data import debug_data  # noqa: F401
-from .debug_config import debug_config  # noqa: F401
-from .debug_model import debug_model  # noqa: F401
-from .debug_diff import debug_diff  # noqa: F401
-from .evaluate import evaluate  # noqa: F401
-from .apply import apply  # noqa: F401
-from .convert import convert  # noqa: F401
-from .init_pipeline import init_pipeline_cli  # noqa: F401
-from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
-from .project.clone import project_clone  # noqa: F401
-from .project.assets import project_assets  # noqa: F401
-from .project.run import project_run  # noqa: F401
-from .project.dvc import project_update_dvc  # noqa: F401
-from .project.push import project_push  # noqa: F401
-from .project.pull import project_pull  # noqa: F401
-from .project.document import project_document  # noqa: F401
-from .find_threshold import find_threshold  # noqa: F401


@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -1,26 +1,45 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
+import os
+import shutil
+import sys
+from configparser import InterpolationError
+from contextlib import contextmanager
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
+import srsly
 import typer
 from click import NoSuchOption
 from click.parser import split_arg_string
-from typer.main import get_command
-from contextlib import contextmanager
 from thinc.api import Config, ConfigValidationError, require_gpu
 from thinc.util import gpu_is_available
-from configparser import InterpolationError
-import os
+from typer.main import get_command
+from wasabi import Printer, msg
+from weasel import app as project_cli

-from ..compat import Literal
-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about
+from ..compat import Literal
+from ..schemas import validate
+from ..util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    import_file,
+    is_compatible_version,
+    logger,
+    make_tempdir,
+    registry,
+    run_command,
+)

 if TYPE_CHECKING:
    from pathy import FluidPath  # noqa: F401
@ -30,7 +49,6 @@ SDIST_SUFFIX = ".tar.gz"
 WHEEL_SUFFIX = "-py3-none-any.whl"

 PROJECT_FILE = "project.yml"
-PROJECT_LOCK = "project.lock"
 COMMAND = "python -m spacy"
 NAME = "spacy"
 HELP = """spaCy Command-line Interface
@ -56,11 +74,10 @@ Opt = typer.Option

 app = typer.Typer(name=NAME, help=HELP)
 benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
-project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
 debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
 init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)

-app.add_typer(project_cli)
+app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True)
 app.add_typer(debug_cli)
 app.add_typer(benchmark_cli)
 app.add_typer(init_cli)
@ -135,148 +152,6 @@ def _parse_override(value: Any) -> Any:
        return str(value)


-def load_project_config(
-    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
-) -> Dict[str, Any]:
-    """Load the project.yml file from a directory and validate it. Also make
-    sure that all directories defined in the config exist.
-
-    path (Path): The path to the project directory.
-    interpolate (bool): Whether to substitute project variables.
-    overrides (Dict[str, Any]): Optional config overrides.
-    RETURNS (Dict[str, Any]): The loaded project.yml.
-    """
-    config_path = path / PROJECT_FILE
-    if not config_path.exists():
-        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
-    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
-    try:
-        config = srsly.read_yaml(config_path)
-    except ValueError as e:
-        msg.fail(invalid_err, e, exits=1)
-    errors = validate(ProjectConfigSchema, config)
-    if errors:
-        msg.fail(invalid_err)
-        print("\n".join(errors))
-        sys.exit(1)
-    validate_project_version(config)
-    validate_project_commands(config)
-    if interpolate:
-        err = f"{PROJECT_FILE} validation error"
-        with show_validation_error(title=err, hint_fill=False):
-            config = substitute_project_variables(config, overrides)
-    # Make sure directories defined in config exist
-    for subdir in config.get("directories", []):
-        dir_path = path / subdir
-        if not dir_path.exists():
-            dir_path.mkdir(parents=True)
-    return config
-
-
-def substitute_project_variables(
-    config: Dict[str, Any],
-    overrides: Dict[str, Any] = SimpleFrozenDict(),
-    key: str = "vars",
-    env_key: str = "env",
-) -> Dict[str, Any]:
-    """Interpolate variables in the project file using the config system.
-
-    config (Dict[str, Any]): The project config.
-    overrides (Dict[str, Any]): Optional config overrides.
-    key (str): Key containing variables in project config.
-    env_key (str): Key containing environment variable mapping in project config.
-    RETURNS (Dict[str, Any]): The interpolated project config.
-    """
-    config.setdefault(key, {})
-    config.setdefault(env_key, {})
-    # Substitute references to env vars with their values
-    for config_var, env_var in config[env_key].items():
-        config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
-    # Need to put variables in the top scope again so we can have a top-level
-    # section "project" (otherwise, a list of commands in the top scope wouldn't)
-    # be allowed by Thinc's config system
-    cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
-    cfg = Config().from_str(cfg.to_str(), overrides=overrides)
-    interpolated = cfg.interpolate()
-    return dict(interpolated["project"])
-
-
-def validate_project_version(config: Dict[str, Any]) -> None:
-    """If the project defines a compatible spaCy version range, chec that it's
-    compatible with the current version of spaCy.
-
-    config (Dict[str, Any]): The loaded config.
-    """
-    spacy_version = config.get("spacy_version", None)
-    if spacy_version and not is_compatible_version(about.__version__, spacy_version):
-        err = (
-            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
-            f"that's not compatible with the version of spaCy you're running "
-            f"({about.__version__}). You can edit version requirement in the "
-            f"{PROJECT_FILE} to load it, but the project may not run as expected."
-        )
-        msg.fail(err, exits=1)
-
-
-def validate_project_commands(config: Dict[str, Any]) -> None:
-    """Check that project commands and workflows are valid, don't contain
-    duplicates, don't clash  and only refer to commands that exist.
-
-    config (Dict[str, Any]): The loaded config.
-    """
-    command_names = [cmd["name"] for cmd in config.get("commands", [])]
-    workflows = config.get("workflows", {})
-    duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
-    if duplicates:
-        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
-        msg.fail(err, exits=1)
-    for workflow_name, workflow_steps in workflows.items():
-        if workflow_name in command_names:
-            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
-            msg.fail(err, exits=1)
-        for step in workflow_steps:
-            if step not in command_names:
-                msg.fail(
-                    f"Unknown command specified in workflow '{workflow_name}': {step}",
-                    f"Workflows can only refer to commands defined in the 'commands' "
-                    f"section of the {PROJECT_FILE}.",
-                    exits=1,
-                )
-
-
-def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
-    """Get the hash for a JSON-serializable object.
-
-    data: The data to hash.
-    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
-    RETURNS (str): The hash.
-    """
-    if isinstance(data, dict):
-        data = {k: v for k, v in data.items() if k not in exclude}
-    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
-    return hashlib.md5(data_str).hexdigest()
-
-
-def get_checksum(path: Union[Path, str]) -> str:
-    """Get the checksum for a file or directory given its file path. If a
-    directory path is provided, this uses all files in that directory.
-
-    path (Union[Path, str]): The file or directory path.
-    RETURNS (str): The checksum.
-    """
-    path = Path(path)
-    if not (path.is_file() or path.is_dir()):
-        msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
-    if path.is_file():
-        return hashlib.md5(Path(path).read_bytes()).hexdigest()
-    else:
-        # TODO: this is currently pretty slow
-        dir_checksum = hashlib.md5()
-        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
-            dir_checksum.update(sub_file.read_bytes())
-        return dir_checksum.hexdigest()
-
-
@contextmanager
 def show_validation_error(
    file_path: Optional[Union[str, Path]] = None,
@ -334,166 +209,10 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


-def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
-    """Upload a file.
-
-    src (Path): The source path.
-    url (str): The destination URL to upload to.
-    """
-    import smart_open
-
-    # Create parent directories for local paths
-    if isinstance(dest, Path):
-        if not dest.parent.exists():
-            dest.parent.mkdir(parents=True)
-
-    dest = str(dest)
-    with smart_open.open(dest, mode="wb") as output_file:
-        with src.open(mode="rb") as input_file:
-            output_file.write(input_file.read())
-
-
-def download_file(
-    src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
-) -> None:
-    """Download a file using smart_open.
-
-    url (str): The URL of the file.
-    dest (Path): The destination path.
-    force (bool): Whether to force download even if file exists.
-        If False, the download will be skipped.
-    """
-    import smart_open
-
-    if dest.exists() and not force:
-        return None
-    src = str(src)
-    with smart_open.open(src, mode="rb", compression="disable") as input_file:
-        with dest.open(mode="wb") as output_file:
-            shutil.copyfileobj(input_file, output_file)
-
-
-def ensure_pathy(path):
-    """Temporary helper to prevent importing Pathy globally (which can cause
-    slow and annoying Google Cloud warning)."""
-    from pathy import Pathy  # noqa: F811
-
-    return Pathy.fluid(path)
-
-
-def git_checkout(
-    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
-):
-    git_version = get_git_version()
-    if dest.exists():
-        msg.fail("Destination of checkout must not exist", exits=1)
-    if not dest.parent.exists():
-        msg.fail("Parent of destination of checkout must exist", exits=1)
-    if sparse and git_version >= (2, 22):
-        return git_sparse_checkout(repo, subpath, dest, branch)
-    elif sparse:
-        # Only show warnings if the user explicitly wants sparse checkout but
-        # the Git version doesn't support it
-        err_old = (
-            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
-            f"that doesn't fully support sparse checkout yet."
-        )
-        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
-        msg.warn(
-            f"{err_unk if git_version == (0, 0) else err_old} "
-            f"This means that more files than necessary may be downloaded "
-            f"temporarily. To only download the files needed, make sure "
-            f"you're using Git v2.22 or above."
-        )
-    with make_tempdir() as tmp_dir:
-        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
-        run_command(cmd, capture=True)
-        # We need Path(name) to make sure we also support subdirectories
-        try:
-            source_path = tmp_dir / Path(subpath)
-            if not is_subpath_of(tmp_dir, source_path):
-                err = f"'{subpath}' is a path outside of the cloned repository."
-                msg.fail(err, repo, exits=1)
-            shutil.copytree(str(source_path), str(dest))
-        except FileNotFoundError:
-            err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
-            msg.fail(err, repo, exits=1)
-
-
-def git_sparse_checkout(repo, subpath, dest, branch):
-    # We're using Git, partial clone and sparse checkout to
-    # only clone the files we need
-    # This ends up being RIDICULOUS. omg.
-    # So, every tutorial and SO post talks about 'sparse checkout'...But they
-    # go and *clone* the whole repo. Worthless. And cloning part of a repo
-    # turns out to be completely broken. The only way to specify a "path" is..
-    # a path *on the server*? The contents of which, specifies the paths. Wat.
-    # Obviously this is hopelessly broken and insecure, because you can query
-    # arbitrary paths on the server! So nobody enables this.
-    # What we have to do is disable *all* files. We could then just checkout
-    # the path, and it'd "work", but be hopelessly slow...Because it goes and
-    # transfers every missing object one-by-one. So the final piece is that we
-    # need to use some weird git internals to fetch the missings in bulk, and
-    # *that* we can do by path.
-    # We're using Git and sparse checkout to only clone the files we need
-    with make_tempdir() as tmp_dir:
-        # This is the "clone, but don't download anything" part.
-        cmd = (
-            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-            f"-b {branch} --filter=blob:none"
-        )
-        run_command(cmd)
-        # Now we need to find the missing filenames for the subpath we want.
-        # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
-        ret = run_command(cmd, capture=True)
-        git_repo = _http_to_git(repo)
-        # Now pass those missings into another bit of git internals
-        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        if not missings:
-            err = (
-                f"Could not find any relevant files for '{subpath}'. "
-                f"Did you specify a correct and complete path within repo '{repo}' "
-                f"and branch {branch}?"
-            )
-            msg.fail(err, exits=1)
-        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
-        run_command(cmd, capture=True)
-        # And finally, we can checkout our subpath
-        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        run_command(cmd, capture=True)
-
-        # Get a subdirectory of the cloned path, if appropriate
-        source_path = tmp_dir / Path(subpath)
-        if not is_subpath_of(tmp_dir, source_path):
-            err = f"'{subpath}' is a path outside of the cloned repository."
-            msg.fail(err, repo, exits=1)
-
-        shutil.move(str(source_path), str(dest))
-
-
-def git_repo_branch_exists(repo: str, branch: str) -> bool:
-    """Uses 'git ls-remote' to check if a repository and branch exists
-
-    repo (str): URL to get repo.
-    branch (str): Branch on repo to check.
-    RETURNS (bool): True if repo:branch exists.
-    """
-    get_git_version()
-    cmd = f"git ls-remote {repo} {branch}"
-    # We might be tempted to use `--exit-code` with `git ls-remote`, but
-    # `run_command` handles the `returncode` for us, so we'll rely on
-    # the fact that stdout returns '' if the requested branch doesn't exist
-    ret = run_command(cmd, capture=True)
-    exists = ret.stdout != ""
-    return exists
-
-
 def get_git_version(
    error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
 ) -> Tuple[int, int]:
    """Get the version of git and raise an error if calling 'git --version' fails.
-
    error (str): The error message to show.
    RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
        (0, 0) if the version couldn't be determined.
@ -509,30 +228,6 @@ def get_git_version(
    return int(version[0]), int(version[1])


-def _http_to_git(repo: str) -> str:
-    if repo.startswith("http://"):
-        repo = repo.replace(r"http://", r"https://")
-    if repo.startswith(r"https://"):
-        repo = repo.replace("https://", "git@").replace("/", ":", 1)
-        if repo.endswith("/"):
-            repo = repo[:-1]
-        repo = f"{repo}.git"
-    return repo
-
-
-def is_subpath_of(parent, child):
-    """
-    Check whether `child` is a path contained within `parent`.
-    """
-    # Based on https://stackoverflow.com/a/37095733 .
-
-    # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
-    # we can stop using crusty old os.path functions.
-    parent_realpath = os.path.realpath(parent)
-    child_realpath = os.path.realpath(child)
-    return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
-
-
@overload
 def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
    ...
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@ -1,18 +1,15 @@
-import tqdm
-import srsly
-
 from itertools import chain
 from pathlib import Path
-from typing import Optional, List, Iterable, cast, Union
+from typing import Iterable, List, Optional, Union, cast

+import srsly
+import tqdm
 from wasabi import msg

-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
-
 from ..tokens import Doc, DocBin
-from ..vocab import Vocab
 from ..util import ensure_path, load_model
-
+from ..vocab import Vocab
+from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory

 path_help = """Location of the documents to predict on.
 Can be a single file in .spacy format or a .jsonl file.
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@ -1,13 +1,20 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
 from .. import util
 from ..util import get_sourced_components, load_model_from_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)


@app.command(
--- a/spacy/cli/benchmark_speed.py
+++ b/spacy/cli/benchmark_speed.py
@ -1,11 +1,12 @@
-from typing import Iterable, List, Optional
 import random
-from itertools import islice
-import numpy
-from pathlib import Path
 import time
-from tqdm import tqdm
+from itertools import islice
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+import numpy
 import typer
+from tqdm import tqdm
 from wasabi import msg

 from .. import util
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -1,18 +1,22 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
-from enum import Enum
-from pathlib import Path
-from wasabi import Printer
-import srsly
+import itertools
 import re
 import sys
-import itertools
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping, Optional, Union
+
+import srsly
+from wasabi import Printer

-from ._util import app, Arg, Opt, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
-from ..training.converters import conllu_to_docs
-
+from ..training import docs_to_json
+from ..training.converters import (
+    conll_ner_to_docs,
+    conllu_to_docs,
+    iob_to_docs,
+    json_to_docs,
+)
+from ._util import Arg, Opt, app, walk_directory

 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@ -1,15 +1,22 @@
-from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
-from wasabi import msg, table
+from typing import Any, Dict, List, Optional, Union
+
+import typer
 from thinc.api import Config
 from thinc.config import VARIABLE_RE
-import typer
+from wasabi import msg, table

-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from .. import util
 from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)


@debug_cli.command(
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -1,31 +1,49 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
-import numpy
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+    cast,
+    overload,
+)

-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, _format_number
-from ..training import Example, remove_bilu_prefix
-from ..training.initialize import get_sourced_components
-from ..schemas import ConfigSchemaTraining
-from ..pipeline import TrainablePipe
+import numpy
+import srsly
+import typer
+from wasabi import MESSAGES, Printer, msg
+
+from .. import util
+from ..compat import Literal
+from ..language import Language
+from ..morphology import Morphology
+from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer, SpanCategorizer
-from ..pipeline._edit_tree_internals.edit_trees import EditTrees
-from ..morphology import Morphology
-from ..language import Language
+from ..schemas import ConfigSchemaTraining
+from ..training import Example, remove_bilu_prefix
+from ..training.initialize import get_sourced_components
 from ..util import registry, resolve_dot_names
-from ..compat import Literal
 from ..vectors import Mode as VectorsMode
-from .. import util
-
+from ._util import (
+    Arg,
+    Opt,
+    _format_number,
+    app,
+    debug_cli,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)

 # Minimum number of expected occurrences of NER label in data to train new label
 NEW_LABEL_THRESHOLD = 50
@ -212,7 +230,7 @@ def debug_data(
    else:
        msg.info("No word vectors present in the package")

-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
        model_labels_spancat = _get_labels_from_spancat(nlp)
        has_low_data_warning = False
        has_no_neg_warning = False
@ -830,7 +848,7 @@ def _compile_gold(
                    data["boundary_cross_ents"] += 1
                elif label == "-":
                    data["ner"]["-"] += 1
-        if "spancat" in factory_names:
+        if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
            for spans_key in list(eg.reference.spans.keys()):
                # Obtain the span frequency
                if spans_key not in data["spancat"]:
@ -1028,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
    pipe_names = [
        pipe_name
        for pipe_name in nlp.pipe_names
-        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+        if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
    ]
    labels: Dict[str, Set[str]] = {}
    for pipe_name in pipe_names:
--- a/spacy/cli/debug_diff.py
+++ b/spacy/cli/debug_diff.py
@ -1,13 +1,13 @@
+from pathlib import Path
 from typing import Optional

 import typer
-from wasabi import Printer, diff_strings, MarkdownRenderer
-from pathlib import Path
 from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, diff_strings

-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
 from ..util import load_config
-from .init_config import init_config, Optimizations
+from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
+from .init_config import Optimizations, init_config


@debug_cli.command(
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@ -1,19 +1,32 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
 import itertools
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import typer
+from thinc.api import (
+    Model,
+    data_validation,
+    fix_random_seed,
+    set_dropout_rate,
+    set_gpu_allocator,
+)
+from wasabi import msg

 from spacy.training import Example
 from spacy.util import resolve_dot_names
-from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate
-from thinc.api import Model, data_validation, set_gpu_allocator
-import typer

-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from .. import util
 from ..schemas import ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+    string_to_list,
+)


@debug_cli.command(
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -1,14 +1,14 @@
-from typing import Optional, Sequence
-import requests
 import sys
-from wasabi import msg
-import typer
+from typing import Optional, Sequence
+
+import requests
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
 from ..errors import OLD_MODEL_SHORTCUTS
+from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app


@app.command(
@ -81,11 +81,8 @@ def download(

 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
    dl_tpl = "{m}-{v}/{m}-{v}{s}"
-    egg_tpl = "#egg={m}=={v}"
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    filename = dl_tpl.format(m=model_name, v=version, s=suffix)
-    if sdist:
-        filename += egg_tpl.format(m=model_name, v=version)
    return filename


--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -1,16 +1,16 @@
-from typing import Optional, List, Dict, Any, Union
-from wasabi import Printer
-from pathlib import Path
 import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
 import srsly
 from thinc.api import fix_random_seed
+from wasabi import Printer

-from ..training import Corpus
-from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
+from .. import displacy, util
 from ..scorer import Scorer
-from .. import util
-from .. import displacy
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu


@benchmark_cli.command(
@ -27,6 +27,7 @@ def evaluate_cli(
    gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
    displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
    displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+    per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
    # fmt: on
 ):
    """
@ -50,6 +51,7 @@ def evaluate_cli(
        gold_preproc=gold_preproc,
        displacy_path=displacy_path,
        displacy_limit=displacy_limit,
+        per_component=per_component,
        silent=False,
    )

@ -64,6 +66,7 @@ def evaluate(
    displacy_limit: int = 25,
    silent: bool = True,
    spans_key: str = "sc",
+    per_component: bool = False,
 ) -> Dict[str, Any]:
    msg = Printer(no_print=silent, pretty=not silent)
    fix_random_seed()
@ -78,50 +81,61 @@ def evaluate(
    corpus = Corpus(data_path, gold_preproc=gold_preproc)
    nlp = util.load_model(model)
    dev_dataset = list(corpus(nlp))
-    scores = nlp.evaluate(dev_dataset)
-    metrics = {
-        "TOK": "token_acc",
-        "TAG": "tag_acc",
-        "POS": "pos_acc",
-        "MORPH": "morph_acc",
-        "LEMMA": "lemma_acc",
-        "UAS": "dep_uas",
-        "LAS": "dep_las",
-        "NER P": "ents_p",
-        "NER R": "ents_r",
-        "NER F": "ents_f",
-        "TEXTCAT": "cats_score",
-        "SENT P": "sents_p",
-        "SENT R": "sents_r",
-        "SENT F": "sents_f",
-        "SPAN P": f"spans_{spans_key}_p",
-        "SPAN R": f"spans_{spans_key}_r",
-        "SPAN F": f"spans_{spans_key}_f",
-        "SPEED": "speed",
-    }
-    results = {}
-    data = {}
-    for metric, key in metrics.items():
-        if key in scores:
-            if key == "cats_score":
-                metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            if isinstance(scores[key], (int, float)):
-                if key == "speed":
-                    results[metric] = f"{scores[key]:.0f}"
+    scores = nlp.evaluate(dev_dataset, per_component=per_component)
+    if per_component:
+        data = scores
+        if output is None:
+            msg.warn(
+                "The per-component option is enabled but there is no output JSON file provided to save the scores to."
+            )
+        else:
+            msg.info("Per-component scores will be saved to output JSON file.")
+    else:
+        metrics = {
+            "TOK": "token_acc",
+            "TAG": "tag_acc",
+            "POS": "pos_acc",
+            "MORPH": "morph_acc",
+            "LEMMA": "lemma_acc",
+            "UAS": "dep_uas",
+            "LAS": "dep_las",
+            "NER P": "ents_p",
+            "NER R": "ents_r",
+            "NER F": "ents_f",
+            "TEXTCAT": "cats_score",
+            "SENT P": "sents_p",
+            "SENT R": "sents_r",
+            "SENT F": "sents_f",
+            "SPAN P": f"spans_{spans_key}_p",
+            "SPAN R": f"spans_{spans_key}_r",
+            "SPAN F": f"spans_{spans_key}_f",
+            "SPEED": "speed",
+        }
+        results = {}
+        data = {}
+        for metric, key in metrics.items():
+            if key in scores:
+                if key == "cats_score":
+                    metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
+                if isinstance(scores[key], (int, float)):
+                    if key == "speed":
+                        results[metric] = f"{scores[key]:.0f}"
+                    else:
+                        results[metric] = f"{scores[key]*100:.2f}"
                else:
-                    results[metric] = f"{scores[key]*100:.2f}"
-            else:
-                results[metric] = "-"
-            data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
+                    results[metric] = "-"
+                data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]

-    msg.table(results, title="Results")
-    data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
+        msg.table(results, title="Results")
+        data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)

    if displacy_path:
        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
        docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
        render_deps = "parser" in factory_names
        render_ents = "ner" in factory_names
+        render_spans = "spancat" in factory_names
+
        render_parses(
            docs,
            displacy_path,
@ -129,6 +143,7 @@ def evaluate(
            limit=displacy_limit,
            deps=render_deps,
            ents=render_ents,
+            spans=render_spans,
        )
        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)

@ -182,6 +197,7 @@ def render_parses(
    limit: int = 250,
    deps: bool = True,
    ents: bool = True,
+    spans: bool = True,
 ):
    docs[0].user_data["title"] = model_name
    if ents:
@ -195,6 +211,11 @@ def render_parses(
        with (output_path / "parses.html").open("w", encoding="utf8") as file_:
            file_.write(html)

+    if spans:
+        html = displacy.render(docs[:limit], style="span", page=True)
+        with (output_path / "spans.html").open("w", encoding="utf8") as file_:
+            file_.write(html)
+

 def print_prf_per_type(
    msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@ -1,17 +1,17 @@
 import functools
+import logging
 import operator
 from pathlib import Path
-import logging
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple

 import numpy
 import wasabi.tables

-from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
-from ..errors import Errors
-from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
+from ..errors import Errors
+from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
+from ..training import Corpus
+from ._util import Arg, Opt, app, import_code, setup_gpu

 _DEFAULTS = {
    "n_trials": 11,
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@ -1,15 +1,15 @@
-from typing import Optional, Dict, Any, Union, List
-import platform
 import json
+import platform
 from pathlib import Path
-from wasabi import Printer, MarkdownRenderer
-import srsly
+from typing import Any, Dict, List, Optional, Union

-from ._util import app, Arg, Opt, string_to_list
-from .download import get_model_filename, get_latest_version
-from .. import util
-from .. import about
+import srsly
+from wasabi import MarkdownRenderer, Printer
+
+from .. import about, util
 from ..compat import importlib_metadata
+from ._util import Arg, Opt, app, string_to_list
+from .download import get_latest_version, get_model_filename


@app.command("info")
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@ -1,19 +1,26 @@
-from typing import Optional, List, Tuple
+import re
 from enum import Enum
 from pathlib import Path
-from wasabi import Printer, diff_strings
-from thinc.api import Config
+from typing import List, Optional, Tuple
+
 import srsly
-import re
 from jinja2 import Template
+from thinc.api import Config
+from wasabi import Printer, diff_strings

 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code
-
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)

 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@ -1,15 +1,23 @@
-from typing import Optional
 import logging
 from pathlib import Path
-from wasabi import msg
-import typer
+from typing import Optional
+
 import srsly
+import typer
+from wasabi import msg

 from .. import util
-from ..training.initialize import init_nlp, convert_vectors
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)


@init_cli.command("vectors")
@ -24,6 +32,7 @@ def init_vectors_cli(
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+    attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
    # fmt: on
 ):
    """Convert word vectors for use with spaCy. Will export an nlp object that
@ -42,6 +51,7 @@ def init_vectors_cli(
        prune=prune,
        name=name,
        mode=mode,
+        attr=attr,
    )
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
    nlp.to_disk(output_dir)
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -1,18 +1,18 @@
-from typing import Optional, Union, Any, Dict, List, Tuple, cast
-import shutil
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer, get_raw_input
-from thinc.api import Config
-from collections import defaultdict
-from catalogue import RegistryError
-import srsly
-import sys
 import re
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast

-from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
-from ..schemas import validate, ModelMetaSchema
-from .. import util
-from .. import about
+import srsly
+from catalogue import RegistryError
+from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, get_raw_input
+
+from .. import about, util
+from ..schemas import ModelMetaSchema, validate
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list


@app.command("package")
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -1,13 +1,21 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import re
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
 from ..training.pretrain import pretrain
 from ..util import load_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)


@app.command(
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@ -1,17 +1,18 @@
-from typing import Optional, Sequence, Union, Iterator
-import tqdm
-from pathlib import Path
-import srsly
 import cProfile
+import itertools
 import pstats
 import sys
-import itertools
-from wasabi import msg, Printer
-import typer
+from pathlib import Path
+from typing import Iterator, Optional, Sequence, Union
+
+import srsly
+import tqdm
+import typer
+from wasabi import Printer, msg

-from ._util import app, debug_cli, Arg, Opt, NAME
 from ..language import Language
 from ..util import load_model
+from ._util import NAME, Arg, Opt, app, debug_cli


@debug_cli.command("profile")
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@ -1,206 +0,0 @@
-from typing import Any, Dict, Optional
-from pathlib import Path
-from wasabi import msg
-import os
-import re
-import shutil
-import requests
-import typer
-
-from ...util import ensure_path, working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-from .._util import get_checksum, download_file, git_checkout, get_git_version
-from .._util import SimpleFrozenDict, parse_config_overrides
-
-# Whether assets are extra if `extra` is not set.
-EXTRA_DEFAULT = False
-
-
-@project_cli.command(
-    "assets",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def project_assets_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
-    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
-    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
-    # fmt: on
-):
-    """Fetch project assets like datasets and pretrained weights. Assets are
-    defined in the "assets" section of the project.yml. If a checksum is
-    provided in the project.yml, the file is only downloaded if no local file
-    with the same checksum exists.
-
-    DOCS: https://spacy.io/api/cli#project-assets
-    """
-    overrides = parse_config_overrides(ctx.args)
-    project_assets(
-        project_dir,
-        overrides=overrides,
-        sparse_checkout=sparse_checkout,
-        extra=extra,
-    )
-
-
-def project_assets(
-    project_dir: Path,
-    *,
-    overrides: Dict[str, Any] = SimpleFrozenDict(),
-    sparse_checkout: bool = False,
-    extra: bool = False,
-) -> None:
-    """Fetch assets for a project using DVC if possible.
-
-    project_dir (Path): Path to project directory.
-    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
-                            needed.
-    extra (bool): Whether to download all assets, including those marked as 'extra'.
-    """
-    project_path = ensure_path(project_dir)
-    config = load_project_config(project_path, overrides=overrides)
-    assets = [
-        asset
-        for asset in config.get("assets", [])
-        if extra or not asset.get("extra", EXTRA_DEFAULT)
-    ]
-    if not assets:
-        msg.warn(
-            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
-            exits=0,
-        )
-    msg.info(f"Fetching {len(assets)} asset(s)")
-
-    for asset in assets:
-        dest = (project_dir / asset["dest"]).resolve()
-        checksum = asset.get("checksum")
-        if "git" in asset:
-            git_err = (
-                f"Cloning spaCy project templates requires Git and the 'git' command. "
-                f"Make sure it's installed and that the executable is available."
-            )
-            get_git_version(error=git_err)
-            if dest.exists():
-                # If there's already a file, check for checksum
-                if checksum and checksum == get_checksum(dest):
-                    msg.good(
-                        f"Skipping download with matching checksum: {asset['dest']}"
-                    )
-                    continue
-                else:
-                    if dest.is_dir():
-                        shutil.rmtree(dest)
-                    else:
-                        dest.unlink()
-            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
-                msg.fail(
-                    "A git asset must include 'repo', the repository address.", exits=1
-                )
-            if "path" not in asset["git"] or asset["git"]["path"] is None:
-                msg.fail(
-                    "A git asset must include 'path' - use \"\" to get the entire repository.",
-                    exits=1,
-                )
-            git_checkout(
-                asset["git"]["repo"],
-                asset["git"]["path"],
-                dest,
-                branch=asset["git"].get("branch"),
-                sparse=sparse_checkout,
-            )
-            msg.good(f"Downloaded asset {dest}")
-        else:
-            url = asset.get("url")
-            if not url:
-                # project.yml defines asset without URL that the user has to place
-                check_private_asset(dest, checksum)
-                continue
-            fetch_asset(project_path, url, dest, checksum)
-
-
-def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
-    """Check and validate assets without a URL (private assets that the user
-    has to provide themselves) and give feedback about the checksum.
-
-    dest (Path): Destination path of the asset.
-    checksum (Optional[str]): Optional checksum of the expected file.
-    """
-    if not Path(dest).exists():
-        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
-        msg.warn(err)
-    else:
-        if not checksum:
-            msg.good(f"Asset already exists: {dest}")
-        elif checksum == get_checksum(dest):
-            msg.good(f"Asset exists with matching checksum: {dest}")
-        else:
-            msg.fail(f"Asset available but with incorrect checksum: {dest}")
-
-
-def fetch_asset(
-    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
-) -> None:
-    """Fetch an asset from a given URL or path. If a checksum is provided and a
-    local file exists, it's only re-downloaded if the checksum doesn't match.
-
-    project_path (Path): Path to project directory.
-    url (str): URL or path to asset.
-    checksum (Optional[str]): Optional expected checksum of local file.
-    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
-        the asset failed.
-    """
-    dest_path = (project_path / dest).resolve()
-    if dest_path.exists():
-        # If there's already a file, check for checksum
-        if checksum:
-            if checksum == get_checksum(dest_path):
-                msg.good(f"Skipping download with matching checksum: {dest}")
-                return
-        else:
-            # If there's not a checksum, make sure the file is a possibly valid size
-            if os.path.getsize(dest_path) == 0:
-                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
-                os.remove(dest_path)
-    # We might as well support the user here and create parent directories in
-    # case the asset dir isn't listed as a dir to create in the project.yml
-    if not dest_path.parent.exists():
-        dest_path.parent.mkdir(parents=True)
-    with working_dir(project_path):
-        url = convert_asset_url(url)
-        try:
-            download_file(url, dest_path)
-            msg.good(f"Downloaded asset {dest}")
-        except requests.exceptions.RequestException as e:
-            if Path(url).exists() and Path(url).is_file():
-                # If it's a local file, copy to destination
-                shutil.copy(url, str(dest_path))
-                msg.good(f"Copied local asset {dest}")
-            else:
-                msg.fail(f"Download failed: {dest}", e)
-    if checksum and checksum != get_checksum(dest_path):
-        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
-
-
-def convert_asset_url(url: str) -> str:
-    """Check and convert the asset URL if needed.
-
-    url (str): The asset URL.
-    RETURNS (str): The converted URL.
-    """
-    # If the asset URL is a regular GitHub URL it's likely a mistake
-    if (
-        re.match(r"(http(s?)):\/\/github.com", url)
-        and "releases/download" not in url
-        and "/raw/" not in url
-    ):
-        converted = url.replace("github.com", "raw.githubusercontent.com")
-        converted = re.sub(r"/(tree|blob)/", "/", converted)
-        msg.warn(
-            "Downloading from a regular GitHub URL. This will only download "
-            "the source of the page, not the actual file. Converting the URL "
-            "to a raw URL.",
-            converted,
-        )
-        return converted
-    return url
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@ -1,115 +0,0 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import subprocess
-import re
-
-from ... import about
-from ...util import ensure_path
-from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_checkout, get_git_version, git_repo_branch_exists
-
-DEFAULT_REPO = about.__projects__
-DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
-DEFAULT_BRANCHES = ["main", "master"]
-
-
-@project_cli.command("clone")
-def project_clone_cli(
-    # fmt: off
-    name: str = Arg(..., help="The name of the template to clone"),
-    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
-    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
-    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
-    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
-    # fmt: on
-):
-    """Clone a project template from a repository. Calls into "git" and will
-    only download the files from the given subdirectory. The GitHub repo
-    defaults to the official spaCy template repo, but can be customized
-    (including using a private repo).
-
-    DOCS: https://spacy.io/api/cli#project-clone
-    """
-    if dest is None:
-        dest = Path.cwd() / Path(name).parts[-1]
-    if repo == DEFAULT_REPO and branch is None:
-        branch = DEFAULT_PROJECTS_BRANCH
-
-    if branch is None:
-        for default_branch in DEFAULT_BRANCHES:
-            if git_repo_branch_exists(repo, default_branch):
-                branch = default_branch
-                break
-        if branch is None:
-            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
-            msg.fail(
-                "No branch provided and attempted default "
-                f"branches {default_branches_msg} do not exist.",
-                exits=1,
-            )
-    else:
-        if not git_repo_branch_exists(repo, branch):
-            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
-    assert isinstance(branch, str)
-    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
-
-
-def project_clone(
-    name: str,
-    dest: Path,
-    *,
-    repo: str = about.__projects__,
-    branch: str = about.__projects_branch__,
-    sparse_checkout: bool = False,
-) -> None:
-    """Clone a project template from a repository.
-
-    name (str): Name of subdirectory to clone.
-    dest (Path): Destination path of cloned project.
-    repo (str): URL of Git repo containing project templates.
-    branch (str): The branch to clone from
-    """
-    dest = ensure_path(dest)
-    check_clone(name, dest, repo)
-    project_dir = dest.resolve()
-    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
-    try:
-        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
-    except subprocess.CalledProcessError:
-        err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
-        msg.fail(err, exits=1)
-    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
-    if not (project_dir / PROJECT_FILE).exists():
-        msg.warn(f"No {PROJECT_FILE} found in directory")
-    else:
-        msg.good(f"Your project is now ready!")
-        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
-
-
-def check_clone(name: str, dest: Path, repo: str) -> None:
-    """Check and validate that the destination path can be used to clone. Will
-    check that Git is available and that the destination path is suitable.
-
-    name (str): Name of the directory to clone from the repo.
-    dest (Path): Local destination of cloned directory.
-    repo (str): URL of the repo to clone from.
-    """
-    git_err = (
-        f"Cloning spaCy project templates requires Git and the 'git' command. "
-        f"To clone a project without Git, copy the files from the '{name}' "
-        f"directory in the {repo} to {dest} manually."
-    )
-    get_git_version(error=git_err)
-    if not dest:
-        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
-    if dest.exists():
-        # Directory already exists (not allowed, clone needs to create it)
-        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
-    if not dest.parent.exists():
-        # We're not creating parents, parent dir should exist
-        msg.fail(
-            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
-            f"Create the necessary folder(s) first before continuing.",
-            exits=1,
-        )
--- a/spacy/cli/project/document.py
+++ b/spacy/cli/project/document.py
@ -1,115 +0,0 @@
-from pathlib import Path
-from wasabi import msg, MarkdownRenderer
-
-from ...util import working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-
-
-DOCS_URL = "https://spacy.io"
-INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
-project, as well as the available commands and workflows. For details, see the
-[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
-INTRO_COMMANDS = f"""The following commands are defined by the project. They
-can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
-Commands are only re-run if their inputs have changed."""
-INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
-can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
-and will run the specified commands in order. Commands are only re-run if their
-inputs have changed."""
-INTRO_ASSETS = f"""The following assets are defined by the project. They can
-be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
-in the project directory."""
-# These markers are added to the Markdown and can be used to update the file in
-# place if it already exists. Only the auto-generated part will be replaced.
-MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
-MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
-# If this marker is used in an existing README, it's ignored and not replaced
-MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
-
-
-@project_cli.command("document")
-def project_document_cli(
-    # fmt: off
-    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
-    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
-    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
-    # fmt: on
-):
-    """
-    Auto-generate a README.md for a project. If the content is saved to a file,
-    hidden markers are added so you can add custom content before or after the
-    auto-generated section and only the auto-generated docs will be replaced
-    when you re-run the command.
-
-    DOCS: https://spacy.io/api/cli#project-document
-    """
-    project_document(project_dir, output_file, no_emoji=no_emoji)
-
-
-def project_document(
-    project_dir: Path, output_file: Path, *, no_emoji: bool = False
-) -> None:
-    is_stdout = str(output_file) == "-"
-    config = load_project_config(project_dir)
-    md = MarkdownRenderer(no_emoji=no_emoji)
-    md.add(MARKER_START)
-    title = config.get("title")
-    description = config.get("description")
-    md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
-    if description:
-        md.add(description)
-    md.add(md.title(2, PROJECT_FILE, "📋"))
-    md.add(INTRO_PROJECT)
-    # Commands
-    cmds = config.get("commands", [])
-    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
-    if data:
-        md.add(md.title(3, "Commands", "⏯"))
-        md.add(INTRO_COMMANDS)
-        md.add(md.table(data, ["Command", "Description"]))
-    # Workflows
-    wfs = config.get("workflows", {}).items()
-    data = [(md.code(n), " &rarr; ".join(md.code(w) for w in stp)) for n, stp in wfs]
-    if data:
-        md.add(md.title(3, "Workflows", "⏭"))
-        md.add(INTRO_WORKFLOWS)
-        md.add(md.table(data, ["Workflow", "Steps"]))
-    # Assets
-    assets = config.get("assets", [])
-    data = []
-    for a in assets:
-        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
-        dest_path = a["dest"]
-        dest = md.code(dest_path)
-        if source == "Local":
-            # Only link assets if they're in the repo
-            with working_dir(project_dir) as p:
-                if (p / dest_path).exists():
-                    dest = md.link(dest, dest_path)
-        data.append((dest, source, a.get("description", "")))
-    if data:
-        md.add(md.title(3, "Assets", "🗂"))
-        md.add(INTRO_ASSETS)
-        md.add(md.table(data, ["File", "Source", "Description"]))
-    md.add(MARKER_END)
-    # Output result
-    if is_stdout:
-        print(md.text)
-    else:
-        content = md.text
-        if output_file.exists():
-            with output_file.open("r", encoding="utf8") as f:
-                existing = f.read()
-            if MARKER_IGNORE in existing:
-                msg.warn("Found ignore marker in existing file: skipping", output_file)
-                return
-            if MARKER_START in existing and MARKER_END in existing:
-                msg.info("Found existing file: only replacing auto-generated docs")
-                before = existing.split(MARKER_START)[0]
-                after = existing.split(MARKER_END)[1]
-                content = f"{before}{content}{after}"
-            else:
-                msg.warn("Replacing existing file")
-        with output_file.open("w", encoding="utf8") as f:
-            f.write(content)
-        msg.good("Saved project documentation", output_file)
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@ -1,207 +0,0 @@
-"""This module contains helpers and subcommands for integrating spaCy projects
-with Data Version Controk (DVC). https://dvc.org"""
-from typing import Dict, Any, List, Optional, Iterable
-import subprocess
-from pathlib import Path
-from wasabi import msg
-
-from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
-from .._util import Arg, Opt, NAME, COMMAND
-from ...util import working_dir, split_command, join_command, run_command
-from ...util import SimpleFrozenList
-
-
-DVC_CONFIG = "dvc.yaml"
-DVC_DIR = ".dvc"
-UPDATE_COMMAND = "dvc"
-DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
-# edited your {PROJECT_FILE}, you can regenerate this file by running:
-# {COMMAND} project {UPDATE_COMMAND}"""
-
-
-@project_cli.command(UPDATE_COMMAND)
-def project_update_dvc_cli(
-    # fmt: off
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
-    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
-    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
-    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
-    # fmt: on
-):
-    """Auto-generate Data Version Control (DVC) config. A DVC
-    project can only define one pipeline, so you need to specify one workflow
-    defined in the project.yml. If no workflow is specified, the first defined
-    workflow is used. The DVC config will only be updated if the project.yml
-    changed.
-
-    DOCS: https://spacy.io/api/cli#project-dvc
-    """
-    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
-
-
-def project_update_dvc(
-    project_dir: Path,
-    workflow: Optional[str] = None,
-    *,
-    verbose: bool = False,
-    quiet: bool = False,
-    force: bool = False,
-) -> None:
-    """Update the auto-generated Data Version Control (DVC) config file. A DVC
-    project can only define one pipeline, so you need to specify one workflow
-    defined in the project.yml. Will only update the file if the checksum changed.
-
-    project_dir (Path): The project directory.
-    workflow (Optional[str]): Optional name of workflow defined in project.yml.
-        If not set, the first workflow will be used.
-    verbose (bool): Print more info.
-    quiet (bool): Print less info.
-    force (bool): Force update DVC config.
-    """
-    config = load_project_config(project_dir)
-    updated = update_dvc_config(
-        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
-    )
-    help_msg = "To execute the workflow with DVC, run: dvc repro"
-    if updated:
-        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
-    else:
-        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
-
-
-def update_dvc_config(
-    path: Path,
-    config: Dict[str, Any],
-    workflow: Optional[str] = None,
-    verbose: bool = False,
-    quiet: bool = False,
-    force: bool = False,
-) -> bool:
-    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
-    project directory. The file is auto-generated based on the config. The
-    first line of the auto-generated file specifies the hash of the config
-    dict, so if any of the config values change, the DVC config is regenerated.
-
-    path (Path): The path to the project directory.
-    config (Dict[str, Any]): The loaded project.yml.
-    verbose (bool): Whether to print additional info (via DVC).
-    quiet (bool): Don't output anything (via DVC).
-    force (bool): Force update, even if hashes match.
-    RETURNS (bool): Whether the DVC config file was updated.
-    """
-    ensure_dvc(path)
-    workflows = config.get("workflows", {})
-    workflow_names = list(workflows.keys())
-    check_workflows(workflow_names, workflow)
-    if not workflow:
-        workflow = workflow_names[0]
-    config_hash = get_hash(config)
-    path = path.resolve()
-    dvc_config_path = path / DVC_CONFIG
-    if dvc_config_path.exists():
-        # Check if the file was generated using the current config, if not, redo
-        with dvc_config_path.open("r", encoding="utf8") as f:
-            ref_hash = f.readline().strip().replace("# ", "")
-        if ref_hash == config_hash and not force:
-            return False  # Nothing has changed in project.yml, don't need to update
-        dvc_config_path.unlink()
-    dvc_commands = []
-    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
-
-    # some flags that apply to every command
-    flags = []
-    if verbose:
-        flags.append("--verbose")
-    if quiet:
-        flags.append("--quiet")
-
-    for name in workflows[workflow]:
-        command = config_commands[name]
-        deps = command.get("deps", [])
-        outputs = command.get("outputs", [])
-        outputs_no_cache = command.get("outputs_no_cache", [])
-        if not deps and not outputs and not outputs_no_cache:
-            continue
-        # Default to the working dir as the project path since dvc.yaml is auto-generated
-        # and we don't want arbitrary paths in there
-        project_cmd = ["python", "-m", NAME, "project", "run", name]
-        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
-        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
-        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
-
-        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
-        if command.get("no_skip"):
-            dvc_cmd.append("--always-changed")
-        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
-        dvc_commands.append(join_command(full_cmd))
-
-    if not dvc_commands:
-        # If we don't check for this, then there will be an error when reading the
-        # config, since DVC wouldn't create it.
-        msg.fail(
-            "No usable commands for DVC found. This can happen if none of your "
-            "commands have dependencies or outputs.",
-            exits=1,
-        )
-
-    with working_dir(path):
-        for c in dvc_commands:
-            dvc_command = "dvc " + c
-            run_command(dvc_command)
-    with dvc_config_path.open("r+", encoding="utf8") as f:
-        content = f.read()
-        f.seek(0, 0)
-        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
-    return True
-
-
-def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
-    """Validate workflows provided in project.yml and check that a given
-    workflow can be used to generate a DVC config.
-
-    workflows (List[str]): Names of the available workflows.
-    workflow (Optional[str]): The name of the workflow to convert.
-    """
-    if not workflows:
-        msg.fail(
-            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
-            f"define at least one list of commands.",
-            exits=1,
-        )
-    if workflow is not None and workflow not in workflows:
-        msg.fail(
-            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
-            f"Available workflows: {', '.join(workflows)}",
-            exits=1,
-        )
-    if not workflow:
-        msg.warn(
-            f"No workflow specified for DVC pipeline. Using the first workflow "
-            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
-        )
-
-
-def ensure_dvc(project_dir: Path) -> None:
-    """Ensure that the "dvc" command is available and that the current project
-    directory is an initialized DVC project.
-    """
-    try:
-        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
-    except Exception:
-        msg.fail(
-            "To use spaCy projects with DVC (Data Version Control), DVC needs "
-            "to be installed and the 'dvc' command needs to be available",
-            "You can install the Python package from pip (pip install dvc) or "
-            "conda (conda install -c conda-forge dvc). For more details, see the "
-            "documentation: https://dvc.org/doc/install",
-            exits=1,
-        )
-    if not (project_dir / ".dvc").exists():
-        msg.fail(
-            "Project not initialized as a DVC project",
-            "To initialize a DVC project, you can run 'dvc init' in the project "
-            "directory. For more details, see the documentation: "
-            "https://dvc.org/doc/command-reference/init",
-            exits=1,
-        )
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@ -1,67 +0,0 @@
-from pathlib import Path
-from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_command_hash
-from .._util import project_cli, Arg, logger
-from .._util import load_project_config
-from .run import update_lockfile
-
-
-@project_cli.command("pull")
-def project_pull_cli(
-    # fmt: off
-    remote: str = Arg("default", help="Name or path of remote storage"),
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    # fmt: on
-):
-    """Retrieve available precomputed outputs from a remote storage.
-    You can alias remotes in your project.yml by mapping them to storage paths.
-    A storage can be anything that the smart-open library can upload to, e.g.
-    AWS, Google Cloud Storage, SSH, local directories etc.
-
-    DOCS: https://spacy.io/api/cli#project-pull
-    """
-    for url, output_path in project_pull(project_dir, remote):
-        if url is not None:
-            msg.good(f"Pulled {output_path} from {url}")
-
-
-def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
-    # TODO: We don't have tests for this :(. It would take a bit of mockery to
-    # set up. I guess see if it breaks first?
-    config = load_project_config(project_dir)
-    if remote in config.get("remotes", {}):
-        remote = config["remotes"][remote]
-    storage = RemoteStorage(project_dir, remote)
-    commands = list(config.get("commands", []))
-    # We use a while loop here because we don't know how the commands
-    # will be ordered. A command might need dependencies from one that's later
-    # in the list.
-    while commands:
-        for i, cmd in enumerate(list(commands)):
-            logger.debug("CMD: %s.", cmd["name"])
-            deps = [project_dir / dep for dep in cmd.get("deps", [])]
-            if all(dep.exists() for dep in deps):
-                cmd_hash = get_command_hash("", "", deps, cmd["script"])
-                for output_path in cmd.get("outputs", []):
-                    url = storage.pull(output_path, command_hash=cmd_hash)
-                    logger.debug(
-                        "URL: %s for %s with command hash %s",
-                        url,
-                        output_path,
-                        cmd_hash,
-                    )
-                    yield url, output_path
-
-                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
-                if all(loc.exists() for loc in out_locs):
-                    update_lockfile(project_dir, cmd)
-                # We remove the command from the list here, and break, so that
-                # we iterate over the loop again.
-                commands.pop(i)
-                break
-            else:
-                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
-        else:
-            # If we didn't break the for loop, break the while loop.
-            break
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@ -1,69 +0,0 @@
-from pathlib import Path
-from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_content_hash, get_command_hash
-from .._util import load_project_config
-from .._util import project_cli, Arg, logger
-
-
-@project_cli.command("push")
-def project_push_cli(
-    # fmt: off
-    remote: str = Arg("default", help="Name or path of remote storage"),
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    # fmt: on
-):
-    """Persist outputs to a remote storage. You can alias remotes in your
-    project.yml by mapping them to storage paths. A storage can be anything that
-    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
-    local directories etc.
-
-    DOCS: https://spacy.io/api/cli#project-push
-    """
-    for output_path, url in project_push(project_dir, remote):
-        if url is None:
-            msg.info(f"Skipping {output_path}")
-        else:
-            msg.good(f"Pushed {output_path} to {url}")
-
-
-def project_push(project_dir: Path, remote: str):
-    """Persist outputs to a remote storage. You can alias remotes in your project.yml
-    by mapping them to storage paths. A storage can be anything that the smart-open
-    library can upload to, e.g. gcs, aws, ssh, local directories etc
-    """
-    config = load_project_config(project_dir)
-    if remote in config.get("remotes", {}):
-        remote = config["remotes"][remote]
-    storage = RemoteStorage(project_dir, remote)
-    for cmd in config.get("commands", []):
-        logger.debug("CMD: %s", cmd["name"])
-        deps = [project_dir / dep for dep in cmd.get("deps", [])]
-        if any(not dep.exists() for dep in deps):
-            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
-            continue
-        cmd_hash = get_command_hash(
-            "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
-        )
-        logger.debug("CMD_HASH: %s", cmd_hash)
-        for output_path in cmd.get("outputs", []):
-            output_loc = project_dir / output_path
-            if output_loc.exists() and _is_not_empty_dir(output_loc):
-                url = storage.push(
-                    output_path,
-                    command_hash=cmd_hash,
-                    content_hash=get_content_hash(output_loc),
-                )
-                logger.debug(
-                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
-                )
-                yield output_path, url
-
-
-def _is_not_empty_dir(loc: Path):
-    if not loc.is_dir():
-        return True
-    elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
-        return True
-    else:
-        return False
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@ -1,205 +0,0 @@
-from typing import Optional, List, Dict, TYPE_CHECKING
-import os
-import site
-import hashlib
-import urllib.parse
-import tarfile
-from pathlib import Path
-from wasabi import msg
-
-from .._util import get_hash, get_checksum, upload_file, download_file
-from .._util import ensure_pathy, make_tempdir
-from ...util import get_minor_version, ENV_VARS, check_bool_env_var
-from ...git_info import GIT_VERSION
-from ... import about
-from ...errors import Errors
-
-if TYPE_CHECKING:
-    from pathy import FluidPath  # noqa: F401
-
-
-class RemoteStorage:
-    """Push and pull outputs to and from a remote file storage.
-
-    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
-    ssh, etc.
-    """
-
-    def __init__(self, project_root: Path, url: str, *, compression="gz"):
-        self.root = project_root
-        self.url = ensure_pathy(url)
-        self.compression = compression
-
-    def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
-        """Compress a file or directory within a project and upload it to a remote
-        storage. If an object exists at the full URL, nothing is done.
-
-        Within the remote storage, files are addressed by their project path
-        (url encoded) and two user-supplied hashes, representing their creation
-        context and their file contents. If the URL already exists, the data is
-        not uploaded. Paths are archived and compressed prior to upload.
-        """
-        loc = self.root / path
-        if not loc.exists():
-            raise IOError(f"Cannot push {loc}: does not exist.")
-        url = self.make_url(path, command_hash, content_hash)
-        if url.exists():
-            return url
-        tmp: Path
-        with make_tempdir() as tmp:
-            tar_loc = tmp / self.encode_name(str(path))
-            mode_string = f"w:{self.compression}" if self.compression else "w"
-            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
-                tar_file.add(str(loc), arcname=str(path))
-            upload_file(tar_loc, url)
-        return url
-
-    def pull(
-        self,
-        path: Path,
-        *,
-        command_hash: Optional[str] = None,
-        content_hash: Optional[str] = None,
-    ) -> Optional["FluidPath"]:
-        """Retrieve a file from the remote cache. If the file already exists,
-        nothing is done.
-
-        If the command_hash and/or content_hash are specified, only matching
-        results are returned. If no results are available, an error is raised.
-        """
-        dest = self.root / path
-        if dest.exists():
-            return None
-        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
-        if url is None:
-            return url
-        else:
-            # Make sure the destination exists
-            if not dest.parent.exists():
-                dest.parent.mkdir(parents=True)
-            tmp: Path
-            with make_tempdir() as tmp:
-                tar_loc = tmp / url.parts[-1]
-                download_file(url, tar_loc)
-                mode_string = f"r:{self.compression}" if self.compression else "r"
-                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
-                    # This requires that the path is added correctly, relative
-                    # to root. This is how we set things up in push()
-
-                    # Disallow paths outside the current directory for the tar
-                    # file (CVE-2007-4559, directory traversal vulnerability)
-                    def is_within_directory(directory, target):
-                        abs_directory = os.path.abspath(directory)
-                        abs_target = os.path.abspath(target)
-                        prefix = os.path.commonprefix([abs_directory, abs_target])
-                        return prefix == abs_directory
-
-                    def safe_extract(tar, path):
-                        for member in tar.getmembers():
-                            member_path = os.path.join(path, member.name)
-                            if not is_within_directory(path, member_path):
-                                raise ValueError(Errors.E852)
-                        tar.extractall(path)
-
-                    safe_extract(tar_file, self.root)
-        return url
-
-    def find(
-        self,
-        path: Path,
-        *,
-        command_hash: Optional[str] = None,
-        content_hash: Optional[str] = None,
-    ) -> Optional["FluidPath"]:
-        """Find the best matching version of a file within the storage,
-        or `None` if no match can be found. If both the creation and content hash
-        are specified, only exact matches will be returned. Otherwise, the most
-        recent matching file is preferred.
-        """
-        name = self.encode_name(str(path))
-        urls = []
-        if command_hash is not None and content_hash is not None:
-            url = self.url / name / command_hash / content_hash
-            urls = [url] if url.exists() else []
-        elif command_hash is not None:
-            if (self.url / name / command_hash).exists():
-                urls = list((self.url / name / command_hash).iterdir())
-        else:
-            if (self.url / name).exists():
-                for sub_dir in (self.url / name).iterdir():
-                    urls.extend(sub_dir.iterdir())
-                if content_hash is not None:
-                    urls = [url for url in urls if url.parts[-1] == content_hash]
-        if len(urls) >= 2:
-            try:
-                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
-            except Exception:
-                msg.warn(
-                    "Unable to sort remote files by last modified. The file(s) "
-                    "pulled from the cache may not be the most recent."
-                )
-        return urls[-1] if urls else None
-
-    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
-        """Construct a URL from a subpath, a creation hash and a content hash."""
-        return self.url / self.encode_name(str(path)) / command_hash / content_hash
-
-    def encode_name(self, name: str) -> str:
-        """Encode a subpath into a URL-safe name."""
-        return urllib.parse.quote_plus(name)
-
-
-def get_content_hash(loc: Path) -> str:
-    return get_checksum(loc)
-
-
-def get_command_hash(
-    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
-) -> str:
-    """Create a hash representing the execution of a command. This includes the
-    currently installed packages, whatever environment variables have been marked
-    as relevant, and the command.
-    """
-    if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
-        spacy_v = GIT_VERSION
-    else:
-        spacy_v = str(get_minor_version(about.__version__) or "")
-    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
-    hashes = [spacy_v, site_hash, env_hash] + dep_checksums
-    hashes.extend(cmd)
-    creation_bytes = "".join(hashes).encode("utf8")
-    return hashlib.md5(creation_bytes).hexdigest()
-
-
-def get_site_hash():
-    """Hash the current Python environment's site-packages contents, including
-    the name and version of the libraries. The list we're hashing is what
-    `pip freeze` would output.
-    """
-    site_dirs = site.getsitepackages()
-    if site.ENABLE_USER_SITE:
-        site_dirs.extend(site.getusersitepackages())
-    packages = set()
-    for site_dir in site_dirs:
-        site_dir = Path(site_dir)
-        for subpath in site_dir.iterdir():
-            if subpath.parts[-1].endswith("dist-info"):
-                packages.add(subpath.parts[-1].replace(".dist-info", ""))
-    package_bytes = "".join(sorted(packages)).encode("utf8")
-    return hashlib.md5sum(package_bytes).hexdigest()
-
-
-def get_env_hash(env: Dict[str, str]) -> str:
-    """Construct a hash of the environment variables that will be passed into
-    the commands.
-
-    Values in the env dict may be references to the current os.environ, using
-    the syntax $ENV_VAR to mean os.environ[ENV_VAR]
-    """
-    env_vars = {}
-    for key, value in env.items():
-        if value.startswith("$"):
-            env_vars[key] = os.environ.get(value[1:], "")
-        else:
-            env_vars[key] = value
-    return get_hash(env_vars)
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@ -1,360 +0,0 @@
-from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
-import os.path
-from pathlib import Path
-
-from wasabi import msg
-from wasabi.util import locale_escape
-import sys
-import srsly
-import typer
-
-from ... import about
-from ...git_info import GIT_VERSION
-from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
-from ...util import check_bool_env_var, SimpleFrozenDict
-from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
-from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
-
-
-@project_cli.command(
-    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
-)
-def project_run_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
-    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
-    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
-    # fmt: on
-):
-    """Run a named command or workflow defined in the project.yml. If a workflow
-    name is specified, all commands in the workflow are run, in order. If
-    commands define dependencies and/or outputs, they will only be re-run if
-    state has changed.
-
-    DOCS: https://spacy.io/api/cli#project-run
-    """
-    if show_help or not subcommand:
-        print_run_help(project_dir, subcommand)
-    else:
-        overrides = parse_config_overrides(ctx.args)
-        project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
-
-
-def project_run(
-    project_dir: Path,
-    subcommand: str,
-    *,
-    overrides: Dict[str, Any] = SimpleFrozenDict(),
-    force: bool = False,
-    dry: bool = False,
-    capture: bool = False,
-    skip_requirements_check: bool = False,
-) -> None:
-    """Run a named script defined in the project.yml. If the script is part
-    of the default pipeline (defined in the "run" section), DVC is used to
-    execute the command, so it can determine whether to rerun it. It then
-    calls into "exec" to execute it.
-
-    project_dir (Path): Path to project directory.
-    subcommand (str): Name of command to run.
-    overrides (Dict[str, Any]): Optional config overrides.
-    force (bool): Force re-running, even if nothing changed.
-    dry (bool): Perform a dry run and don't execute commands.
-    capture (bool): Whether to capture the output and errors of individual commands.
-        If False, the stdout and stderr will not be redirected, and if there's an error,
-        sys.exit will be called with the return code. You should use capture=False
-        when you want to turn over execution to the command, and capture=True
-        when you want to run the command more like a function.
-    skip_requirements_check (bool): Whether to skip the requirements check.
-    """
-    config = load_project_config(project_dir, overrides=overrides)
-    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
-    workflows = config.get("workflows", {})
-    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
-
-    req_path = project_dir / "requirements.txt"
-    if not skip_requirements_check:
-        if config.get("check_requirements", True) and os.path.exists(req_path):
-            with req_path.open() as requirements_file:
-                _check_requirements([req.strip() for req in requirements_file])
-
-    if subcommand in workflows:
-        msg.info(f"Running workflow '{subcommand}'")
-        for cmd in workflows[subcommand]:
-            project_run(
-                project_dir,
-                cmd,
-                overrides=overrides,
-                force=force,
-                dry=dry,
-                capture=capture,
-                skip_requirements_check=True,
-            )
-    else:
-        cmd = commands[subcommand]
-        for dep in cmd.get("deps", []):
-            if not (project_dir / dep).exists():
-                err = f"Missing dependency specified by command '{subcommand}': {dep}"
-                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
-                err_exits = 1 if not dry else None
-                msg.fail(err, err_help, exits=err_exits)
-        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
-        with working_dir(project_dir) as current_dir:
-            msg.divider(subcommand)
-            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
-            if not rerun and not force:
-                msg.info(f"Skipping '{cmd['name']}': nothing changed")
-            else:
-                run_commands(cmd["script"], dry=dry, capture=capture)
-                if not dry:
-                    update_lockfile(current_dir, cmd)
-
-
-def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
-    """Simulate a CLI help prompt using the info available in the project.yml.
-
-    project_dir (Path): The project directory.
-    subcommand (Optional[str]): The subcommand or None. If a subcommand is
-        provided, the subcommand help is shown. Otherwise, the top-level help
-        and a list of available commands is printed.
-    """
-    config = load_project_config(project_dir)
-    config_commands = config.get("commands", [])
-    commands = {cmd["name"]: cmd for cmd in config_commands}
-    workflows = config.get("workflows", {})
-    project_loc = "" if is_cwd(project_dir) else project_dir
-    if subcommand:
-        validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
-        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
-        if subcommand in commands:
-            help_text = commands[subcommand].get("help")
-            if help_text:
-                print(f"\n{help_text}\n")
-        elif subcommand in workflows:
-            steps = workflows[subcommand]
-            print(f"\nWorkflow consisting of {len(steps)} commands:")
-            steps_data = [
-                (f"{i + 1}. {step}", commands[step].get("help", ""))
-                for i, step in enumerate(steps)
-            ]
-            msg.table(steps_data)
-            help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
-            print(f"For command details, run: {help_cmd}")
-    else:
-        print("")
-        title = config.get("title")
-        if title:
-            print(f"{locale_escape(title)}\n")
-        if config_commands:
-            print(f"Available commands in {PROJECT_FILE}")
-            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
-            msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
-        if workflows:
-            print(f"Available workflows in {PROJECT_FILE}")
-            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
-            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
-
-
-def run_commands(
-    commands: Iterable[str] = SimpleFrozenList(),
-    silent: bool = False,
-    dry: bool = False,
-    capture: bool = False,
-) -> None:
-    """Run a sequence of commands in a subprocess, in order.
-
-    commands (List[str]): The string commands.
-    silent (bool): Don't print the commands.
-    dry (bool): Perform a dry run and don't execut anything.
-    capture (bool): Whether to capture the output and errors of individual commands.
-        If False, the stdout and stderr will not be redirected, and if there's an error,
-        sys.exit will be called with the return code. You should use capture=False
-        when you want to turn over execution to the command, and capture=True
-        when you want to run the command more like a function.
-    """
-    for c in commands:
-        command = split_command(c)
-        # Not sure if this is needed or a good idea. Motivation: users may often
-        # use commands in their config that reference "python" and we want to
-        # make sure that it's always executing the same Python that spaCy is
-        # executed with and the pip in the same env, not some other Python/pip.
-        # Also ensures cross-compatibility if user 1 writes "python3" (because
-        # that's how it's set up on their system), and user 2 without the
-        # shortcut tries to re-run the command.
-        if len(command) and command[0] in ("python", "python3"):
-            command[0] = sys.executable
-        elif len(command) and command[0] in ("pip", "pip3"):
-            command = [sys.executable, "-m", "pip", *command[1:]]
-        if not silent:
-            print(f"Running command: {join_command(command)}")
-        if not dry:
-            run_command(command, capture=capture)
-
-
-def validate_subcommand(
-    commands: Sequence[str], workflows: Sequence[str], subcommand: str
-) -> None:
-    """Check that a subcommand is valid and defined. Raises an error otherwise.
-
-    commands (Sequence[str]): The available commands.
-    subcommand (str): The subcommand.
-    """
-    if not commands and not workflows:
-        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
-    if subcommand not in commands and subcommand not in workflows:
-        help_msg = []
-        if subcommand in ["assets", "asset"]:
-            help_msg.append("Did you mean to run: python -m spacy project assets?")
-        if commands:
-            help_msg.append(f"Available commands: {', '.join(commands)}")
-        if workflows:
-            help_msg.append(f"Available workflows: {', '.join(workflows)}")
-        msg.fail(
-            f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
-            ". ".join(help_msg),
-            exits=1,
-        )
-
-
-def check_rerun(
-    project_dir: Path,
-    command: Dict[str, Any],
-    *,
-    check_spacy_version: bool = True,
-    check_spacy_commit: bool = False,
-) -> bool:
-    """Check if a command should be rerun because its settings or inputs/outputs
-    changed.
-
-    project_dir (Path): The current project directory.
-    command (Dict[str, Any]): The command, as defined in the project.yml.
-    strict_version (bool):
-    RETURNS (bool): Whether to re-run the command.
-    """
-    # Always rerun if no-skip is set
-    if command.get("no_skip", False):
-        return True
-    lock_path = project_dir / PROJECT_LOCK
-    if not lock_path.exists():  # We don't have a lockfile, run command
-        return True
-    data = srsly.read_yaml(lock_path)
-    if command["name"] not in data:  # We don't have info about this command
-        return True
-    entry = data[command["name"]]
-    # Always run commands with no outputs (otherwise they'd always be skipped)
-    if not entry.get("outs", []):
-        return True
-    # Always rerun if spaCy version or commit hash changed
-    spacy_v = entry.get("spacy_version")
-    commit = entry.get("spacy_git_version")
-    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
-        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
-        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
-        return True
-    if check_spacy_commit and commit != GIT_VERSION:
-        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
-        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
-        return True
-    # If the entry in the lockfile matches the lockfile entry that would be
-    # generated from the current command, we don't rerun because it means that
-    # all inputs/outputs, hashes and scripts are the same and nothing changed
-    lock_entry = get_lock_entry(project_dir, command)
-    exclude = ["spacy_version", "spacy_git_version"]
-    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
-
-
-def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
-    """Update the lockfile after running a command. Will create a lockfile if
-    it doesn't yet exist and will add an entry for the current command, its
-    script and dependencies/outputs.
-
-    project_dir (Path): The current project directory.
-    command (Dict[str, Any]): The command, as defined in the project.yml.
-    """
-    lock_path = project_dir / PROJECT_LOCK
-    if not lock_path.exists():
-        srsly.write_yaml(lock_path, {})
-        data = {}
-    else:
-        data = srsly.read_yaml(lock_path)
-    data[command["name"]] = get_lock_entry(project_dir, command)
-    srsly.write_yaml(lock_path, data)
-
-
-def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
-    """Get a lockfile entry for a given command. An entry includes the command,
-    the script (command steps) and a list of dependencies and outputs with
-    their paths and file hashes, if available. The format is based on the
-    dvc.lock files, to keep things consistent.
-
-    project_dir (Path): The current project directory.
-    command (Dict[str, Any]): The command, as defined in the project.yml.
-    RETURNS (Dict[str, Any]): The lockfile entry.
-    """
-    deps = get_fileinfo(project_dir, command.get("deps", []))
-    outs = get_fileinfo(project_dir, command.get("outputs", []))
-    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
-    return {
-        "cmd": f"{COMMAND} run {command['name']}",
-        "script": command["script"],
-        "deps": deps,
-        "outs": [*outs, *outs_nc],
-        "spacy_version": about.__version__,
-        "spacy_git_version": GIT_VERSION,
-    }
-
-
-def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
-    """Generate the file information for a list of paths (dependencies, outputs).
-    Includes the file path and the file's checksum.
-
-    project_dir (Path): The current project directory.
-    paths (List[str]): The file paths.
-    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
-    """
-    data = []
-    for path in paths:
-        file_path = project_dir / path
-        md5 = get_checksum(file_path) if file_path.exists() else None
-        data.append({"path": path, "md5": md5})
-    return data
-
-
-def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
-    """Checks whether requirements are installed and free of version conflicts.
-    requirements (List[str]): List of requirements.
-    RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
-        exist.
-    """
-    import pkg_resources
-
-    failed_pkgs_msgs: List[str] = []
-    conflicting_pkgs_msgs: List[str] = []
-
-    for req in requirements:
-        try:
-            pkg_resources.require(req)
-        except pkg_resources.DistributionNotFound as dnf:
-            failed_pkgs_msgs.append(dnf.report())
-        except pkg_resources.VersionConflict as vc:
-            conflicting_pkgs_msgs.append(vc.report())
-        except Exception:
-            msg.warn(
-                f"Unable to check requirement: {req} "
-                "Checks are currently limited to requirement specifiers "
-                "(PEP 508)"
-            )
-
-    if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
-        msg.warn(
-            title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
-            "correctly and you installed all requirements specified in your project's requirements.txt: "
-        )
-        for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
-            msg.text(pgk_msg)
-
-    return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
 {%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
 [paths]
 train = null
 dev = null
@ -28,7 +28,7 @@ lang = "{{ lang }}"
 tok2vec/transformer. #}
 {%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
 {%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@ -127,6 +127,30 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
 {% endif -%}

+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = 25
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
 {% if "spancat" in components -%}
 [components.spancat]
 factory = "spancat"
@ -392,6 +416,27 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 {% endif %}

+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = 25
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
 {% if "spancat" in components %}
 [components.spancat]
 factory = "spancat"
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -1,15 +1,23 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
 import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
 from .. import util
+from ..training.initialize import init_nlp
+from ..training.loop import train as train_nlp
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)


@app.command(
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@ -1,14 +1,21 @@
-from typing import Tuple
-from pathlib import Path
 import sys
-import requests
-from wasabi import msg, Printer
 import warnings
+from pathlib import Path
+from typing import Tuple
+
+import requests
+from wasabi import Printer, msg

-from ._util import app
 from .. import about
-from ..util import get_package_version, get_installed_models, get_minor_version
-from ..util import get_package_path, get_model_meta, is_compatible_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_model_meta,
+    get_package_path,
+    get_package_version,
+    is_compatible_version,
+)
+from ._util import app


@app.command("validate")
--- a/spacy/compat.py
+++ b/spacy/compat.py
@ -1,5 +1,6 @@
 """Helpers for Python and platform compatibility."""
 import sys
+
 from thinc.util import copy_array

 try:
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -26,6 +26,9 @@ batch_size = 1000
 [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

+[nlp.vectors]
+@vectors = "spacy.Vectors.v1"
+
 # The pipeline components and their models
 [components]

--- a/spacy/displacy/init.py
+++ b/spacy/displacy/init.py
@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
-from typing import Union, Iterable, Optional, Dict, Any, Callable
 import warnings
+from typing import Any, Callable, Dict, Iterable, Optional, Union

-from .render import DependencyRenderer, EntityRenderer, SpanRenderer
-from ..tokens import Doc, Span
 from ..errors import Errors, Warnings
-from ..util import is_in_jupyter
-from ..util import find_available_port
-
+from ..tokens import Doc, Span
+from ..util import find_available_port, is_in_jupyter
+from .render import DependencyRenderer, EntityRenderer, SpanRenderer

 _html = {}
 RENDER_WRAPPER = None
@ -68,7 +66,7 @@ def render(
    if jupyter or (jupyter is None and is_in_jupyter()):
        # return HTML rendered by IPython display()
        # See #4840 for details on span wrapper to disable mathjax
-        from IPython.core.display import display, HTML
+        from IPython.core.display import HTML, display

        return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
    return html
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@ -1,15 +1,28 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
 import uuid
-import itertools
+from typing import Any, Dict, List, Optional, Tuple, Union

 from ..errors import Errors
 from ..util import escape_html, minify_html, registry
-from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
-from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
-from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
-from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
-from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
-from .templates import TPL_TITLE
+from .templates import (
+    TPL_DEP_ARCS,
+    TPL_DEP_SVG,
+    TPL_DEP_WORDS,
+    TPL_DEP_WORDS_LEMMA,
+    TPL_ENT,
+    TPL_ENT_RTL,
+    TPL_ENTS,
+    TPL_FIGURE,
+    TPL_KB_LINK,
+    TPL_PAGE,
+    TPL_SPAN,
+    TPL_SPAN_RTL,
+    TPL_SPAN_SLICE,
+    TPL_SPAN_SLICE_RTL,
+    TPL_SPAN_START,
+    TPL_SPAN_START_RTL,
+    TPL_SPANS,
+    TPL_TITLE,
+)

 DEFAULT_LANG = "en"
 DEFAULT_DIR = "ltr"
@ -204,7 +217,7 @@ class SpanRenderer:
                    + (self.offset_step * (len(entities) - 1))
                )
                markup += self.span_template.format(
-                    text=token["text"],
+                    text=escape_html(token["text"]),
                    span_slices=slices,
                    span_starts=starts,
                    total_height=total_height,
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -1,4 +1,5 @@
 import warnings
+
 from .compat import Literal


@ -215,6 +216,9 @@ class Warnings(metaclass=ErrorsWithCodes):
    W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
            "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
    W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+    W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
+            "key attribute for vectors, configure it through Vectors(attr=) or "
+            "'spacy init vectors --attr'")


 class Errors(metaclass=ErrorsWithCodes):
@ -549,12 +553,12 @@ class Errors(metaclass=ErrorsWithCodes):
            "during training, make sure to include it in 'annotating components'")

    # New errors added in v3.x
+    E849 = ("The vocab only supports {method} for vectors of type "
+            "spacy.vectors.Vectors, not {vectors_type}.")
    E850 = ("The PretrainVectors objective currently only supports default or "
            "floret vectors, not {mode} vectors.")
    E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
            "but found value of '{val}'.")
-    E852 = ("The tar file pulled from the remote attempted an unsafe path "
-            "traversal.")
    E853 = ("Unsupported component factory name '{name}'. The character '.' is "
            "not permitted in factory names.")
    E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
@ -738,8 +742,8 @@ class Errors(metaclass=ErrorsWithCodes):
            "model from a shortcut, which is obsolete as of spaCy v3.0. To "
            "load the model, use its full name instead:\n\n"
            "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
-            "models, see the models directory: https://spacy.io/models. If you "
-            "want to create a blank model, use spacy.blank: "
+            "models, see the models directory: https://spacy.io/models and if "
+            "you want to create a blank model, use spacy.blank: "
            "nlp = spacy.blank(\"{name}\")")
    E942 = ("Executing `after_{name}` callback failed. Expected the function to "
            "return an initialized nlp object but got: {value}. Maybe "
@ -970,6 +974,15 @@ class Errors(metaclass=ErrorsWithCodes):
    E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
             "or use `auto_select_port=True` to pick an available port automatically.")
    E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
+    E1052 = ("Unable to copy spans: the character offsets for the span at "
+             "index {i} in the span group do not align with the tokenization "
+             "in the target doc.")
+    E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+             " 'min_length': {min_length}, 'max_length': {max_length}")
+    E1054 = ("The text, including whitespace, must match between reference and "
+             "predicted docs when training {component}.")
+    E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
+             "but only callbacks with one or three parameters are supported")


 # Deprecated model shortcuts, only used in errors and warnings
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -1,4 +1,5 @@
 import warnings
+
 from .errors import Warnings


--- a/spacy/kb/init.py
+++ b/spacy/kb/init.py
@ -1,3 +1,3 @@
+from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, get_candidates, get_candidates_batch
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@ -1,8 +1,11 @@
-from .kb cimport KnowledgeBase
 from libcpp.vector cimport vector
-from ..typedefs cimport hash_t

-# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
+from ..typedefs cimport hash_t
+from .kb cimport KnowledgeBase
+
+
+# Object used by the Entity Linker that summarizes one entity-alias candidate
+# combination.
 cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@ -1,19 +1,31 @@
 # cython: infer_types=True, profile=True

 from typing import Iterable
+
 from .kb cimport KnowledgeBase
+
 from ..tokens import Span

+
 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
-    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
-    algorithm which will disambiguate the various candidates to the correct one.
+    """A `Candidate` object refers to a textual mention (`alias`) that may or
+    may not be resolved to a specific `entity` from a Knowledge Base. This
+    will be used as input for the entity linking algorithm which will
+    disambiguate the various candidates to the correct one.
    Each candidate (alias, entity) pair is assigned a certain prior probability.

    DOCS: https://spacy.io/api/kb/#candidate-init
    """

-    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
+    def __init__(
+        self,
+        KnowledgeBase kb,
+        entity_hash,
+        entity_freq,
+        entity_vector,
+        alias_hash,
+        prior_prob
+    ):
        self.kb = kb
        self.entity_hash = entity_hash
        self.entity_freq = entity_freq
@ -56,7 +68,8 @@ cdef class Candidate:

 def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
-    Return candidate entities for a given mention and fetching appropriate entries from the index.
+    Return candidate entities for a given mention and fetching appropriate
+    entries from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention for which to identify candidates.
    RETURNS (Iterable[Candidate]): Identified candidates.
@ -64,9 +77,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    return kb.get_candidates(mention)


-def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+def get_candidates_batch(
+        kb: KnowledgeBase, mentions: Iterable[Span]
+) -> Iterable[Iterable[Candidate]]:
    """
-    Return candidate entities for the given mentions and fetching appropriate entries from the index.
+    Return candidate entities for the given mentions and fetching appropriate entries
+    from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Iterable[Span]): Entity mentions for which to identify candidates.
    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
--- a/spacy/kb/kb.pxd
+++ b/spacy/kb/kb.pxd
@ -2,8 +2,10 @@

 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t
+
 from ..vocab cimport Vocab

+
 cdef class KnowledgeBase:
    cdef Pool mem
    cdef readonly Vocab vocab
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@ -2,17 +2,19 @@

 from pathlib import Path
 from typing import Iterable, Tuple, Union
+
 from cymem.cymem cimport Pool

-from .candidate import Candidate
+from ..errors import Errors
 from ..tokens import Span
 from ..util import SimpleFrozenList
-from ..errors import Errors
+from .candidate import Candidate


 cdef class KnowledgeBase:
-    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
-    to support entity linking of named entities to real-world concepts.
+    """A `KnowledgeBase` instance stores unique identifiers for entities and
+    their textual aliases, to support entity linking of named entities to
+    real-world concepts.
    This is an abstract class and requires its operations to be implemented.

    DOCS: https://spacy.io/api/kb
@ -30,10 +32,13 @@ cdef class KnowledgeBase:
        self.entity_vector_length = entity_vector_length
        self.mem = Pool()

-    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+    def get_candidates_batch(
+        self, mentions: Iterable[Span]
+    ) -> Iterable[Iterable[Candidate]]:
        """
-        Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
+        Return candidate entities for specified texts. Each candidate defines
+        the entity, the original alias, and the prior probability of that
+        alias resolving to that entity.
        If no candidate is found for a given text, an empty list is returned.
        mentions (Iterable[Span]): Mentions for which to get candidates.
        RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
@ -42,14 +47,17 @@ cdef class KnowledgeBase:

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        """
-        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
+        Return candidate entities for specified text. Each candidate defines
+        the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If the no candidate is found for a given text, an empty list is returned.
        mention (Span): Mention for which to get candidates.
        RETURNS (Iterable[Candidate]): Identified candidates.
        """
        raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="get_candidates", name=self.__name__
+            )
        )

    def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
@ -67,7 +75,9 @@ cdef class KnowledgeBase:
        RETURNS (Iterable[float]): Vector for specified entity.
        """
        raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="get_vector", name=self.__name__
+            )
        )

    def to_bytes(self, **kwargs) -> bytes:
@ -75,7 +85,9 @@ cdef class KnowledgeBase:
        RETURNS (bytes): Current state as binary string.
        """
        raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="to_bytes", name=self.__name__
+            )
        )

    def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
@ -84,25 +96,35 @@ cdef class KnowledgeBase:
        exclude (Tuple[str]): Properties to exclude when restoring KB.
        """
        raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="from_bytes", name=self.__name__
+            )
        )

-    def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+    def to_disk(
+            self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+    ) -> None:
        """
        Write KnowledgeBase content to disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        """
        raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="to_disk", name=self.__name__
+            )
        )

-    def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+    def from_disk(
+            self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+    ) -> None:
        """
        Load KnowledgeBase content from disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        """
        raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="from_disk", name=self.__name__
+            )
        )
--- a/spacy/kb/kb_in_memory.pxd
+++ b/spacy/kb/kb_in_memory.pxd
@ -1,11 +1,11 @@
 """Knowledge-base for entity or concept linking."""
-from preshed.maps cimport PreshMap
-from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap

+from ..structs cimport AliasC, KBEntryC
 from ..typedefs cimport hash_t
-from ..structs cimport KBEntryC, AliasC
 from .kb cimport KnowledgeBase

 ctypedef vector[KBEntryC] entry_vec
@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
    # optional data, we can let users configure a DB as the backend for this.
    cdef object _features_table

-
    cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
        """Add an entity vector to the vectors table."""
        cdef int64_t new_index = self._vectors_table.size()
        self._vectors_table.push_back(entity_vector)
        return new_index

-
-    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
-                                     int32_t vector_index, int feats_row) nogil:
+    cdef inline int64_t c_add_entity(
+        self,
+        hash_t entity_hash,
+        float freq,
+        int32_t vector_index,
+        int feats_row
+    ) nogil:
        """Add an entry to the vector of entries.
-        After calling this method, make sure to update also the _entry_index using the return value"""
+        After calling this method, make sure to update also the _entry_index
+        using the return value"""
        # This is what we'll map the entity hash key to. It's where the entry will sit
        # in the vector of entries, so we can get it later.
        cdef int64_t new_index = self._entries.size()

-        # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
+        # Avoid struct initializer to enable nogil, cf.
+        # https://github.com/cython/cython/issues/1642
        cdef KBEntryC entry
        entry.entity_hash = entity_hash
        entry.vector_index = vector_index
@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        self._entries.push_back(entry)
        return new_index

-    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
-        """Connect a mention to a list of potential entities with their prior probabilities .
-        After calling this method, make sure to update also the _alias_index using the return value"""
-        # This is what we'll map the alias hash key to. It's where the alias will be defined
-        # in the vector of aliases.
+    cdef inline int64_t c_add_aliases(
+        self,
+        hash_t alias_hash,
+        vector[int64_t] entry_indices,
+        vector[float] probs
+    ) nogil:
+        """Connect a mention to a list of potential entities with their prior
+        probabilities. After calling this method, make sure to update also the
+        _alias_index using the return value"""
+        # This is what we'll map the alias hash key to. It's where the alias will be
+        # defined in the vector of aliases.
        cdef int64_t new_index = self._aliases_table.size()

        # Avoid struct initializer to enable nogil
@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):

    cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
        """
-        Initializing the vectors and making sure the first element of each vector is a dummy,
-        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
+        Initializing the vectors and making sure the first element of each vector is a
+        dummy, because the PreshMap maps pointing to indices in these vectors can not
+        contain 0 as value.
        cf. https://github.com/explosion/preshed/issues/17
        """
        cdef int32_t dummy_value = 0
@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 cdef class Writer:
    cdef FILE* _fp

-    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
+    cdef int write_header(
+        self, int64_t nr_entries, int64_t entity_vector_length
+    ) except -1
    cdef int write_vector_element(self, float element) except -1
-    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
+    cdef int write_entry(
+        self, hash_t entry_hash, float entry_freq, int32_t vector_index
+    ) except -1

    cdef int write_alias_length(self, int64_t alias_length) except -1
-    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
+    cdef int write_alias_header(
+        self, hash_t alias_hash, int64_t candidate_length
+    ) except -1
    cdef int write_alias(self, int64_t entry_index, float prob) except -1

    cdef int _write(self, void* value, size_t size) except -1
@ -143,12 +161,18 @@ cdef class Writer:
 cdef class Reader:
    cdef FILE* _fp

-    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
+    cdef int read_header(
+        self, int64_t* nr_entries, int64_t* entity_vector_length
+    ) except -1
    cdef int read_vector_element(self, float* element) except -1
-    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
+    cdef int read_entry(
+        self, hash_t* entity_hash, float* freq, int32_t* vector_index
+    ) except -1

    cdef int read_alias_length(self, int64_t* alias_length) except -1
-    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
+    cdef int read_alias_header(
+        self, hash_t* alias_hash, int64_t* candidate_length
+    ) except -1
    cdef int read_alias(self, int64_t* entry_index, float* prob) except -1

    cdef int _read(self, void* value, size_t size) except -1
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@ -1,29 +1,35 @@
 # cython: infer_types=True, profile=True
-from typing import Iterable, Callable, Dict, Any, Union
+from typing import Any, Callable, Dict, Iterable

 import srsly
-from preshed.maps cimport PreshMap
-from cpython.exc cimport PyErr_SetFromErrno
-from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
-from libc.stdint cimport int32_t, int64_t
-from libcpp.vector cimport vector

-from pathlib import Path
+from cpython.exc cimport PyErr_SetFromErrno
+from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
+
 import warnings
+from pathlib import Path

 from ..tokens import Span
+
 from ..typedefs cimport hash_t
-from ..errors import Errors, Warnings
+
 from .. import util
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, ensure_path
+
 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
+
 from .candidate import Candidate as Candidate


 cdef class InMemoryLookupKB(KnowledgeBase):
-    """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
-    to support entity linking of named entities to real-world concepts.
+    """An `InMemoryLookupKB` instance stores unique identifiers for entities
+    and their textual aliases, to support entity linking of named entities to
+    real-world concepts.

    DOCS: https://spacy.io/api/inmemorylookupkb
    """
@ -66,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):

    def add_entity(self, str entity, float freq, vector[float] entity_vector):
        """
-        Add an entity to the KB, optionally specifying its log probability based on corpus frequency
+        Add an entity to the KB, optionally specifying its log probability
+        based on corpus frequency.
        Return the hash of the entity ID/name at the end.
        """
        cdef hash_t entity_hash = self.vocab.strings.add(entity)
@ -78,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):

        # Raise an error if the provided entity vector is not of the correct length
        if len(entity_vector) != self.entity_vector_length:
-            raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+            raise ValueError(
+                Errors.E141.format(
+                    found=len(entity_vector), required=self.entity_vector_length
+                )
+            )

        vector_index = self.c_add_vector(entity_vector=entity_vector)

-        new_index = self.c_add_entity(entity_hash=entity_hash,
-                                      freq=freq,
-                                      vector_index=vector_index,
-                                      feats_row=-1)  # Features table currently not implemented
+        new_index = self.c_add_entity(
+            entity_hash=entity_hash,
+            freq=freq,
+            vector_index=vector_index,
+            feats_row=-1
+        )  # Features table currently not implemented
        self._entry_index[entity_hash] = new_index

        return entity_hash
@ -110,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
            else:
                entity_vector = vector_list[i]
                if len(entity_vector) != self.entity_vector_length:
-                    raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+                    raise ValueError(
+                        Errors.E141.format(
+                            found=len(entity_vector),
+                            required=self.entity_vector_length
+                        )
+                    )

                entry.entity_hash = entity_hash
                entry.freq = freq_list[i]
@ -144,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        previous_alias_nr = self.get_size_aliases()
        # Throw an error if the length of entities and probabilities are not the same
        if not len(entities) == len(probabilities):
-            raise ValueError(Errors.E132.format(alias=alias,
-                                                entities_length=len(entities),
-                                                probabilities_length=len(probabilities)))
+            raise ValueError(
+                Errors.E132.format(
+                    alias=alias,
+                    entities_length=len(entities),
+                    probabilities_length=len(probabilities))
+            )

-        # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
+        # Throw an error if the probabilities sum up to more than 1 (allow for
+        # some rounding errors)
        prob_sum = sum(probabilities)
        if prob_sum > 1.00001:
            raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
@ -165,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):

        for entity, prob in zip(entities, probabilities):
            entity_hash = self.vocab.strings[entity]
-            if not entity_hash in self._entry_index:
+            if entity_hash not in self._entry_index:
                raise ValueError(Errors.E134.format(entity=entity))

            entry_index = <int64_t>self._entry_index.get(entity_hash)
            entry_indices.push_back(int(entry_index))
            probs.push_back(float(prob))

-        new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
+        new_index = self.c_add_aliases(
+            alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
+        )
        self._alias_index[alias_hash] = new_index

        if previous_alias_nr + 1 != self.get_size_aliases():
            raise RuntimeError(Errors.E891.format(alias=alias))
        return alias_hash

-    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
+    def append_alias(
+        self, str alias, str entity, float prior_prob, ignore_warnings=False
+    ):
        """
-        For an alias already existing in the KB, extend its potential entities with one more.
+        For an alias already existing in the KB, extend its potential entities
+        with one more.
        Throw a warning if either the alias or the entity is unknown,
        or when the combination is already previously recorded.
        Throw an error if this entity+prior prob would exceed the sum of 1.
-        For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
+        For efficiency, it's best to use the method `add_alias` as much as
+        possible instead of this one.
        """
        # Check if the alias exists in the KB
        cdef hash_t alias_hash = self.vocab.strings[alias]
-        if not alias_hash in self._alias_index:
+        if alias_hash not in self._alias_index:
            raise ValueError(Errors.E176.format(alias=alias))

        # Check if the entity exists in the KB
        cdef hash_t entity_hash = self.vocab.strings[entity]
-        if not entity_hash in self._entry_index:
+        if entity_hash not in self._entry_index:
            raise ValueError(Errors.E134.format(entity=entity))
        entry_index = <int64_t>self._entry_index.get(entity_hash)

-        # Throw an error if the prior probabilities (including the new one) sum up to more than 1
+        # Throw an error if the prior probabilities (including the new one)
+        # sum up to more than 1
        alias_index = <int64_t>self._alias_index.get(alias_hash)
        alias_entry = self._aliases_table[alias_index]
        current_sum = sum([p for p in alias_entry.probs])
@ -231,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):

    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
        """
-        Return candidate entities for an alias. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
+        Return candidate entities for an alias. Each candidate defines the
+        entity, the original alias, and the prior probability of that alias
+        resolving to that entity.
        If the alias is not known in the KB, and empty list is returned.
        """
        cdef hash_t alias_hash = self.vocab.strings[alias]
-        if not alias_hash in self._alias_index:
+        if alias_hash not in self._alias_index:
            return []
        alias_index = <int64_t>self._alias_index.get(alias_hash)
        alias_entry = self._aliases_table[alias_index]
@ -244,10 +274,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        return [Candidate(kb=self,
                          entity_hash=self._entries[entry_index].entity_hash,
                          entity_freq=self._entries[entry_index].freq,
-                          entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+                          entity_vector=self._vectors_table[
+                              self._entries[entry_index].vector_index
+                          ],
                          alias_hash=alias_hash,
                          prior_prob=prior_prob)
-                for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+                for (entry_index, prior_prob) in zip(
+                    alias_entry.entry_indices, alias_entry.probs
+                )
                if entry_index != 0]

    def get_vector(self, str entity):
@ -261,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        return self._vectors_table[self._entries[entry_index].vector_index]

    def get_prior_prob(self, str entity, str alias):
-        """ Return the prior probability of a given alias being linked to a given entity,
-        or return 0.0 when this combination is not known in the knowledge base"""
+        """ Return the prior probability of a given alias being linked to a
+        given entity, or return 0.0 when this combination is not known in the
+        knowledge base."""
        cdef hash_t alias_hash = self.vocab.strings[alias]
        cdef hash_t entity_hash = self.vocab.strings[entity]

@ -273,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        entry_index = self._entry_index[entity_hash]

        alias_entry = self._aliases_table[alias_index]
-        for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
+        for (entry_index, prior_prob) in zip(
+            alias_entry.entry_indices, alias_entry.probs
+        ):
            if self._entries[entry_index].entity_hash == entity_hash:
                return prior_prob

@ -283,13 +320,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        """Serialize the current state to a binary string.
        """
        def serialize_header():
-            header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
+            header = (
+                self.get_size_entities(),
+                self.get_size_aliases(),
+                self.entity_vector_length
+            )
            return srsly.json_dumps(header)

        def serialize_entries():
            i = 1
            tuples = []
-            for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+            for entry_hash, entry_index in sorted(
+                self._entry_index.items(), key=lambda x: x[1]
+            ):
                entry = self._entries[entry_index]
                assert entry.entity_hash == entry_hash
                assert entry_index == i
@ -302,7 +345,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
            headers = []
            indices_lists = []
            probs_lists = []
-            for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+            for alias_hash, alias_index in sorted(
+                self._alias_index.items(), key=lambda x: x[1]
+            ):
                alias = self._aliases_table[alias_index]
                assert alias_index == i
                candidate_length = len(alias.entry_indices)
@ -360,7 +405,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
            indices = srsly.json_loads(all_data[1])
            probs = srsly.json_loads(all_data[2])
            for header, indices, probs in zip(headers, indices, probs):
-                alias_hash, candidate_length = header
+                alias_hash, _candidate_length = header
                alias.entry_indices = indices
                alias.probs = probs
                self._aliases_table[i] = alias
@ -409,10 +454,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
                writer.write_vector_element(element)
            i = i+1

-        # dumping the entry records in the order in which they are in the _entries vector.
-        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        # dumping the entry records in the order in which they are in the
+        # _entries vector.
+        # index 0 is a dummy object not stored in the _entry_index and can
+        # be ignored.
        i = 1
-        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+        for entry_hash, entry_index in sorted(
+            self._entry_index.items(), key=lambda x: x[1]
+        ):
            entry = self._entries[entry_index]
            assert entry.entity_hash == entry_hash
            assert entry_index == i
@ -424,7 +473,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        # dumping the aliases in the order in which they are in the _alias_index vector.
        # index 0 is a dummy object not stored in the _aliases_table and can be ignored.
        i = 1
-        for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+        for alias_hash, alias_index in sorted(
+                self._alias_index.items(), key=lambda x: x[1]
+        ):
            alias = self._aliases_table[alias_index]
            assert alias_index == i

@ -530,7 +581,8 @@ cdef class Writer:
    def __init__(self, path):
        assert isinstance(path, Path)
        content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
+        cdef bytes bytes_loc = content.encode('utf8') \
+            if type(content) == str else content
        self._fp = fopen(<char*>bytes_loc, 'wb')
        if not self._fp:
            raise IOError(Errors.E146.format(path=path))
@ -540,14 +592,18 @@ cdef class Writer:
        cdef size_t status = fclose(self._fp)
        assert status == 0

-    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
+    cdef int write_header(
+        self, int64_t nr_entries, int64_t entity_vector_length
+    ) except -1:
        self._write(&nr_entries, sizeof(nr_entries))
        self._write(&entity_vector_length, sizeof(entity_vector_length))

    cdef int write_vector_element(self, float element) except -1:
        self._write(&element, sizeof(element))

-    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
+    cdef int write_entry(
+        self, hash_t entry_hash, float entry_freq, int32_t vector_index
+    ) except -1:
        self._write(&entry_hash, sizeof(entry_hash))
        self._write(&entry_freq, sizeof(entry_freq))
        self._write(&vector_index, sizeof(vector_index))
@ -556,7 +612,9 @@ cdef class Writer:
    cdef int write_alias_length(self, int64_t alias_length) except -1:
        self._write(&alias_length, sizeof(alias_length))

-    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
+    cdef int write_alias_header(
+        self, hash_t alias_hash, int64_t candidate_length
+    ) except -1:
        self._write(&alias_hash, sizeof(alias_hash))
        self._write(&candidate_length, sizeof(candidate_length))

@ -572,16 +630,19 @@ cdef class Writer:
 cdef class Reader:
    def __init__(self, path):
        content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
+        cdef bytes bytes_loc = content.encode('utf8') \
+            if type(content) == str else content
        self._fp = fopen(<char*>bytes_loc, 'rb')
        if not self._fp:
            PyErr_SetFromErrno(IOError)
-        status = fseek(self._fp, 0, 0)  # this can be 0 if there is no header
+        fseek(self._fp, 0, 0)  # this can be 0 if there is no header

    def __dealloc__(self):
        fclose(self._fp)

-    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
+    cdef int read_header(
+        self, int64_t* nr_entries, int64_t* entity_vector_length
+    ) except -1:
        status = self._read(nr_entries, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
@ -601,7 +662,9 @@ cdef class Reader:
                return 0  # end of file
            raise IOError(Errors.E145.format(param="vector element"))

-    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
+    cdef int read_entry(
+        self, hash_t* entity_hash, float* freq, int32_t* vector_index
+    ) except -1:
        status = self._read(entity_hash, sizeof(hash_t))
        if status < 1:
            if feof(self._fp):
@ -632,7 +695,9 @@ cdef class Reader:
                return 0  # end of file
            raise IOError(Errors.E145.format(param="alias length"))

-    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
+    cdef int read_alias_header(
+        self, hash_t* alias_hash, int64_t* candidate_length
+    ) except -1:
        status = self._read(alias_hash, sizeof(hash_t))
        if status < 1:
            if feof(self._fp):
--- a/spacy/lang/af/init.py
+++ b/spacy/lang/af/init.py
@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults


 class AfrikaansDefaults(BaseDefaults):
--- a/spacy/lang/am/init.py
+++ b/spacy/lang/am/init.py
@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc


 class AmharicDefaults(BaseDefaults):
--- a/spacy/lang/am/punctuation.py
+++ b/spacy/lang/am/punctuation.py
@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()

--- a/spacy/lang/am/tokenizer_exceptions.py
+++ b/spacy/lang/am/tokenizer_exceptions.py
@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH

 _exc = {}

--- a/spacy/lang/ar/init.py
+++ b/spacy/lang/ar/init.py
@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults


 class ArabicDefaults(BaseDefaults):
--- a/spacy/lang/ar/punctuation.py
+++ b/spacy/lang/ar/punctuation.py
@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _suffixes = (
    LIST_PUNCT
--- a/spacy/lang/ar/tokenizer_exceptions.py
+++ b/spacy/lang/ar/tokenizer_exceptions.py
@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

--- a/spacy/lang/az/init.py
+++ b/spacy/lang/az/init.py
@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS


 class AzerbaijaniDefaults(BaseDefaults):
--- a/spacy/lang/az/lex_attrs.py
+++ b/spacy/lang/az/lex_attrs.py
@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 # Eleven, twelve etc. are written separate: on bir, on iki

 _num_words = [
--- a/spacy/lang/bg/init.py
+++ b/spacy/lang/bg/init.py
@ -1,12 +1,14 @@
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..punctuation import (
+    COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+    COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc


 class BulgarianDefaults(BaseDefaults):
--- a/spacy/lang/bg/lex_attrs.py
+++ b/spacy/lang/bg/lex_attrs.py
@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 _num_words = [
    "нула",
    "едно",
--- a/spacy/lang/bg/tokenizer_exceptions.py
+++ b/spacy/lang/bg/tokenizer_exceptions.py
@ -4,8 +4,7 @@ References:
    (countries, occupations, fields of studies and more).
 """

-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH

 _exc = {}

--- a/spacy/lang/bn/init.py
+++ b/spacy/lang/bn/init.py
@ -1,10 +1,12 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
 from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class BengaliDefaults(BaseDefaults):
--- a/spacy/lang/bn/punctuation.py
+++ b/spacy/lang/bn/punctuation.py
@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
-
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _currency = r"\$¢£€¥฿৳"
 _quotes = CONCAT_QUOTES.replace("'", "")
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

--- a/spacy/lang/ca/init.py
+++ b/spacy/lang/ca/init.py
@ -1,14 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
 from .lemmatizer import CatalanLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class CatalanDefaults(BaseDefaults):
--- a/spacy/lang/ca/lex_attrs.py
+++ b/spacy/lang/ca/lex_attrs.py
@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 _num_words = [
    "zero",
    "un",
--- a/spacy/lang/ca/punctuation.py
+++ b/spacy/lang/ca/punctuation.py
@ -1,9 +1,18 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import LIST_CURRENCY
-from ..char_classes import CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-from ..char_classes import merge_chars, _units
-
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    _units,
+    merge_chars,
+)

 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")

--- a/spacy/lang/ca/syntax_iterators.py
+++ b/spacy/lang/ca/syntax_iterators.py
@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import NOUN, PROPN
+from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
--- a/spacy/lang/ca/tokenizer_exceptions.py
+++ b/spacy/lang/ca/tokenizer_exceptions.py
@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

--- a/spacy/lang/cs/init.py
+++ b/spacy/lang/cs/init.py
@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS


 class CzechDefaults(BaseDefaults):
--- a/spacy/lang/da/init.py
+++ b/spacy/lang/da/init.py
@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class DanishDefaults(BaseDefaults):
--- a/spacy/lang/da/lex_attrs.py
+++ b/spacy/lang/da/lex_attrs.py
@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 # Source http://fjern-uv.dk/tal.php
 _num_words = """nul
 en et to tre fire fem seks syv otte ni ti
--- a/spacy/lang/da/punctuation.py
+++ b/spacy/lang/da/punctuation.py
@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)
 from ..punctuation import TOKENIZER_SUFFIXES

-
 _quotes = CONCAT_QUOTES.replace("'", "")

 _infixes = (
--- a/spacy/lang/da/syntax_iterators.py
+++ b/spacy/lang/da/syntax_iterators.py
@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@ -2,10 +2,9 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

--- a/spacy/lang/de/init.py
+++ b/spacy/lang/de/init.py
@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class GermanDefaults(BaseDefaults):
--- a/spacy/lang/de/punctuation.py
+++ b/spacy/lang/de/punctuation.py
@ -1,9 +1,18 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    UNITS,
+)
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES

-
 _prefixes = ["``"] + BASE_TOKENIZER_PREFIXES

 _suffixes = (
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span


--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {
    "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
--- a/spacy/lang/dsb/__init__.py
+++ b/spacy/lang/dsb/__init__.py
@ -1,6 +1,6 @@
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults


 class LowerSorbianDefaults(BaseDefaults):
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
 from .lemmatizer import GreekLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class GreekDefaults(BaseDefaults):
--- a/spacy/lang/el/get_pos_from_wiktionary.py
+++ b/spacy/lang/el/get_pos_from_wiktionary.py
@ -1,5 +1,6 @@
 def get_pos_from_wiktionary():
    import re
+
    from gensim.corpora.wikicorpus import extract_pages

    regex = re.compile(r"==={{(\w+)\|el}}===")
--- a/spacy/lang/el/punctuation.py
+++ b/spacy/lang/el/punctuation.py
@ -1,6 +1,16 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES, CURRENCY
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    HYPHENS,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+)

 _units = (
    "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span


--- a/spacy/lang/el/tokenizer_exceptions.py
+++ b/spacy/lang/el/tokenizer_exceptions.py
@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
 from .lemmatizer import EnglishLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class EnglishDefaults(BaseDefaults):
--- a/spacy/lang/en/punctuation.py
+++ b/spacy/lang/en/punctuation.py
@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)

 _infixes = (
    LIST_ELLIPSES
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span


--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@ -1,8 +1,8 @@
 from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc

+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc: Dict[str, List[Dict]] = {}
 _exclude = [
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+
+from ...language import BaseDefaults, Language
 from .lemmatizer import SpanishLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class SpanishDefaults(BaseDefaults):
--- a/spacy/lang/es/lemmatizer.py
+++ b/spacy/lang/es/lemmatizer.py
@ -1,5 +1,5 @@
-from typing import List, Optional, Tuple
 import re
+from typing import List, Optional, Tuple

 from ...pipeline import Lemmatizer
 from ...tokens import Token
--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 _num_words = [
    "cero",
    "uno",
--- a/Show More
+++ b/Show More