| **spaCy Tailored Pipelines** | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
| **spaCy Tailored Analysis** | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
@@ -57,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
[universe]: https://spacy.io/universe
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI
[online course]: https://course.spacy.io
[project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
-
## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
deleted file mode 100644
index 83c57a164..000000000
--- a/azure-pipelines.yml
+++ /dev/null
@@ -1,120 +0,0 @@
-trigger:
- batch: true
- branches:
- include:
- - "*"
- exclude:
- - "spacy.io"
- - "nightly.spacy.io"
- - "v2.spacy.io"
- paths:
- exclude:
- - "website/*"
- - "*.md"
- - "*.mdx"
- - ".github/workflows/*"
-pr:
- paths:
- exclude:
- - "*.md"
- - "*.mdx"
- - "website/docs/*"
- - "website/src/*"
- - "website/meta/*.tsx"
- - "website/meta/*.mjs"
- - "website/meta/languages.json"
- - "website/meta/site.json"
- - "website/meta/sidebars.json"
- - "website/meta/type-annotations.json"
- - "website/pages/*"
- - ".github/workflows/*"
-
-jobs:
- # Check formatting and linting. Perform basic checks for most important errors
- # (syntax etc.) Uses the config defined in setup.cfg and overwrites the
- # selected codes.
- - job: "Validate"
- pool:
- vmImage: "ubuntu-latest"
- steps:
- - task: UsePythonVersion@0
- inputs:
- versionSpec: "3.7"
- - script: |
- pip install black -c requirements.txt
- python -m black spacy --check
- displayName: "black"
- - script: |
- pip install flake8==5.0.4
- python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
- displayName: "flake8"
- - script: |
- python .github/validate_universe_json.py website/meta/universe.json
- displayName: 'Validate website/meta/universe.json'
-
- - job: "Test"
- dependsOn: "Validate"
- strategy:
- matrix:
- # We're only running one platform per Python version to speed up builds
- Python36Linux:
- imageName: "ubuntu-20.04"
- python.version: "3.6"
- # Python36Windows:
- # imageName: "windows-latest"
- # python.version: "3.6"
- # Python36Mac:
- # imageName: "macos-latest"
- # python.version: "3.6"
- # Python37Linux:
- # imageName: "ubuntu-20.04"
- # python.version: "3.7"
- Python37Windows:
- imageName: "windows-latest"
- python.version: "3.7"
- # Python37Mac:
- # imageName: "macos-latest"
- # python.version: "3.7"
- # Python38Linux:
- # imageName: "ubuntu-latest"
- # python.version: "3.8"
- # Python38Windows:
- # imageName: "windows-latest"
- # python.version: "3.8"
- Python38Mac:
- imageName: "macos-latest"
- python.version: "3.8"
- Python39Linux:
- imageName: "ubuntu-latest"
- python.version: "3.9"
- # Python39Windows:
- # imageName: "windows-latest"
- # python.version: "3.9"
- # Python39Mac:
- # imageName: "macos-latest"
- # python.version: "3.9"
- # Python310Linux:
- # imageName: "ubuntu-latest"
- # python.version: "3.10"
- Python310Windows:
- imageName: "windows-latest"
- python.version: "3.10"
- # Python310Mac:
- # imageName: "macos-latest"
- # python.version: "3.10"
- Python311Linux:
- imageName: 'ubuntu-latest'
- python.version: '3.11'
- Python311Windows:
- imageName: 'windows-latest'
- python.version: '3.11'
- Python311Mac:
- imageName: 'macos-latest'
- python.version: '3.11'
- maxParallel: 4
- pool:
- vmImage: $(imageName)
- steps:
- - template: .github/azure-steps.yml
- parameters:
- python_version: '$(python.version)'
diff --git a/pyproject.toml b/pyproject.toml
index 9cd96ac2d..dcb5cf10d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,3 +9,6 @@ requires = [
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
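The new `[tool.isort]` block explains the large import-reordering hunks throughout this diff: with the `black` profile, isort groups imports into stdlib, third-party, and first-party sections and formats long `from`-imports with Black-compatible parentheses. A minimal sketch of the resulting style, with module names taken from the `spacy/__init__.py` hunk below:

```python
# Standard library imports first, alphabetized.
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, Union

# Third-party packages next.
from thinc.api import Config, prefer_gpu  # noqa: F401

# First-party (relative) imports last; long lists get one name per line with
# trailing commas, e.g. (shown commented, since relative imports need a package):
# from ._util import (
#     Arg,
#     Opt,
#     app,
# )
```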
diff --git a/requirements.txt b/requirements.txt
index 94d6f23f4..f5050fee2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
+typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
weasel>=0.1.0,<0.2.0
@@ -39,3 +39,4 @@ types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black==22.3.0
+isort>=5.0,<6.0
diff --git a/setup.cfg b/setup.cfg
index 6d3c2f12c..048bb3719 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -53,7 +53,7 @@ install_requires =
catalogue>=2.0.6,<2.1.0
weasel>=0.1.0,<0.2.0
# Third-party dependencies
- typer>=0.3.0,<0.8.0
+ typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
@@ -79,41 +79,41 @@ transformers =
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
- cupy>=5.0.0b4,<12.0.0
+ cupy>=5.0.0b4,<13.0.0
cuda80 =
- cupy-cuda80>=5.0.0b4,<12.0.0
+ cupy-cuda80>=5.0.0b4,<13.0.0
cuda90 =
- cupy-cuda90>=5.0.0b4,<12.0.0
+ cupy-cuda90>=5.0.0b4,<13.0.0
cuda91 =
- cupy-cuda91>=5.0.0b4,<12.0.0
+ cupy-cuda91>=5.0.0b4,<13.0.0
cuda92 =
- cupy-cuda92>=5.0.0b4,<12.0.0
+ cupy-cuda92>=5.0.0b4,<13.0.0
cuda100 =
- cupy-cuda100>=5.0.0b4,<12.0.0
+ cupy-cuda100>=5.0.0b4,<13.0.0
cuda101 =
- cupy-cuda101>=5.0.0b4,<12.0.0
+ cupy-cuda101>=5.0.0b4,<13.0.0
cuda102 =
- cupy-cuda102>=5.0.0b4,<12.0.0
+ cupy-cuda102>=5.0.0b4,<13.0.0
cuda110 =
- cupy-cuda110>=5.0.0b4,<12.0.0
+ cupy-cuda110>=5.0.0b4,<13.0.0
cuda111 =
- cupy-cuda111>=5.0.0b4,<12.0.0
+ cupy-cuda111>=5.0.0b4,<13.0.0
cuda112 =
- cupy-cuda112>=5.0.0b4,<12.0.0
+ cupy-cuda112>=5.0.0b4,<13.0.0
cuda113 =
- cupy-cuda113>=5.0.0b4,<12.0.0
+ cupy-cuda113>=5.0.0b4,<13.0.0
cuda114 =
- cupy-cuda114>=5.0.0b4,<12.0.0
+ cupy-cuda114>=5.0.0b4,<13.0.0
cuda115 =
- cupy-cuda115>=5.0.0b4,<12.0.0
+ cupy-cuda115>=5.0.0b4,<13.0.0
cuda116 =
- cupy-cuda116>=5.0.0b4,<12.0.0
+ cupy-cuda116>=5.0.0b4,<13.0.0
cuda117 =
- cupy-cuda117>=5.0.0b4,<12.0.0
+ cupy-cuda117>=5.0.0b4,<13.0.0
cuda11x =
- cupy-cuda11x>=11.0.0,<12.0.0
+ cupy-cuda11x>=11.0.0,<13.0.0
cuda-autodetect =
- cupy-wheel>=11.0.0,<12.0.0
+ cupy-wheel>=11.0.0,<13.0.0
apple =
thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies
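The widened CuPy pins mean the CUDA extras (e.g. `pip install "spacy[cuda11x]"`) now accept CuPy 12.x. A quick sanity check after upgrading, assuming a CUDA build of CuPy is installed:

```python
import cupy
from thinc.api import prefer_gpu

print(cupy.__version__)  # anything >=11.0.0,<13.0.0 satisfies the new pin
print(prefer_gpu())      # True if thinc can allocate on the GPU
```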
diff --git a/spacy/__init__.py b/spacy/__init__.py
index c3568bc5c..1a18ad0d5 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,6 +1,6 @@
-from typing import Union, Iterable, Dict, Any
-from pathlib import Path
import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Union
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
@@ -8,20 +8,17 @@ from .errors import setup_default_warnings
setup_default_warnings() # noqa: E402
# These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
-from thinc.api import Config
+from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
from . import pipeline # noqa: F401
-from .cli.info import info # noqa: F401
-from .glossary import explain # noqa: F401
-from .about import __version__ # noqa: F401
-from .util import registry, logger # noqa: F401
-
-from .errors import Errors
-from .language import Language
-from .vocab import Vocab
from . import util
-
+from .about import __version__ # noqa: F401
+from .cli.info import info # noqa: F401
+from .errors import Errors
+from .glossary import explain # noqa: F401
+from .language import Language
+from .util import logger, registry # noqa: F401
+from .vocab import Vocab
if sys.maxunicode == 65535:
raise SystemError(Errors.E130)
diff --git a/spacy/about.py b/spacy/about.py
index 4748d655c..3319860f1 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,5 +1,5 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.5.0"
+__version__ = "3.6.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
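The version bump marks this as the v3.6.0 release. After upgrading, a quick check that the expected version is installed:

```python
import spacy

print(spacy.__version__)      # expected: "3.6.0"
print(spacy.about.__title__)  # "spacy"
```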
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index 33d5372de..6dc9ecaee 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -1,6 +1,7 @@
# Reserve 64 values for flag features
from . cimport symbols
+
cdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index efabcb9cf..549a27616 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -1,28 +1,35 @@
from wasabi import msg
from ._util import app, setup_cli # noqa: F401
+from .apply import apply # noqa: F401
+from .assemble import assemble_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .benchmark_speed import benchmark_speed_cli # noqa: F401
-from .download import download # noqa: F401
-from .info import info # noqa: F401
-from .package import package # noqa: F401
-from .profile import profile # noqa: F401
-from .train import train_cli # noqa: F401
-from .assemble import assemble_cli # noqa: F401
-from .pretrain import pretrain # noqa: F401
-from .debug_data import debug_data # noqa: F401
-from .debug_config import debug_config # noqa: F401
-from .debug_model import debug_model # noqa: F401
-from .debug_diff import debug_diff # noqa: F401
-from .evaluate import evaluate # noqa: F401
-from .apply import apply # noqa: F401
from .convert import convert # noqa: F401
-from .init_pipeline import init_pipeline_cli # noqa: F401
-from .init_config import init_config, fill_config # noqa: F401
-from .validate import validate # noqa: F401
+from .debug_config import debug_config # noqa: F401
+from .debug_data import debug_data # noqa: F401
+from .debug_diff import debug_diff # noqa: F401
+from .debug_model import debug_model # noqa: F401
+from .download import download # noqa: F401
+from .evaluate import evaluate # noqa: F401
from .find_threshold import find_threshold # noqa: F401
+from .info import info # noqa: F401
+from .init_config import fill_config, init_config # noqa: F401
+from .init_pipeline import init_pipeline_cli # noqa: F401
+from .package import package # noqa: F401
+from .pretrain import pretrain # noqa: F401
+from .profile import profile # noqa: F401
+from .project.assets import project_assets # noqa: F401
+from .project.clone import project_clone # noqa: F401
+from .project.document import project_document # noqa: F401
+from .project.dvc import project_update_dvc # noqa: F401
+from .project.pull import project_pull # noqa: F401
+from .project.push import project_push # noqa: F401
+from .project.run import project_run # noqa: F401
+from .train import train_cli # noqa: F401
+from .validate import validate # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 4e8102e3d..b48e928f5 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,25 +1,45 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
import hashlib
+import os
+import shutil
+import sys
+from configparser import InterpolationError
+from contextlib import contextmanager
+from pathlib import Path
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Tuple,
+ Union,
+ overload,
+)
+
+import srsly
import typer
from click import NoSuchOption
from click.parser import split_arg_string
-from typer.main import get_command
-from contextlib import contextmanager
-from thinc.api import ConfigValidationError, require_gpu
+from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import gpu_is_available
-from configparser import InterpolationError
-import os
-
+from typer.main import get_command
+from wasabi import Printer, msg
from weasel import app as project_cli
+from .. import about
from ..compat import Literal
-from ..util import import_file, run_command, registry, logger, ENV_VARS
+from ..schemas import ProjectConfigSchema, validate
+from ..util import (
+ ENV_VARS,
+ SimpleFrozenDict,
+ import_file,
+ is_compatible_version,
+ logger,
+ make_tempdir,
+ registry,
+ run_command,
+)
if TYPE_CHECKING:
from pathy import FluidPath # noqa: F401
diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py
index f0df4e757..8c4b4c8bf 100644
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@@ -1,18 +1,15 @@
-import tqdm
-import srsly
-
from itertools import chain
from pathlib import Path
-from typing import Optional, List, Iterable, cast, Union
+from typing import Iterable, List, Optional, Union, cast
+import srsly
+import tqdm
from wasabi import msg
-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
-
from ..tokens import Doc, DocBin
-from ..vocab import Vocab
from ..util import ensure_path, load_model
-
+from ..vocab import Vocab
+from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
index 1cfa290a3..ee2500b27 100644
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@@ -1,13 +1,20 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
import logging
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
from .. import util
from ..util import get_sourced_components, load_model_from_config
+from ._util import (
+ Arg,
+ Opt,
+ app,
+ import_code,
+ parse_config_overrides,
+ show_validation_error,
+)
@app.command(
diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py
index 4eb20a5fa..a683d1591 100644
--- a/spacy/cli/benchmark_speed.py
+++ b/spacy/cli/benchmark_speed.py
@@ -1,11 +1,12 @@
-from typing import Iterable, List, Optional
import random
-from itertools import islice
-import numpy
-from pathlib import Path
import time
-from tqdm import tqdm
+from itertools import islice
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+import numpy
import typer
+from tqdm import tqdm
from wasabi import msg
from .. import util
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 68d454b3e..a66a68133 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,18 +1,22 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
-from enum import Enum
-from pathlib import Path
-from wasabi import Printer
-import srsly
+import itertools
import re
import sys
-import itertools
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping, Optional, Union
+
+import srsly
+from wasabi import Printer
-from ._util import app, Arg, Opt, walk_directory
-from ..training import docs_to_json
from ..tokens import Doc, DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
-from ..training.converters import conllu_to_docs
-
+from ..training import docs_to_json
+from ..training.converters import (
+ conll_ner_to_docs,
+ conllu_to_docs,
+ iob_to_docs,
+ json_to_docs,
+)
+from ._util import Arg, Opt, app, walk_directory
# Converters are matched by file extension except for ner/iob, which are
# matched by file extension and content. To add a converter, add a new
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 409fac4ed..0e5382cd9 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -1,15 +1,22 @@
-from typing import Optional, Dict, Any, Union, List
from pathlib import Path
-from wasabi import msg, table
+from typing import Any, Dict, List, Optional, Union
+
+import typer
from thinc.api import Config
from thinc.config import VARIABLE_RE
-import typer
+from wasabi import msg, table
-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from .. import util
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry
-from .. import util
+from ._util import (
+ Arg,
+ Opt,
+ debug_cli,
+ import_code,
+ parse_config_overrides,
+ show_validation_error,
+)
@debug_cli.command(
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 97b4db285..af3c24f3b 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,31 +1,49 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
import math
-import numpy
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import (
+ Any,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Sequence,
+ Set,
+ Tuple,
+ Union,
+ cast,
+ overload,
+)
-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, _format_number
-from ..training import Example, remove_bilu_prefix
-from ..training.initialize import get_sourced_components
-from ..schemas import ConfigSchemaTraining
-from ..pipeline import TrainablePipe
+import numpy
+import srsly
+import typer
+from wasabi import MESSAGES, Printer, msg
+
+from .. import util
+from ..compat import Literal
+from ..language import Language
+from ..morphology import Morphology
+from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer, SpanCategorizer
-from ..pipeline._edit_tree_internals.edit_trees import EditTrees
-from ..morphology import Morphology
-from ..language import Language
+from ..schemas import ConfigSchemaTraining
+from ..training import Example, remove_bilu_prefix
+from ..training.initialize import get_sourced_components
from ..util import registry, resolve_dot_names
-from ..compat import Literal
from ..vectors import Mode as VectorsMode
-from .. import util
-
+from ._util import (
+ Arg,
+ Opt,
+ _format_number,
+ app,
+ debug_cli,
+ import_code,
+ parse_config_overrides,
+ show_validation_error,
+)
# Minimum number of expected occurrences of NER label in data to train new label
NEW_LABEL_THRESHOLD = 50
@@ -212,7 +230,7 @@ def debug_data(
else:
msg.info("No word vectors present in the package")
- if "spancat" in factory_names:
+ if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False
has_no_neg_warning = False
@@ -337,7 +355,7 @@ def debug_data(
show=verbose,
)
else:
- msg.good("Examples without ocurrences available for all labels")
+ msg.good("Examples without occurrences available for all labels")
if "ner" in factory_names:
# Get all unique NER labels present in the data
@@ -830,7 +848,7 @@ def _compile_gold(
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
- if "spancat" in factory_names:
+ if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
for spans_key in list(eg.reference.spans.keys()):
# Obtain the span frequency
if spans_key not in data["spancat"]:
@@ -1028,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
- if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+ if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:
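`debug data` now treats `spancat_singlelabel` pipes as span categorizers as well. A small sketch of the broadened factory check, assuming `spancat_singlelabel` can be added with its default config in this version:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("spancat_singlelabel")

# Mirrors the updated check in _get_labels_from_spancat above:
pipe_names = [
    name
    for name in nlp.pipe_names
    if nlp.get_pipe_meta(name).factory in ("spancat", "spancat_singlelabel")
]
print(pipe_names)  # ["spancat_singlelabel"]
```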
diff --git a/spacy/cli/debug_diff.py b/spacy/cli/debug_diff.py
index 6697c38ae..c53b0acab 100644
--- a/spacy/cli/debug_diff.py
+++ b/spacy/cli/debug_diff.py
@@ -1,13 +1,13 @@
+from pathlib import Path
from typing import Optional
import typer
-from wasabi import Printer, diff_strings, MarkdownRenderer
-from pathlib import Path
from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, diff_strings
-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
-from .init_config import init_config, Optimizations
+from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
+from .init_config import Optimizations, init_config
@debug_cli.command(
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 190094d81..8a0fd4889 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -1,19 +1,32 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
import itertools
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import typer
+from thinc.api import (
+ Model,
+ data_validation,
+ fix_random_seed,
+ set_dropout_rate,
+ set_gpu_allocator,
+)
+from wasabi import msg
from spacy.training import Example
from spacy.util import resolve_dot_names
-from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate
-from thinc.api import Model, data_validation, set_gpu_allocator
-import typer
-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from .. import util
from ..schemas import ConfigSchemaTraining
from ..util import registry
-from .. import util
+from ._util import (
+ Arg,
+ Opt,
+ debug_cli,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+ string_to_list,
+)
@debug_cli.command(
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0c9a32b93..de731b0fd 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -1,14 +1,14 @@
-from typing import Optional, Sequence
-import requests
import sys
-from wasabi import msg
-import typer
+from typing import Optional, Sequence
+
+import requests
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS
+from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
@app.command(
@@ -81,11 +81,8 @@ def download(
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}"
- egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
- if sdist:
- filename += egg_tpl.format(m=model_name, v=version)
return filename
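With the `#egg=` fragment removed, the download filename is just the template plus suffix. A self-contained sketch of the updated function; the suffix constants are assumptions about the values in `spacy/cli/_util.py`:

```python
SDIST_SUFFIX = ".tar.gz"            # assumed value from spacy/cli/_util.py
WHEEL_SUFFIX = "-py3-none-any.whl"  # assumed value from spacy/cli/_util.py

def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
    dl_tpl = "{m}-{v}/{m}-{v}{s}"
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    return dl_tpl.format(m=model_name, v=version, s=suffix)

print(get_model_filename("en_core_web_sm", "3.6.0"))
# -> en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl
```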
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 8f3d6b859..6235b658d 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -1,16 +1,16 @@
-from typing import Optional, List, Dict, Any, Union
-from wasabi import Printer
-from pathlib import Path
import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
import srsly
from thinc.api import fix_random_seed
+from wasabi import Printer
-from ..training import Corpus
-from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
+from .. import displacy, util
from ..scorer import Scorer
-from .. import util
-from .. import displacy
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
@benchmark_cli.command(
@@ -27,6 +27,7 @@ def evaluate_cli(
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+ per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
# fmt: on
):
"""
@@ -50,6 +51,7 @@ def evaluate_cli(
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
+ per_component=per_component,
silent=False,
)
@@ -64,6 +66,7 @@ def evaluate(
displacy_limit: int = 25,
silent: bool = True,
spans_key: str = "sc",
+ per_component: bool = False,
) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed()
@@ -78,50 +81,61 @@ def evaluate(
corpus = Corpus(data_path, gold_preproc=gold_preproc)
nlp = util.load_model(model)
dev_dataset = list(corpus(nlp))
- scores = nlp.evaluate(dev_dataset)
- metrics = {
- "TOK": "token_acc",
- "TAG": "tag_acc",
- "POS": "pos_acc",
- "MORPH": "morph_acc",
- "LEMMA": "lemma_acc",
- "UAS": "dep_uas",
- "LAS": "dep_las",
- "NER P": "ents_p",
- "NER R": "ents_r",
- "NER F": "ents_f",
- "TEXTCAT": "cats_score",
- "SENT P": "sents_p",
- "SENT R": "sents_r",
- "SENT F": "sents_f",
- "SPAN P": f"spans_{spans_key}_p",
- "SPAN R": f"spans_{spans_key}_r",
- "SPAN F": f"spans_{spans_key}_f",
- "SPEED": "speed",
- }
- results = {}
- data = {}
- for metric, key in metrics.items():
- if key in scores:
- if key == "cats_score":
- metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
- if isinstance(scores[key], (int, float)):
- if key == "speed":
- results[metric] = f"{scores[key]:.0f}"
+ scores = nlp.evaluate(dev_dataset, per_component=per_component)
+ if per_component:
+ data = scores
+ if output is None:
+ msg.warn(
+ "The per-component option is enabled but there is no output JSON file provided to save the scores to."
+ )
+ else:
+ msg.info("Per-component scores will be saved to output JSON file.")
+ else:
+ metrics = {
+ "TOK": "token_acc",
+ "TAG": "tag_acc",
+ "POS": "pos_acc",
+ "MORPH": "morph_acc",
+ "LEMMA": "lemma_acc",
+ "UAS": "dep_uas",
+ "LAS": "dep_las",
+ "NER P": "ents_p",
+ "NER R": "ents_r",
+ "NER F": "ents_f",
+ "TEXTCAT": "cats_score",
+ "SENT P": "sents_p",
+ "SENT R": "sents_r",
+ "SENT F": "sents_f",
+ "SPAN P": f"spans_{spans_key}_p",
+ "SPAN R": f"spans_{spans_key}_r",
+ "SPAN F": f"spans_{spans_key}_f",
+ "SPEED": "speed",
+ }
+ results = {}
+ data = {}
+ for metric, key in metrics.items():
+ if key in scores:
+ if key == "cats_score":
+ metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
+ if isinstance(scores[key], (int, float)):
+ if key == "speed":
+ results[metric] = f"{scores[key]:.0f}"
+ else:
+ results[metric] = f"{scores[key]*100:.2f}"
else:
- results[metric] = f"{scores[key]*100:.2f}"
- else:
- results[metric] = "-"
- data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
+ results[metric] = "-"
+ data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
- msg.table(results, title="Results")
- data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
+ msg.table(results, title="Results")
+ data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
render_deps = "parser" in factory_names
render_ents = "ner" in factory_names
+ render_spans = "spancat" in factory_names
+
render_parses(
docs,
displacy_path,
@@ -129,6 +143,7 @@ def evaluate(
limit=displacy_limit,
deps=render_deps,
ents=render_ents,
+ spans=render_spans,
)
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
@@ -182,6 +197,7 @@ def render_parses(
limit: int = 250,
deps: bool = True,
ents: bool = True,
+ spans: bool = True,
):
docs[0].user_data["title"] = model_name
if ents:
@@ -195,6 +211,11 @@ def render_parses(
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
file_.write(html)
+ if spans:
+ html = displacy.render(docs[:limit], style="span", page=True)
+ with (output_path / "spans.html").open("w", encoding="utf8") as file_:
+ file_.write(html)
+
def print_prf_per_type(
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
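The new `--per-component` flag (`-P`) returns scores keyed by pipe name instead of the flat metric table, and is meant to be paired with `--output`. Calling the updated function directly, with hypothetical model and data paths:

```python
from pathlib import Path

from spacy.cli.evaluate import evaluate

scores = evaluate(
    "en_core_web_sm",           # model name or path
    Path("dev.spacy"),          # evaluation data (hypothetical path)
    output=Path("scores.json"),
    per_component=True,         # dict keyed by component instead of flat metrics
)
```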
diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index 6d591053d..7aa32c0c6 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -1,17 +1,17 @@
import functools
+import logging
import operator
from pathlib import Path
-import logging
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import numpy
import wasabi.tables
-from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
-from ..errors import Errors
-from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
from .. import util
+from ..errors import Errors
+from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
+from ..training import Corpus
+from ._util import Arg, Opt, app, import_code, setup_gpu
_DEFAULTS = {
"n_trials": 11,
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index d82bf3fbc..8bfc6b54f 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,15 +1,15 @@
-from typing import Optional, Dict, Any, Union, List
-import platform
import json
+import platform
from pathlib import Path
-from wasabi import Printer, MarkdownRenderer
-import srsly
+from typing import Any, Dict, List, Optional, Union
-from ._util import app, Arg, Opt, string_to_list
-from .download import get_model_filename, get_latest_version
-from .. import util
-from .. import about
+import srsly
+from wasabi import MarkdownRenderer, Printer
+
+from .. import about, util
from ..compat import importlib_metadata
+from ._util import Arg, Opt, app, string_to_list
+from .download import get_latest_version, get_model_filename
@app.command("info")
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index b634caa4c..a7c03d00f 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -1,19 +1,26 @@
-from typing import Optional, List, Tuple
+import re
from enum import Enum
from pathlib import Path
-from wasabi import Printer, diff_strings
-from thinc.api import Config
+from typing import List, Optional, Tuple
+
import srsly
-import re
from jinja2 import Template
+from thinc.api import Config
+from wasabi import Printer, diff_strings
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code
-
+from ._util import (
+ COMMAND,
+ Arg,
+ Opt,
+ import_code,
+ init_cli,
+ show_validation_error,
+ string_to_list,
+)
ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index d53a61b8e..13202cb60 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -1,15 +1,23 @@
-from typing import Optional
import logging
from pathlib import Path
-from wasabi import msg
-import typer
+from typing import Optional
+
import srsly
+import typer
+from wasabi import msg
from .. import util
-from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+ Arg,
+ Opt,
+ import_code,
+ init_cli,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+)
@init_cli.command("vectors")
@@ -24,6 +32,7 @@ def init_vectors_cli(
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+ attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
# fmt: on
):
"""Convert word vectors for use with spaCy. Will export an nlp object that
@@ -42,6 +51,7 @@ def init_vectors_cli(
prune=prune,
name=name,
mode=mode,
+ attr=attr,
)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)
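The new `--attr` option lets vectors be keyed on a token attribute other than `ORTH` (for example `LOWER` for case-insensitive lookups). A sketch of the equivalent direct call to the underlying helper, assuming `convert_vectors` takes the keyword arguments suggested by the call site above; paths are illustrative:

```python
from pathlib import Path

import spacy
from spacy.training.initialize import convert_vectors

nlp = spacy.blank("en")
# Equivalent of: python -m spacy init vectors en vectors.txt ./vectors_model --attr LOWER
convert_vectors(nlp, Path("vectors.txt"), truncate=0, prune=-1, mode="default", attr="LOWER")
nlp.to_disk("./vectors_model")
```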
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 6351f28eb..4545578e6 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -1,18 +1,18 @@
-from typing import Optional, Union, Any, Dict, List, Tuple, cast
-import shutil
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer, get_raw_input
-from thinc.api import Config
-from collections import defaultdict
-from catalogue import RegistryError
-import srsly
-import sys
import re
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
-from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
-from ..schemas import validate, ModelMetaSchema
-from .. import util
-from .. import about
+import srsly
+from catalogue import RegistryError
+from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, get_raw_input
+
+from .. import about, util
+from ..schemas import ModelMetaSchema, validate
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@app.command("package")
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 381d589cf..446c40510 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,13 +1,21 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
import re
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
from ..training.pretrain import pretrain
from ..util import load_config
+from ._util import (
+ Arg,
+ Opt,
+ app,
+ import_code,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+)
@app.command(
@@ -23,6 +31,7 @@ def pretrain_cli(
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+ skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
# fmt: on
):
"""
@@ -74,6 +83,7 @@ def pretrain_cli(
epoch_resume=epoch_resume,
use_gpu=use_gpu,
silent=False,
+ skip_last=skip_last,
)
msg.good("Successfully finished pretrain")
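`--skip-last` (`-L`) suppresses writing `model-last.bin` at the end of pretraining, keeping only the periodic checkpoints. A sketch of the equivalent direct call, assuming `pretrain`'s keyword arguments match the call site above; paths are illustrative:

```python
from pathlib import Path

from spacy.training.pretrain import pretrain
from spacy.util import load_config

config = load_config(Path("config.cfg"), interpolate=False)
pretrain(
    config,
    Path("pretrain_output"),
    use_gpu=-1,
    silent=False,
    skip_last=True,  # don't save model-last.bin
)
```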
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index 3c282c73d..e1f720327 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -1,17 +1,18 @@
-from typing import Optional, Sequence, Union, Iterator
-import tqdm
-from pathlib import Path
-import srsly
import cProfile
+import itertools
import pstats
import sys
-import itertools
-from wasabi import msg, Printer
-import typer
+from pathlib import Path
+from typing import Iterator, Optional, Sequence, Union
+
+import srsly
+import tqdm
+import typer
+from wasabi import Printer, msg
-from ._util import app, debug_cli, Arg, Opt, NAME
from ..language import Language
from ..util import load_model
+from ._util import NAME, Arg, Opt, app, debug_cli
@debug_cli.command("profile")
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 9481e53be..e3ca73cfb 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
@@ -28,7 +28,7 @@ lang = "{{ lang }}"
tok2vec/transformer. #}
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components -%}
@@ -127,6 +127,30 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}
+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
@@ -392,6 +416,27 @@ nO = null
width = ${components.tok2vec.model.encode.width}
{% endif %}
+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
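The template now wires up the new `span_finder` component in both the transformer and CPU branches. Outside the quickstart, it can be added to a pipeline directly, assuming the `span_finder` factory ships with a usable default config in v3.6:

```python
import spacy

nlp = spacy.blank("en")
# span_finder learns to propose candidate spans, stored under spans_key
# ("sc" by default), for a downstream span categorizer to label.
nlp.add_pipe("span_finder", config={"spans_key": "sc"})
print(nlp.pipe_names)  # ["span_finder"]
```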
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index cc22cbba6..8bdabd39c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,15 +1,23 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
import logging
import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
from .. import util
+from ..training.initialize import init_nlp
+from ..training.loop import train as train_nlp
+from ._util import (
+ Arg,
+ Opt,
+ app,
+ import_code,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+)
@app.command(
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index a918e9a39..0426f05fd 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -1,14 +1,21 @@
-from typing import Tuple
-from pathlib import Path
import sys
-import requests
-from wasabi import msg, Printer
import warnings
+from pathlib import Path
+from typing import Tuple
+
+import requests
+from wasabi import Printer, msg
-from ._util import app
from .. import about
-from ..util import get_package_version, get_installed_models, get_minor_version
-from ..util import get_package_path, get_model_meta, is_compatible_version
+from ..util import (
+ get_installed_models,
+ get_minor_version,
+ get_model_meta,
+ get_package_path,
+ get_package_version,
+ is_compatible_version,
+)
+from ._util import app
@app.command("validate")
diff --git a/spacy/compat.py b/spacy/compat.py
index 89132735d..522fa30dd 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -1,5 +1,6 @@
"""Helpers for Python and platform compatibility."""
import sys
+
from thinc.util import copy_array
try:
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index ea6bba2c9..bde2d04fe 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
-from typing import Union, Iterable, Optional, Dict, Any, Callable
import warnings
+from typing import Any, Callable, Dict, Iterable, Optional, Union
-from .render import DependencyRenderer, EntityRenderer, SpanRenderer
-from ..tokens import Doc, Span
from ..errors import Errors, Warnings
-from ..util import is_in_jupyter
-from ..util import find_available_port
-
+from ..tokens import Doc, Span
+from ..util import find_available_port, is_in_jupyter
+from .render import DependencyRenderer, EntityRenderer, SpanRenderer
_html = {}
RENDER_WRAPPER = None
@@ -68,7 +66,7 @@ def render(
if jupyter or (jupyter is None and is_in_jupyter()):
# return HTML rendered by IPython display()
# See #4840 for details on span wrapper to disable mathjax
- from IPython.core.display import display, HTML
+ from IPython.core.display import HTML, display
return display(HTML('{}'.format(html)))
return html
@@ -125,13 +123,17 @@ def app(environ, start_response):
return [res]
-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+ orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
"""Generate dependency parse in {'words': [], 'arcs': []} format.
- orig_doc (Doc): Document to parse.
+ orig_doc (Union[Doc, Span]): Document to parse.
options (Dict[str, Any]): Dependency parse specific visualisation options.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
+ if isinstance(orig_doc, Span):
+ orig_doc = orig_doc.as_doc()
doc = Doc(orig_doc.vocab).from_bytes(
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
)
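`parse_deps` now accepts a `Span` and converts it with `Span.as_doc()`, so a single sentence can be parsed for visualization without manually rebuilding a `Doc`. For example, assuming `en_core_web_sm` is installed:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza. Then she left.")
sent = next(doc.sents)              # a Span, not a Doc
parsed = displacy.parse_deps(sent)  # previously required a Doc
print(parsed["words"][:3])
```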
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index f74222dc2..86869e3b8 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -1,15 +1,29 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
-import uuid
import itertools
+import uuid
+from typing import Any, Dict, List, Optional, Tuple, Union
from ..errors import Errors
from ..util import escape_html, minify_html, registry
-from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
-from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
-from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
-from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
-from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
-from .templates import TPL_TITLE
+from .templates import (
+ TPL_DEP_ARCS,
+ TPL_DEP_SVG,
+ TPL_DEP_WORDS,
+ TPL_DEP_WORDS_LEMMA,
+ TPL_ENT,
+ TPL_ENT_RTL,
+ TPL_ENTS,
+ TPL_FIGURE,
+ TPL_KB_LINK,
+ TPL_PAGE,
+ TPL_SPAN,
+ TPL_SPAN_RTL,
+ TPL_SPAN_SLICE,
+ TPL_SPAN_SLICE_RTL,
+ TPL_SPAN_START,
+ TPL_SPAN_START_RTL,
+ TPL_SPANS,
+ TPL_TITLE,
+)
DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"
diff --git a/spacy/errors.py b/spacy/errors.py
index 526c4d0d6..24b60f8a3 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,4 +1,5 @@
import warnings
+
from .compat import Literal
@@ -215,6 +216,9 @@ class Warnings(metaclass=ErrorsWithCodes):
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+ W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
+ "key attribute for vectors, configure it through Vectors(attr=) or "
+ "'spacy init vectors --attr'")
class Errors(metaclass=ErrorsWithCodes):
@@ -549,8 +553,8 @@ class Errors(metaclass=ErrorsWithCodes):
"during training, make sure to include it in 'annotating components'")
# New errors added in v3.x
- E850 = ("The PretrainVectors objective currently only supports default "
- "vectors, not {mode} vectors.")
+ E850 = ("The PretrainVectors objective currently only supports default or "
+ "floret vectors, not {mode} vectors.")
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
"but found value of '{val}'.")
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
@@ -736,8 +740,8 @@ class Errors(metaclass=ErrorsWithCodes):
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
"load the model, use its full name instead:\n\n"
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
- "models, see the models directory: https://spacy.io/models. If you "
- "want to create a blank model, use spacy.blank: "
+ "models, see the models directory: https://spacy.io/models and if "
+ "you want to create a blank model, use spacy.blank: "
"nlp = spacy.blank(\"{name}\")")
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
"return an initialized nlp object but got: {value}. Maybe "
@@ -968,6 +972,13 @@ class Errors(metaclass=ErrorsWithCodes):
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_select_port=True` to pick an available port automatically.")
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
+ E1052 = ("Unable to copy spans: the character offsets for the span at "
+ "index {i} in the span group do not align with the tokenization "
+ "in the target doc.")
+ E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+ " 'min_length': {min_length}, 'max_length': {max_length}")
+ E1054 = ("The text, including whitespace, must match between reference and "
+ "predicted docs when training {component}.")
# Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/glossary.py b/spacy/glossary.py
index d2240fbba..1f628698b 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -1,4 +1,5 @@
import warnings
+
from .errors import Warnings
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index 1d70a9b34..3ce3e4c33 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -1,3 +1,3 @@
+from .candidate import Candidate, get_candidates, get_candidates_batch
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, get_candidates, get_candidates_batch
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index 942ce9dd0..9fc4c4e9d 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -1,6 +1,8 @@
-from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
+
from ..typedefs cimport hash_t
+from .kb cimport KnowledgeBase
+
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index c89efeb03..4cd734f43 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,9 +1,12 @@
# cython: infer_types=True, profile=True
from typing import Iterable
+
from .kb cimport KnowledgeBase
+
from ..tokens import Span
+
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
diff --git a/spacy/kb/kb.pxd b/spacy/kb/kb.pxd
index 1adeef8ae..263469546 100644
--- a/spacy/kb/kb.pxd
+++ b/spacy/kb/kb.pxd
@@ -2,8 +2,10 @@
from cymem.cymem cimport Pool
from libc.stdint cimport int64_t
+
from ..vocab cimport Vocab
+
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index ce4bc0138..a88e18e1f 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -2,12 +2,13 @@
from pathlib import Path
from typing import Iterable, Tuple, Union
+
from cymem.cymem cimport Pool
-from .candidate import Candidate
+from ..errors import Errors
from ..tokens import Span
from ..util import SimpleFrozenList
-from ..errors import Errors
+from .candidate import Candidate
cdef class KnowledgeBase:
diff --git a/spacy/kb/kb_in_memory.pxd b/spacy/kb/kb_in_memory.pxd
index 825a6bde9..08ec6b2a3 100644
--- a/spacy/kb/kb_in_memory.pxd
+++ b/spacy/kb/kb_in_memory.pxd
@@ -1,11 +1,11 @@
"""Knowledge-base for entity or concept linking."""
-from preshed.maps cimport PreshMap
-from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
+from ..structs cimport AliasC, KBEntryC
from ..typedefs cimport hash_t
-from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 2a74d047b..e991f7720 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -1,23 +1,28 @@
# cython: infer_types=True, profile=True
-from typing import Iterable, Callable, Dict, Any, Union
+from typing import Any, Callable, Dict, Iterable, Union
import srsly
-from preshed.maps cimport PreshMap
-from cpython.exc cimport PyErr_SetFromErrno
-from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
-from libc.stdint cimport int32_t, int64_t
-from libcpp.vector cimport vector
-from pathlib import Path
+from cpython.exc cimport PyErr_SetFromErrno
+from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
+
import warnings
+from pathlib import Path
from ..tokens import Span
+
from ..typedefs cimport hash_t
-from ..errors import Errors, Warnings
+
from .. import util
+from ..errors import Errors, Warnings
from ..util import SimpleFrozenList, ensure_path
+
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase
+
from .candidate import Candidate as Candidate
diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py
index 553fcbf4c..8bd73c7ad 100644
--- a/spacy/lang/af/__init__.py
+++ b/spacy/lang/af/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class AfrikaansDefaults(BaseDefaults):
diff --git a/spacy/lang/am/__init__.py b/spacy/lang/am/__init__.py
index ddae556d6..284823eaa 100644
--- a/spacy/lang/am/__init__.py
+++ b/spacy/lang/am/__init__.py
@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-
+from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
class AmharicDefaults(BaseDefaults):
diff --git a/spacy/lang/am/punctuation.py b/spacy/lang/am/punctuation.py
index 555a179fa..87447b054 100644
--- a/spacy/lang/am/punctuation.py
+++ b/spacy/lang/am/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
diff --git a/spacy/lang/am/tokenizer_exceptions.py b/spacy/lang/am/tokenizer_exceptions.py
index 9472fe918..1ccf996ca 100644
--- a/spacy/lang/am/tokenizer_exceptions.py
+++ b/spacy/lang/am/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py
index 18c1f90ed..d50b0722c 100644
--- a/spacy/lang/ar/__init__.py
+++ b/spacy/lang/ar/__init__.py
@@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class ArabicDefaults(BaseDefaults):
diff --git a/spacy/lang/ar/punctuation.py b/spacy/lang/ar/punctuation.py
index f30204c02..cf03fc68e 100644
--- a/spacy/lang/ar/punctuation.py
+++ b/spacy/lang/ar/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_suffixes = (
LIST_PUNCT
diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py
index 7c385bef8..eb16876f5 100644
--- a/spacy/lang/ar/tokenizer_exceptions.py
+++ b/spacy/lang/ar/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py
index 476898364..32949aa3e 100644
--- a/spacy/lang/az/__init__.py
+++ b/spacy/lang/az/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class AzerbaijaniDefaults(BaseDefaults):
diff --git a/spacy/lang/az/lex_attrs.py b/spacy/lang/az/lex_attrs.py
index 73a5e2762..96fb7f020 100644
--- a/spacy/lang/az/lex_attrs.py
+++ b/spacy/lang/az/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# Eleven, twelve etc. are written separate: on bir, on iki
_num_words = [
diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py
index c9176b946..acca63ba1 100644
--- a/spacy/lang/bg/__init__.py
+++ b/spacy/lang/bg/__init__.py
@@ -1,12 +1,14 @@
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..punctuation import (
+ COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+ COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
class BulgarianDefaults(BaseDefaults):
diff --git a/spacy/lang/bg/lex_attrs.py b/spacy/lang/bg/lex_attrs.py
index bba3c74cd..0b7942aec 100644
--- a/spacy/lang/bg/lex_attrs.py
+++ b/spacy/lang/bg/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"нула",
"едно",
diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py
index 0f484b778..89d466daf 100644
--- a/spacy/lang/bg/tokenizer_exceptions.py
+++ b/spacy/lang/bg/tokenizer_exceptions.py
@@ -4,8 +4,7 @@ References:
(countries, occupations, fields of studies and more).
"""
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 6d0331e00..6a5d37f5b 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -1,10 +1,12 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class BengaliDefaults(BaseDefaults):
diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py
index becfe8d2a..ddb91cef1 100644
--- a/spacy/lang/bn/punctuation.py
+++ b/spacy/lang/bn/punctuation.py
@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_currency = r"\$¢£€¥฿৳"
_quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py
index e666522b8..016bf0fc5 100644
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py
index a3def660d..8b2f3e85a 100755
--- a/spacy/lang/ca/__init__.py
+++ b/spacy/lang/ca/__init__.py
@@ -1,14 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from .lemmatizer import CatalanLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class CatalanDefaults(BaseDefaults):
diff --git a/spacy/lang/ca/lex_attrs.py b/spacy/lang/ca/lex_attrs.py
index be8b7a6ea..3e99da0e0 100644
--- a/spacy/lang/ca/lex_attrs.py
+++ b/spacy/lang/ca/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"zero",
"un",
diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py
index 8e2f09828..6914f67a7 100755
--- a/spacy/lang/ca/punctuation.py
+++ b/spacy/lang/ca/punctuation.py
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import LIST_CURRENCY
-from ..char_classes import CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-from ..char_classes import merge_chars, _units
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ _units,
+ merge_chars,
+)
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
diff --git a/spacy/lang/ca/syntax_iterators.py b/spacy/lang/ca/syntax_iterators.py
index 917e07c93..16a4c6a81 100644
--- a/spacy/lang/ca/syntax_iterators.py
+++ b/spacy/lang/ca/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import NOUN, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py
index b261b3498..67165780e 100755
--- a/spacy/lang/ca/tokenizer_exceptions.py
+++ b/spacy/lang/ca/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py
index 3e70e4078..9ea60afdf 100644
--- a/spacy/lang/cs/__init__.py
+++ b/spacy/lang/cs/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class CzechDefaults(BaseDefaults):
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index e148a7b4f..372f372dd 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class DanishDefaults(BaseDefaults):
diff --git a/spacy/lang/da/lex_attrs.py b/spacy/lang/da/lex_attrs.py
index 403af686c..8e0420912 100644
--- a/spacy/lang/da/lex_attrs.py
+++ b/spacy/lang/da/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# Source http://fjern-uv.dk/tal.php
_num_words = """nul
en et to tre fire fem seks syv otte ni ti
diff --git a/spacy/lang/da/punctuation.py b/spacy/lang/da/punctuation.py
index e050ab7aa..f70fe3d64 100644
--- a/spacy/lang/da/punctuation.py
+++ b/spacy/lang/da/punctuation.py
@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
diff --git a/spacy/lang/da/syntax_iterators.py b/spacy/lang/da/syntax_iterators.py
index a0b70f004..60224f0b1 100644
--- a/spacy/lang/da/syntax_iterators.py
+++ b/spacy/lang/da/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index ce25c546b..649d12022 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -2,10 +2,9 @@
Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others.
"""
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 65863c098..4f45b2357 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GermanDefaults(BaseDefaults):
diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py
index 69d402237..862207649 100644
--- a/spacy/lang/de/punctuation.py
+++ b/spacy/lang/de/punctuation.py
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
_suffixes = (
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index e80504998..544fe299c 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 21d99cffe..3f1aeeccd 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
diff --git a/spacy/lang/dsb/__init__.py b/spacy/lang/dsb/__init__.py
index c66092a0c..096eced19 100644
--- a/spacy/lang/dsb/__init__.py
+++ b/spacy/lang/dsb/__init__.py
@@ -1,6 +1,6 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class LowerSorbianDefaults(BaseDefaults):
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 53dd9be8e..00e52bd97 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
from .lemmatizer import GreekLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GreekDefaults(BaseDefaults):
diff --git a/spacy/lang/el/get_pos_from_wiktionary.py b/spacy/lang/el/get_pos_from_wiktionary.py
index 369973cc0..10b54d112 100644
--- a/spacy/lang/el/get_pos_from_wiktionary.py
+++ b/spacy/lang/el/get_pos_from_wiktionary.py
@@ -1,5 +1,6 @@
def get_pos_from_wiktionary():
import re
+
from gensim.corpora.wikicorpus import extract_pages
regex = re.compile(r"==={{(\w+)\|el}}===")
diff --git a/spacy/lang/el/punctuation.py b/spacy/lang/el/punctuation.py
index 2d5690407..b8b717bac 100644
--- a/spacy/lang/el/punctuation.py
+++ b/spacy/lang/el/punctuation.py
@@ -1,6 +1,16 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES, CURRENCY
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ HYPHENS,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+)
_units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index 18fa46695..31c7dccf7 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py
index 0a36d5d2b..41317ba97 100644
--- a/spacy/lang/el/tokenizer_exceptions.py
+++ b/spacy/lang/el/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 876186979..c4bcfb938 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
from .lemmatizer import EnglishLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class EnglishDefaults(BaseDefaults):
diff --git a/spacy/lang/en/punctuation.py b/spacy/lang/en/punctuation.py
index 5d3eb792e..775c6b001 100644
--- a/spacy/lang/en/punctuation.py
+++ b/spacy/lang/en/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 7904e5621..140ae0a5c 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 7886e28cb..dd3650c18 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -1,8 +1,8 @@
from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc: Dict[str, List[Dict]] = {}
_exclude = [
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index e75955202..bcaed8672 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+
+from ...language import BaseDefaults, Language
from .lemmatizer import SpanishLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class SpanishDefaults(BaseDefaults):
diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py
index ca5fc08c8..44f968347 100644
--- a/spacy/lang/es/lemmatizer.py
+++ b/spacy/lang/es/lemmatizer.py
@@ -1,5 +1,5 @@
-from typing import List, Optional, Tuple
import re
+from typing import List, Optional, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py
index 9d1fa93b8..4c477eaee 100644
--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"cero",
"uno",
diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py
index e9552371e..3d20518cd 100644
--- a/spacy/lang/es/punctuation.py
+++ b/spacy/lang/es/punctuation.py
@@ -1,8 +1,17 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
-from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import merge_chars
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ LIST_UNITS,
+ PUNCT,
+ merge_chars,
+)
_list_units = [u for u in LIST_UNITS if u != "%"]
_units = merge_chars(" ".join(_list_units))
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index f2ca2a678..96df444a3 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 74cdc143d..2ea0ed8b7 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py
index 274bc1309..9ec7e6006 100644
--- a/spacy/lang/et/__init__.py
+++ b/spacy/lang/et/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class EstonianDefaults(BaseDefaults):
diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py
index 3346468bd..81f9c4a18 100644
--- a/spacy/lang/eu/__init__.py
+++ b/spacy/lang/eu/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class BasqueDefaults(BaseDefaults):
diff --git a/spacy/lang/eu/punctuation.py b/spacy/lang/eu/punctuation.py
index 5d35d0a25..382bfc75c 100644
--- a/spacy/lang/eu/punctuation.py
+++ b/spacy/lang/eu/punctuation.py
@@ -1,4 +1,3 @@
from ..punctuation import TOKENIZER_SUFFIXES
-
_suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 914e4c27d..e5baa8b4a 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_SUFFIXES
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class PersianDefaults(BaseDefaults):
diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py
index 99b8e2787..065e81bd6 100644
--- a/spacy/lang/fa/lex_attrs.py
+++ b/spacy/lang/fa/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
MIM = "م"
ZWNJ_O_MIM = "ام"
YE_NUN = "ین"
diff --git a/spacy/lang/fa/punctuation.py b/spacy/lang/fa/punctuation.py
index 4b258c13d..c1ee570ce 100644
--- a/spacy/lang/fa/punctuation.py
+++ b/spacy/lang/fa/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_suffixes = (
LIST_PUNCT
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index 8207884b0..3052369a7 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/fa/tokenizer_exceptions.py b/spacy/lang/fa/tokenizer_exceptions.py
index 30df798ab..3b31b7f67 100644
--- a/spacy/lang/fa/tokenizer_exceptions.py
+++ b/spacy/lang/fa/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
TOKENIZER_EXCEPTIONS = {
".ق ": [{ORTH: ".ق "}],
diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py
index c3a0cf451..3e371b9b5 100644
--- a/spacy/lang/fi/__init__.py
+++ b/spacy/lang/fi/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class FinnishDefaults(BaseDefaults):
diff --git a/spacy/lang/fi/lex_attrs.py b/spacy/lang/fi/lex_attrs.py
index 4d500cead..9eec41b3d 100644
--- a/spacy/lang/fi/lex_attrs.py
+++ b/spacy/lang/fi/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"nolla",
"yksi",
diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py
index 6e14dde38..29ddc3111 100644
--- a/spacy/lang/fi/punctuation.py
+++ b/spacy/lang/fi/punctuation.py
@@ -1,8 +1,14 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_HYPHENS,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_quotes = CONCAT_QUOTES.replace("'", "")
DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
diff --git a/spacy/lang/fi/syntax_iterators.py b/spacy/lang/fi/syntax_iterators.py
index 6b481e51f..6e2216713 100644
--- a/spacy/lang/fi/syntax_iterators.py
+++ b/spacy/lang/fi/syntax_iterators.py
@@ -1,7 +1,8 @@
from typing import Iterator, Tuple, Union
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON
+
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index 465333b0a..881d5b91d 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 27d2a915e..a8bc7f53e 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -1,15 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
+from ...language import BaseDefaults, Language
from .lemmatizer import FrenchLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class FrenchDefaults(BaseDefaults):
diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py
index 811312ad7..9cf508a07 100644
--- a/spacy/lang/fr/lex_attrs.py
+++ b/spacy/lang/fr/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
zero un une deux trois quatre cinq six sept huit neuf dix
diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py
index 873d01d87..a3b178a2f 100644
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@@ -1,8 +1,16 @@
-from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-from ..char_classes import merge_chars
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+ merge_chars,
+)
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
ELISION = "' ’".replace(" ", "")
HYPHENS = r"- – — ‐ ‑".replace(" ", "")
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 5849c40b3..a6bf3d3ca 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 2e88b58cf..fa2062ef9 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -1,11 +1,10 @@
import re
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from .punctuation import ELISION, HYPHENS
-from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH
from ...util import update_exc
-
+from ..char_classes import ALPHA, ALPHA_LOWER
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .punctuation import ELISION, HYPHENS
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
index 3be53bc7a..6f9a27a14 100644
--- a/spacy/lang/ga/__init__.py
+++ b/spacy/lang/ga/__init__.py
@@ -2,10 +2,10 @@ from typing import Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from .lemmatizer import IrishLemmatizer
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class IrishDefaults(BaseDefaults):
diff --git a/spacy/lang/ga/lemmatizer.py b/spacy/lang/ga/lemmatizer.py
index 47aec8fd4..c9fbfbc19 100644
--- a/spacy/lang/ga/lemmatizer.py
+++ b/spacy/lang/ga/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index 63af65fe9..eb4b413fb 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"'acha'n": [{ORTH: "'ach", NORM: "gach"}, {ORTH: "a'n", NORM: "aon"}],
diff --git a/spacy/lang/grc/__init__.py b/spacy/lang/grc/__init__.py
index 019b3802e..ed742f4c5 100644
--- a/spacy/lang/grc/__init__.py
+++ b/spacy/lang/grc/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class AncientGreekDefaults(BaseDefaults):
diff --git a/spacy/lang/grc/lex_attrs.py b/spacy/lang/grc/lex_attrs.py
index 0ab15e6fd..33cfca05b 100644
--- a/spacy/lang/grc/lex_attrs.py
+++ b/spacy/lang/grc/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
# CARDINALS
"εἷς",
diff --git a/spacy/lang/grc/punctuation.py b/spacy/lang/grc/punctuation.py
index 8f3589e9a..8e9fc8bf2 100644
--- a/spacy/lang/grc/punctuation.py
+++ b/spacy/lang/grc/punctuation.py
@@ -1,6 +1,15 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+)
_prefixes = (
[
diff --git a/spacy/lang/grc/tokenizer_exceptions.py b/spacy/lang/grc/tokenizer_exceptions.py
index bcee70f32..86527ff61 100644
--- a/spacy/lang/grc/tokenizer_exceptions.py
+++ b/spacy/lang/grc/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py
index e6fbc9d18..2f22034c1 100644
--- a/spacy/lang/gu/__init__.py
+++ b/spacy/lang/gu/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class GujaratiDefaults(BaseDefaults):
diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py
index dd2ee478d..07084acf1 100644
--- a/spacy/lang/he/__init__.py
+++ b/spacy/lang/he/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class HebrewDefaults(BaseDefaults):
diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py
index 4c8ae446d..980dc31c1 100644
--- a/spacy/lang/hi/__init__.py
+++ b/spacy/lang/hi/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class HindiDefaults(BaseDefaults):
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py
index ee845e8b1..4ecd1db66 100644
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@@ -1,6 +1,5 @@
+from ...attrs import LIKE_NUM, NORM
from ..norm_exceptions import BASE_NORMS
-from ...attrs import NORM, LIKE_NUM
-
# fmt: off
_stem_suffixes = [
diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py
index 30870b522..fd7622a3d 100644
--- a/spacy/lang/hr/__init__.py
+++ b/spacy/lang/hr/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class CroatianDefaults(BaseDefaults):
diff --git a/spacy/lang/hsb/__init__.py b/spacy/lang/hsb/__init__.py
index 034d82319..e8b2ffc9f 100644
--- a/spacy/lang/hsb/__init__.py
+++ b/spacy/lang/hsb/__init__.py
@@ -1,7 +1,7 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class UpperSorbianDefaults(BaseDefaults):
diff --git a/spacy/lang/hsb/tokenizer_exceptions.py b/spacy/lang/hsb/tokenizer_exceptions.py
index 4b9a4f98a..cd3bac913 100644
--- a/spacy/lang/hsb/tokenizer_exceptions.py
+++ b/spacy/lang/hsb/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = dict()
for exc_data in [
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 9426bacea..799e6d230 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -1,7 +1,7 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class HungarianDefaults(BaseDefaults):
diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index f827cd677..dbf93c622 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
-from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_ICONS,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
# removing ° from the special icons to keep e.g. 99° as one token
_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py
index ffaa74f50..3f79b02d2 100644
--- a/spacy/lang/hu/tokenizer_exceptions.py
+++ b/spacy/lang/hu/tokenizer_exceptions.py
@@ -1,10 +1,9 @@
import re
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import ALPHA_LOWER, CURRENCY
from ...symbols import ORTH
from ...util import update_exc
-
+from ..punctuation import ALPHA_LOWER, CURRENCY
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py
index 481eaae0a..e00d4fd11 100644
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class ArmenianDefaults(BaseDefaults):
diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py
index 9c9c0380c..4c96b8ab5 100644
--- a/spacy/lang/hy/lex_attrs.py
+++ b/spacy/lang/hy/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"զրո",
"մեկ",
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index 0d72cfa9d..93eb3214a 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -1,9 +1,9 @@
-from .stop_words import STOP_WORDS
-from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class IndonesianDefaults(BaseDefaults):
diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py
index 3167f4659..5952c4d06 100644
--- a/spacy/lang/id/lex_attrs.py
+++ b/spacy/lang/id/lex_attrs.py
@@ -1,8 +1,7 @@
import unicodedata
-from .punctuation import LIST_CURRENCY
from ...attrs import IS_CURRENCY, LIKE_NUM
-
+from .punctuation import LIST_CURRENCY
_num_words = [
"nol",
diff --git a/spacy/lang/id/punctuation.py b/spacy/lang/id/punctuation.py
index f6c2387d8..8303b8eaa 100644
--- a/spacy/lang/id/punctuation.py
+++ b/spacy/lang/id/punctuation.py
@@ -1,6 +1,5 @@
-from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units
-
+from ..char_classes import ALPHA, _currency, _units, merge_chars, split_chars
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
_units = (
_units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index fa984d411..027798687 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py
index ff77ede9f..8dea4e97f 100644
--- a/spacy/lang/id/tokenizer_exceptions.py
+++ b/spacy/lang/id/tokenizer_exceptions.py
@@ -1,8 +1,7 @@
+from ...symbols import NORM, ORTH
+from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc
-
# List of abbreviations and acronyms from:
# https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py
index 318363beb..af1260045 100644
--- a/spacy/lang/is/__init__.py
+++ b/spacy/lang/is/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class IcelandicDefaults(BaseDefaults):
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index ecf322bd7..14458d811 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -1,12 +1,13 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from .lemmatizer import ItalianLemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class ItalianDefaults(BaseDefaults):
diff --git a/spacy/lang/it/lemmatizer.py b/spacy/lang/it/lemmatizer.py
index e44e64e3a..bf869166d 100644
--- a/spacy/lang/it/lemmatizer.py
+++ b/spacy/lang/it/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py
index f01ab4f0d..51318b22d 100644
--- a/spacy/lang/it/punctuation.py
+++ b/spacy/lang/it/punctuation.py
@@ -1,8 +1,13 @@
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
-
ELISION = "'’"
diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py
index f63df3fad..924627648 100644
--- a/spacy/lang/it/syntax_iterators.py
+++ b/spacy/lang/it/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py
index 42883863b..2e7a5a1a3 100644
--- a/spacy/lang/it/tokenizer_exceptions.py
+++ b/spacy/lang/it/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index bf86305fb..0d5f97ac8 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -1,27 +1,27 @@
-from typing import Optional, Union, Dict, Any, Callable
-from pathlib import Path
-import srsly
-from collections import namedtuple
-from thinc.api import Model
import re
+from collections import namedtuple
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional, Union
-from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .tag_map import TAG_MAP
-from .tag_orth_map import TAG_ORTH_MAP
-from .tag_bigram_map import TAG_BIGRAM_MAP
+import srsly
+from thinc.api import Model
+
+from ... import util
from ...errors import Errors
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from ...pipeline import Morphologizer
from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
from ...scorer import Scorer
from ...symbols import POS
from ...tokens import Doc, MorphAnalysis
from ...training import validate_examples
-from ...util import DummyTokenizer, registry, load_config_from_str
+from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
-from ... import util
-
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tag_bigram_map import TAG_BIGRAM_MAP
+from .tag_map import TAG_MAP
+from .tag_orth_map import TAG_ORTH_MAP
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py
index 588a9ba03..34670083e 100644
--- a/spacy/lang/ja/syntax_iterators.py
+++ b/spacy/lang/ja/syntax_iterators.py
@@ -1,9 +1,8 @@
-from typing import Union, Iterator, Tuple, Set
+from typing import Iterator, Set, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON, VERB
+from ...symbols import NOUN, PRON, PROPN, VERB
from ...tokens import Doc, Span
-
# TODO: this can probably be pruned a bit
# fmt: off
labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclike", "obl", "dative", "appos", "attr", "ROOT"]
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
index c6de3831a..5c14f41bf 100644
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -1,6 +1,23 @@
-from ...symbols import POS, PUNCT, INTJ, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE, CCONJ
-
+from ...symbols import (
+ ADJ,
+ ADP,
+ ADV,
+ AUX,
+ CCONJ,
+ DET,
+ INTJ,
+ NOUN,
+ NUM,
+ PART,
+ POS,
+ PRON,
+ PROPN,
+ PUNCT,
+ SCONJ,
+ SPACE,
+ SYM,
+ VERB,
+)
TAG_MAP = {
# Explanation of Unidic tags:
diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py
index ccd46a394..44d53f6b7 100644
--- a/spacy/lang/kn/__init__.py
+++ b/spacy/lang/kn/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class KannadaDefaults(BaseDefaults):
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 0e02e4a2d..e2c860f7d 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,17 +1,16 @@
-from typing import Iterator, Any, Dict
+from typing import Any, Dict, Iterator
+from ...language import BaseDefaults, Language
+from ...scorer import Scorer
+from ...symbols import POS, X
+from ...tokens import Doc
+from ...training import validate_examples
+from ...util import DummyTokenizer, load_config_from_str, registry
+from ...vocab import Vocab
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
-from ...tokens import Doc
-from ...scorer import Scorer
-from ...symbols import POS, X
-from ...training import validate_examples
-from ...util import DummyTokenizer, registry, load_config_from_str
-from ...vocab import Vocab
-
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/ko/lex_attrs.py b/spacy/lang/ko/lex_attrs.py
index ac5bc7e48..2c49aa389 100644
--- a/spacy/lang/ko/lex_attrs.py
+++ b/spacy/lang/ko/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"영",
"공",
diff --git a/spacy/lang/ko/punctuation.py b/spacy/lang/ko/punctuation.py
index f5f1c51da..c3c32ea1f 100644
--- a/spacy/lang/ko/punctuation.py
+++ b/spacy/lang/ko/punctuation.py
@@ -1,7 +1,6 @@
from ..char_classes import LIST_QUOTES
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
-
_infixes = (
["·", "ㆍ", r"\(", r"\)"]
+ [r"(?<=[0-9])~(?=[0-9-])"]
diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py
index 26a8c56b9..85598c3ef 100644
--- a/spacy/lang/ko/tag_map.py
+++ b/spacy/lang/ko/tag_map.py
@@ -1,5 +1,21 @@
-from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON
-from ...symbols import VERB, ADV, PROPN, NUM, DET
+from ...symbols import (
+ ADJ,
+ ADP,
+ ADV,
+ AUX,
+ CONJ,
+ DET,
+ INTJ,
+ NOUN,
+ NUM,
+ POS,
+ PRON,
+ PROPN,
+ PUNCT,
+ SYM,
+ VERB,
+ X,
+)
# Map the 은전한닢 (mecab-ko-dic) part-of-speech tags to Universal POS tags
# https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265
diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py
index ccca384bd..fafc0f020 100644
--- a/spacy/lang/ky/__init__.py
+++ b/spacy/lang/ky/__init__.py
@@ -1,8 +1,8 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class KyrgyzDefaults(BaseDefaults):
diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py
index fa9819f80..6d89da2f7 100644
--- a/spacy/lang/ky/punctuation.py
+++ b/spacy/lang/ky/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "")
_infixes = (
diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py
index 8ec727ac1..c93e3dac3 100644
--- a/spacy/lang/ky/tokenizer_exceptions.py
+++ b/spacy/lang/ky/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/la/__init__.py b/spacy/lang/la/__init__.py
index 15b87c5b9..d77ae267e 100644
--- a/spacy/lang/la/__init__.py
+++ b/spacy/lang/la/__init__.py
@@ -1,13 +1,15 @@
-from ...language import Language, BaseDefaults
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class LatinDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
+ syntax_iterators = SYNTAX_ITERATORS
class Latin(Language):
diff --git a/spacy/lang/la/examples.py b/spacy/lang/la/examples.py
new file mode 100644
index 000000000..db8550070
--- /dev/null
+++ b/spacy/lang/la/examples.py
@@ -0,0 +1,22 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.la.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+# > Caes. BG 1.1
+# > Cic. De Amic. 1
+# > V. Georg. 1.1-5
+# > Gen. 1:1
+# > Galileo, Sid. Nunc.
+# > van Schurman, Opusc. arg. 1
+
+sentences = [
+ "Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.",
+ "Q. Mucius augur multa narrare de C. Laelio socero suo memoriter et iucunde solebat nec dubitare illum in omni sermone appellare sapientem.",
+ "Quid faciat laetas segetes, quo sidere terram uertere, Maecenas, ulmisque adiungere uitis conueniat, quae cura boum, qui cultus habendo sit pecori, apibus quanta experientia parcis, hinc canere incipiam",
+ "In principio creavit Deus caelum et terram.",
+ "Quo sumpto, intelligatur lunaris globus, cuius maximus circulus CAF, centrum vero E, dimetiens CF, qui ad Terre diametrum est ut duo ad septem.",
+ "Cuicunque natura indita sunt principia, seu potentiae principiorum omnium artium, ac scientiarum, ei conveniunt omnes artes ac scientiae.",
+]
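
The new `spacy/lang/la/examples.py` follows the convention of the other per-language `examples.py` modules: a handful of attested sentences (the sources are noted in the comments above each) for smoke-testing the language. A quick way to run them, assuming a spaCy install that includes this file:

```python
import spacy
from spacy.lang.la.examples import sentences

# The blank Latin pipeline is enough to exercise tokenization.
nlp = spacy.blank("la")
for doc in nlp.pipe(sentences):
    print([token.text for token in doc[:8]])
```
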
diff --git a/spacy/lang/la/lex_attrs.py b/spacy/lang/la/lex_attrs.py
index 9efb4dd3c..fcb35defc 100644
--- a/spacy/lang/la/lex_attrs.py
+++ b/spacy/lang/la/lex_attrs.py
@@ -1,22 +1,22 @@
-from ...attrs import LIKE_NUM
import re
+from ...attrs import LIKE_NUM
+
# cf. Goyvaerts/Levithan 2009; case-insensitive, allow 4
roman_numerals_compile = re.compile(
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
)
-_num_words = set(
- """
-unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
+_num_words = """unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem undecim duodecim tredecim quattuordecim quindecim sedecim septendecim duodeviginti undeviginti viginti triginta quadraginta quinquaginta sexaginta septuaginta octoginta nonaginta centum ducenti ducentae ducenta trecenti trecentae trecenta quadringenti quadringentae quadringenta quingenti quingentae quingenta sescenti sescentae sescenta septingenti septingentae septingenta octingenti octingentae octingenta nongenti nongentae nongenta mille
""".split()
-)
-_ordinal_words = set(
- """
-primus prima primum secundus secunda secundum tertius tertia tertium
-""".split()
-)
+_num_words += [item.replace("v", "u") for item in _num_words]
+_num_words = set(_num_words)
+
+_ordinal_words = """primus prima primum secundus secunda secundum tertius tertia tertium quartus quarta quartum quintus quinta quintum sextus sexta sextum septimus septima septimum octavus octava octavum nonus nona nonum decimus decima decimum undecimus undecima undecimum duodecimus duodecima duodecimum duodevicesimus duodevicesima duodevicesimum undevicesimus undevicesima undevicesimum vicesimus vicesima vicesimum tricesimus tricesima tricesimum quadragesimus quadragesima quadragesimum quinquagesimus quinquagesima quinquagesimum sexagesimus sexagesima sexagesimum septuagesimus septuagesima septuagesimum octogesimus octogesima octogesimum nonagesimus nonagesima nonagesimum centesimus centesima centesimum ducentesimus ducentesima ducentesimum trecentesimus trecentesima trecentesimum quadringentesimus quadringentesima quadringentesimum quingentesimus quingentesima quingentesimum sescentesimus sescentesima sescentesimum septingentesimus septingentesima septingentesimum octingentesimus octingentesima octingentesimum nongentesimus nongentesima nongentesimum millesimus millesima millesimum""".split()
+
+_ordinal_words += [item.replace("v", "u") for item in _ordinal_words]
+_ordinal_words = set(_ordinal_words)
def like_num(text):
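
The rewritten `lex_attrs.py` expands the cardinal and ordinal vocabularies from a few sample words to the full series up to `mille`/`millesimus`, and the `item.replace("v", "u")` lines add the u-for-v spelling variants (`novem`/`nouem`) to both sets. Together with the case-insensitive Roman-numeral regex, `LIKE_NUM` should now cover all of the following; a sketch, assuming `like_num` checks the lowercased token against both sets, as the surrounding code suggests:

```python
import spacy

nlp = spacy.blank("la")
doc = nlp("duo nouem XXVI tertius")
# All four tokens should be number-like: a cardinal, its u-spelling
# variant, a Roman numeral, and an ordinal.
print([(token.text, token.like_num) for token in doc])
```
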
diff --git a/spacy/lang/la/syntax_iterators.py b/spacy/lang/la/syntax_iterators.py
new file mode 100644
index 000000000..39b4fb39d
--- /dev/null
+++ b/spacy/lang/la/syntax_iterators.py
@@ -0,0 +1,86 @@
+from typing import Iterator, Tuple, Union
+
+from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span
+
+# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB]
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ def is_verb_token(tok):
+ return tok.pos in [VERB, AUX]
+
+ def get_left_bound(root):
+ left_bound = root
+ for tok in reversed(list(root.lefts)):
+ if tok.dep in np_left_deps:
+ left_bound = tok
+ return left_bound
+
+ def get_right_bound(doc, root):
+ right_bound = root
+ for tok in root.rights:
+ if tok.dep in np_right_deps:
+ right = get_right_bound(doc, tok)
+ if list(
+ filter(
+ lambda t: is_verb_token(t) or t.dep in stop_deps,
+ doc[root.i : right.i],
+ )
+ ):
+ break
+ else:
+ right_bound = right
+ return right_bound
+
+ def get_bounds(doc, root):
+ return get_left_bound(root), get_right_bound(doc, root)
+
+ doc = doclike.doc # Ensure works on both Doc and Span.
+
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+
+ if not len(doc):
+ return
+
+ left_labels = [
+ "det",
+ "fixed",
+ "nmod:poss",
+ "amod",
+ "flat",
+ "goeswith",
+ "nummod",
+ "appos",
+ ]
+ right_labels = [
+ "fixed",
+ "nmod:poss",
+ "amod",
+ "flat",
+ "goeswith",
+ "nummod",
+ "appos",
+ "nmod",
+ "det",
+ ]
+ stop_labels = ["punct"]
+
+ np_label = doc.vocab.strings.add("NP")
+ np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
+ np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
+ stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
+
+ prev_right = -1
+ for token in doclike:
+ if token.pos in [PROPN, NOUN, PRON]:
+ left, right = get_bounds(doc, token)
+ if left.i <= prev_right:
+ continue
+ yield left.i, right.i + 1, np_label
+ prev_right = right.i
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
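
The new Latin `noun_chunks` iterator (adapted from the Danish rules, per the comment above) grows each nominal head outward over the allowed left and right dependency labels and yields non-overlapping `NP` spans. Because it raises `E029` without dependency annotation, it needs a trained parser; in this sketch, `la_core_web_sm` is a hypothetical placeholder, not a published model:

```python
import spacy

# "la_core_web_sm" is a placeholder name; substitute any Latin
# pipeline that includes a dependency parser.
nlp = spacy.load("la_core_web_sm")
doc = nlp("Gallia est omnis divisa in partes tres.")
for chunk in doc.noun_chunks:
    print(chunk.text)
```
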
diff --git a/spacy/lang/la/tokenizer_exceptions.py b/spacy/lang/la/tokenizer_exceptions.py
index 060f6e085..c0b98116f 100644
--- a/spacy/lang/la/tokenizer_exceptions.py
+++ b/spacy/lang/la/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
## TODO: Look into systematically handling u/v
_exc = {
@@ -12,65 +11,15 @@ _exc = {
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
}
-for orth in [
- "A.",
- "Agr.",
- "Ap.",
- "C.",
- "Cn.",
- "D.",
- "F.",
- "K.",
- "L.",
- "M'.",
- "M.",
- "Mam.",
- "N.",
- "Oct.",
- "Opet.",
- "P.",
- "Paul.",
- "Post.",
- "Pro.",
- "Q.",
- "S.",
- "Ser.",
- "Sert.",
- "Sex.",
- "St.",
- "Sta.",
- "T.",
- "Ti.",
- "V.",
- "Vol.",
- "Vop.",
- "U.",
- "Uol.",
- "Uop.",
- "Ian.",
- "Febr.",
- "Mart.",
- "Apr.",
- "Mai.",
- "Iun.",
- "Iul.",
- "Aug.",
- "Sept.",
- "Oct.",
- "Nov.",
- "Nou.",
- "Dec.",
- "Non.",
- "Id.",
- "A.D.",
- "Coll.",
- "Cos.",
- "Ord.",
- "Pl.",
- "S.C.",
- "Suff.",
- "Trib.",
-]:
+_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split()
+
+_abbrev_exc += [item.lower() for item in _abbrev_exc]
+_abbrev_exc += [item.upper() for item in _abbrev_exc]
+_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc]
+
+_abbrev_exc += ["d.N."]
+
+for orth in set(_abbrev_exc):
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
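A quick way to sanity-check the expanded exception table from a source checkout (the output in the comment is the expected behaviour, not a captured run):

```python
import spacy

nlp = spacy.blank("la")
# "M." is in the abbreviation table (in all casing and u/v variants),
# so its period is not split off; the sentence-final period still is.
print([t.text for t in nlp("M. Tullius Cicero scripsit.")])
# expected: ['M.', 'Tullius', 'Cicero', 'scripsit', '.']
```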
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index 7827e7762..2386b4356 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class LuxembourgishDefaults(BaseDefaults):
diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py
index d2d50d9dc..119231374 100644
--- a/spacy/lang/lb/lex_attrs.py
+++ b/spacy/lang/lb/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py
index e382c56c5..8bdbf9713 100644
--- a/spacy/lang/lb/punctuation.py
+++ b/spacy/lang/lb/punctuation.py
@@ -1,4 +1,4 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES, LIST_ICONS
ELISION = " ' ’ ".strip().replace(" ", "")
diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py
index d00dc9610..844826e27 100644
--- a/spacy/lang/lb/tokenizer_exceptions.py
+++ b/spacy/lang/lb/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
# TODO
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 6ed981a06..3ac20420d 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -1,11 +1,10 @@
-from typing import Set
-import unicodedata
import re
+import unicodedata
+from typing import Set
from .. import attrs
from .tokenizer_exceptions import URL_MATCH
-
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
_tlds = set(
"com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
diff --git a/spacy/lang/lg/__init__.py b/spacy/lang/lg/__init__.py
index 6f7153fce..a87685375 100644
--- a/spacy/lang/lg/__init__.py
+++ b/spacy/lang/lg/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class LugandaDefaults(BaseDefaults):
diff --git a/spacy/lang/lg/punctuation.py b/spacy/lang/lg/punctuation.py
index 5d3eb792e..775c6b001 100644
--- a/spacy/lang/lg/punctuation.py
+++ b/spacy/lang/lg/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py
index b7e11f77e..3b8e972c6 100644
--- a/spacy/lang/lij/__init__.py
+++ b/spacy/lang/lij/__init__.py
@@ -1,7 +1,7 @@
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
class LigurianDefaults(BaseDefaults):
diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py
index d50b75589..c5c150d0a 100644
--- a/spacy/lang/lij/punctuation.py
+++ b/spacy/lang/lij/punctuation.py
@@ -1,6 +1,5 @@
-from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
-
+from ..punctuation import TOKENIZER_INFIXES
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py
index 52eae2c89..cf5a1af66 100644
--- a/spacy/lang/lij/tokenizer_exceptions.py
+++ b/spacy/lang/lij/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py
index 3ae000e5f..f3ea257b1 100644
--- a/spacy/lang/lt/__init__.py
+++ b/spacy/lang/lt/__init__.py
@@ -1,8 +1,8 @@
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class LithuanianDefaults(BaseDefaults):
diff --git a/spacy/lang/lt/punctuation.py b/spacy/lang/lt/punctuation.py
index 22aee0941..deef24854 100644
--- a/spacy/lang/lt/punctuation.py
+++ b/spacy/lang/lt/punctuation.py
@@ -1,9 +1,14 @@
-from ..char_classes import LIST_ICONS, LIST_ELLIPSES
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import HYPHENS
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py
index 118fb2190..d39b86dfc 100644
--- a/spacy/lang/lt/tokenizer_exceptions.py
+++ b/spacy/lang/lt/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py
index a05e5b939..fdfca5e97 100644
--- a/spacy/lang/lv/__init__.py
+++ b/spacy/lang/lv/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class LatvianDefaults(BaseDefaults):
diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py
index fa07cfef9..413f0038d 100644
--- a/spacy/lang/mk/__init__.py
+++ b/spacy/lang/mk/__init__.py
@@ -1,15 +1,16 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
+
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...lookups import Lookups
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lemmatizer import MacedonianLemmatizer
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
-from ...lookups import Lookups
class MacedonianDefaults(BaseDefaults):
diff --git a/spacy/lang/mk/lemmatizer.py b/spacy/lang/mk/lemmatizer.py
index a792095e7..f5a5eca85 100644
--- a/spacy/lang/mk/lemmatizer.py
+++ b/spacy/lang/mk/lemmatizer.py
@@ -1,5 +1,5 @@
-from typing import List
from collections import OrderedDict
+from typing import List
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/mk/tokenizer_exceptions.py b/spacy/lang/mk/tokenizer_exceptions.py
index 3b589b2a9..40f2c1d80 100644
--- a/spacy/lang/mk/tokenizer_exceptions.py
+++ b/spacy/lang/mk/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py
index 9f90605f0..0b17b8a7a 100644
--- a/spacy/lang/ml/__init__.py
+++ b/spacy/lang/ml/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class MalayalamDefaults(BaseDefaults):
diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py
index 9ac19b6a7..33a144f6b 100644
--- a/spacy/lang/ml/lex_attrs.py
+++ b/spacy/lang/ml/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# reference 2: https://www.omniglot.com/language/numbers/malayalam.htm
_num_words = [
diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py
index 3e172fa60..f980efbd0 100644
--- a/spacy/lang/mr/__init__.py
+++ b/spacy/lang/mr/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class MarathiDefaults(BaseDefaults):
diff --git a/spacy/lang/ms/__init__.py b/spacy/lang/ms/__init__.py
new file mode 100644
index 000000000..f53ebfcf2
--- /dev/null
+++ b/spacy/lang/ms/__init__.py
@@ -0,0 +1,24 @@
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class MalayDefaults(BaseDefaults):
+ tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ prefixes = TOKENIZER_PREFIXES
+ suffixes = TOKENIZER_SUFFIXES
+ infixes = TOKENIZER_INFIXES
+ syntax_iterators = SYNTAX_ITERATORS
+ lex_attr_getters = LEX_ATTRS
+ stop_words = STOP_WORDS
+
+
+class Malay(Language):
+ lang = "ms"
+ Defaults = MalayDefaults
+
+
+__all__ = ["Malay"]
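With this module in place, a blank Malay pipeline can be created the usual way; a minimal sketch:

```python
import spacy

# spacy.blank("ms") wires up the defaults above: tokenizer exceptions,
# punctuation rules, stop words, lexical attribute getters and the
# noun-chunk iterator.
nlp = spacy.blank("ms")
doc = nlp("Syarikat itu menjual kupu-kupu pada 1 Jan. 2023.")
print([t.text for t in doc])
```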
diff --git a/spacy/lang/ms/_tokenizer_exceptions_list.py b/spacy/lang/ms/_tokenizer_exceptions_list.py
new file mode 100644
index 000000000..fba1dd70f
--- /dev/null
+++ b/spacy/lang/ms/_tokenizer_exceptions_list.py
@@ -0,0 +1,1943 @@
+# from https://prpm.dbp.gov.my/cari1?keyword=
+# dbp https://en.wikipedia.org/wiki/Dewan_Bahasa_dan_Pustaka
+MS_BASE_EXCEPTIONS = set(
+ """
+aba-aba
+abah-abah
+abar-abar
+abrit-abritan
+abu-abu
+abuk-abuk
+abun-abun
+acak-acak
+acak-acakan
+acang-acang
+aci-aci
+aci-acian
+aci-acinya
+adang-adang
+adap-adapan
+adik-beradik
+aduk-adukan
+agak-agak
+agar-agar
+agut-agut
+air-cooled
+ajar-ajar
+aji-aji
+akal-akal
+akhir-akhir
+aki-aki
+alah-mengalahi
+alan-alan
+alang-alang
+alang-alangan
+alap-alap
+ali-ali
+alih-alih
+aling-aling
+aling-alingan
+alip-alipan
+alon-alon
+alu-alu
+alu-aluan
+alun-alun
+alur-alur
+ambah-ambah
+ambai-ambai
+ambil-mengambil
+ambring-ambringan
+ambu-ambu
+ambung-ambung
+amin-amin
+ampai-ampai
+amung-amung
+anai-anai
+anak-anak
+anak-anakan
+anak-beranak
+ancak-ancak
+ancang-ancang
+andang-andang
+angan-angan
+anggar-anggar
+angin-angin
+angin-anginan
+angkul-angkul
+angkup-angkup
+angkut-angkut
+ani-ani
+aning-aning
+anjang-anjang
+anjing-anjing
+anjung-anjung
+anjung-anjungan
+antar-antar
+ante-mortem
+anting-anting
+antung-antung
+anyam-menganyam
+apa-apa
+api-api
+apit-apit
+aprit-apritan
+arah-arah
+arak-arakan
+aram-aram
+ari-ari
+aru-aru
+asa-asaan
+asam-asaman
+asuh-asuh
+atas-mengatasi
+ati-ati
+audio-visual
+avant-garde
+awang-awang
+awang-gemawang
+ayak-ayak
+ayam-ayam
+ayam-ayaman
+ayang-ayang
+ayeng-ayengan
+ayun-temayun
+back-up
+bahu-membahu
+baik-baik
+bajang-bajang
+baji-baji
+balai-balai
+balam-balam
+balas-membalas
+baling-baling
+balut-balut
+bangun-bangun
+bantal-bantal
+barat-barat
+barau-barau
+bari-bari
+barung-barung
+basa-basi
+bata-bata
+batir-batir
+bau-bauan
+bayang-bayang
+bedil-bedal
+begana-begini
+bekal-bekalan
+belat-belit
+belu-belai
+benggal-benggil
+bengkal-bengkil
+bengkang-bengkok
+bengkang-bengkong
+berabad-abad
+berabun-rabun
+berada-ada
+beragah-agah
+beragak-agak
+beragam-ragam
+beraja-raja
+berakit-rakit
+beraku-akuan
+beralun-alun
+beramah-ramahan
+beramah-tamah
+beramai-ramai
+berambai-ambai
+berambal-ambalan
+beramuk-amukan
+berandai-andai
+berandai-randai
+berang-berang
+berangan-angan
+beranggap-anggapan
+berangguk-angguk
+berangin-angin
+berangka-angka
+berangka-angkaan
+berangkai-rangkai
+beranja-anja
+berantai-rantai
+berapi-api
+berapung-apung
+berarak-arakan
+beras-beras
+berasing-asingan
+beratus-ratus
+berawas-awas
+berayal-ayalan
+berayun-ayun
+berbagai-bagai
+berbahas-bahasan
+berbalas-balasan
+berbalik-balik
+berbanjar-banjar
+berbantah-bantah
+berbanyak-banyak
+berbarik-barik
+berbasah-basah
+berbatu-batu
+berbayang-bayang
+berbecak-becak
+berbedil-bedilan
+berbeka-beka
+berbelakang-belakangan
+berbelang-belang
+berbeli-belian
+berbelit-belit
+berbelok-belok
+berbenar-benar
+berbencah-bencah
+berbesar-besar
+berbidai-bidai
+berbiku-biku
+berbilik-bilik
+berbinar-binar
+berbincang-bincang
+berbingkah-bingkah
+berbintang-bintang
+berbintik-bintik
+berbintil-bintil
+berbisik-bisik
+berbolak-balik
+berbolong-bolong
+berbondong-bondong
+berbongkah-bongkah
+berbuai-buai
+berbual-bual
+berbukit-bukit
+berbulan-bulan
+berbunga-bunga
+berbuntut-buntut
+berbunuh-bunuhan
+berburu-buru
+berburuk-buruk
+berbutir-butir
+bercabang-cabang
+bercaci-cacian
+bercakap-cakap
+bercakar-cakaran
+bercantik-cantik
+bercari-cari
+bercari-carian
+bercarik-carik
+bercepat-cepat
+bercerai-berai
+bercerai-cerai
+bercetai-cetai
+bercikun-cikun
+bercinta-cintaan
+bercita-cita
+berciut-ciut
+berconteng-conteng
+bercoreng-coreng
+bercoreng-moreng
+bercuit-cuit
+bercumbu-cumbu
+bercumbu-cumbuan
+bercura-bura
+bercura-cura
+berdada-dadaan
+berdahulu-dahuluan
+berdalam-dalam
+berdebar-debar
+berdecap-decap
+berdedai-dedai
+berdegap-degap
+berdegar-degar
+berdeham-deham
+berdekah-dekah
+berdekat-dekat
+berdelat-delat
+berdembun-dembun
+berdempang-dempang
+berdendam-dendaman
+berdengkang-dengkang
+berdentang-dentang
+berdentum-dentum
+berdentung-dentung
+berdepak-depak
+berdepan-depan
+berderai-derai
+berderak-derak
+berderau-derau
+berdering-dering
+berderung-derung
+berdesak-desakan
+berdesing-desing
+berdesus-desus
+berdikit-dikit
+berdingkit-dingkit
+berdua-dua
+berduri-duri
+berduru-duru
+berduyun-duyun
+berebut-rebut
+berebut-rebutan
+beregang-regang
+berek-berek
+berembut-rembut
+berempat-empat
+berenak-enak
+berenteng-renteng
+beresah-resah
+berfoya-foya
+bergagah-gagahan
+bergagap-gagap
+bergalur-galur
+berganda-ganda
+berganti-ganti
+bergarah-garah
+bergaruk-garuk
+bergegas-gegas
+bergelang-gelang
+bergelap-gelap
+bergelas-gelasan
+bergeleng-geleng
+bergemal-gemal
+bergembut-gembut
+bergerek-gerek
+bergesa-gesa
+bergilir-gilir
+bergolek-golek
+bergores-gores
+bergotong-royong
+bergugus-gugus
+bergulung-gulung
+bergulut-gulut
+bergumpal-gumpal
+bergunung-gunung
+berhadap-hadapan
+berhamun-hamun
+berhandai-handai
+berhanyut-hanyut
+berhari-hari
+berhati-hati
+berhilau-hilau
+berhujan-hujan
+beria-ia
+beria-ria
+beriak-riak
+beribu-ribu
+berigi-rigi
+bering-bering
+beringat-ingat
+beringgit-ringgit
+berintik-rintik
+beriring-iring
+beriring-iringan
+berjabir-jabir
+berjaga-jaga
+berjagung-jagung
+berjalan-jalan
+berjalar-jalar
+berjalin-jalin
+berjalur-jalur
+berjam-jam
+berjauh-jauhan
+berjejal-jejal
+berjela-jela
+berjenis-jenis
+berjenjang-jenjang
+berjilid-jilid
+berjinak-jinak
+berjingkat-jingkat
+berjingkrak-jingkrak
+berjongkok-jongkok
+berjubel-jubel
+berjujut-jujutan
+berjulai-julai
+berjumbai-jumbai
+berjurai-jurai
+berjurus-jurus
+berjuta-juta
+berkaca-kaca
+berkait-kaitan
+berkala-kala
+berkali-kali
+berkanjar-kanjar
+berkaok-kaok
+berkarung-karung
+berkasih-kasihan
+berkata-kata
+berkatak-katak
+berkecai-kecai
+berkecek-kecek
+berkecil-kecil
+berkecil-kecilan
+berkedip-kedip
+berkejang-kejang
+berkejap-kejap
+berkejar-kejaran
+berkelar-kelar
+berkelip-kelip
+berkelit-kelit
+berkelok-kelok
+berkelompok-kelompok
+berkelun-kelun
+berkembur-kembur
+berkempul-kempul
+berkena-kenaan
+berkenal-kenalan
+berkendur-kendur
+berkeok-keok
+berkepak-kepak
+berkepal-kepal
+berkeping-keping
+berkepul-kepul
+berkeras-kerasan
+berkeritik-keritik
+berkeruit-keruit
+berkerut-kerut
+berketak-ketak
+berketak-ketik
+berketi-keti
+berketil-ketil
+berketuk-ketak
+berketul-ketul
+berkial-kial
+berkian-kian
+berkias-kiasan
+berkibar-kibar
+berkilah-kilah
+berkilat-kilat
+berkilau-kilauan
+berkilo-kilo
+berkinja-kinja
+berkipas-kipas
+berkira-kira
+berkirim-kiriman
+berkobar-kobar
+berkobok-kobok
+berkocak-kocak
+berkodi-kodi
+berkolek-kolek
+berkopah-kopah
+berkotak-kotak
+berkuat-kuatan
+berkunang-kunang
+berkurun-kurun
+berkusau-kusau
+berkusu-kusu
+berkusut-kusut
+berkuting-kuting
+berkutu-kutuan
+berlabun-labun
+berlain-lainan
+berlalai-lalai
+berlama-lama
+berlambai-lambai
+berlambak-lambak
+berlampang-lampang
+berlapang-lapang
+berlapis-lapis
+berlapuk-lapuk
+berlarah-larah
+berlarat-larat
+berlari-larian
+berlarik-larik
+berlarut-larut
+berlawak-lawak
+berlayap-layapan
+berlebih-lebih
+berlebih-lebihan
+berlekas-lekas
+berlena-lena
+berlengah-lengah
+berlenggek-lenggek
+berlenggok-lenggok
+berleret-leret
+berliang-liuk
+berliku-liku
+berlimpah-limpah
+berlimpap-limpap
+berlimpit-limpit
+berlinang-linang
+berlindak-lindak
+berlipat-lipat
+berlompok-lompok
+berloncat-loncatan
+berlopak-lopak
+berlubang-lubang
+bermaaf-maafan
+bermacam-macam
+bermain-main
+bermalas-malas
+bermanik-manik
+bermanis-manis
+bermanja-manja
+bermasak-masak
+bermati-mati
+bermegah-megah
+bermemek-memek
+bermesra-mesraan
+bermewah-mewah
+berminggu-minggu
+berminta-minta
+bermuda-muda
+bermudah-mudah
+bermuka-muka
+bermula-mula
+bermulut-mulut
+bernafsi-nafsi
+bernaka-naka
+berniat-niat
+berogak-ogak
+beroleng-oleng
+berolok-olok
+beromong-omong
+beronggok-onggok
+berorang-orang
+beroyal-royal
+berpada-pada
+berpahit-pahit
+berpair-pair
+berpal-pal
+berpalu-palu
+berpalu-paluan
+berpalun-palun
+berpandai-pandai
+berpandang-pandangan
+berpangkat-pangkat
+berpanjang-panjang
+berpasang-pasang
+berpasang-pasangan
+berpayah-payah
+berpeluh-peluh
+berpeluk-pelukan
+berpenat-penat
+berpencar-pencar
+berpendar-pendar
+berpenggal-penggal
+berperai-perai
+berpesai-pesai
+berpesta-pesta
+berpesuk-pesuk
+berpetak-petak
+berpeti-peti
+berpihak-pihak
+berpijar-pijar
+berpikul-pikul
+berpilih-pilih
+berpilin-pilin
+berpindah-pindah
+berpintal-pintal
+berpirau-pirau
+berpisah-pisah
+berpolah-polah
+berpongah-pongah
+berpontang-panting
+berporah-porah
+berpotong-potong
+berpuak-puak
+berpual-pual
+berpugak-pugak
+berpuluh-puluh
+berpulun-pulun
+berpuntal-puntal
+berpura-pura
+berpusar-pusar
+berpusing-pusing
+berpusu-pusu
+berputar-putar
+bersaf-saf
+bersahut-sahutan
+bersakit-sakit
+bersalah-salahan
+bersalam-salaman
+bersalin-salin
+bersama-sama
+bersambut-sambutan
+bersampan-sampan
+bersantai-santai
+bersapa-sapaan
+bersarang-sarang
+bersedan-sedan
+bersedia-sedia
+bersedu-sedu
+bersekat-sekat
+berselang-selang
+berselang-seli
+bersembur-semburan
+bersempit-sempit
+bersenang-senang
+bersenang-senangkan
+bersenda-senda
+bersendi-sendi
+bersepah-sepah
+bersepi-sepi
+berserak-serak
+berseri-seri
+bersesak-sesak
+bersetai-setai
+bersia-sia
+bersiap-siap
+bersiar-siar
+bersilir-silir
+bersimbur-simburan
+bersinau-sinau
+bersorak-sorai
+bersuap-suapan
+bersudah-sudah
+bersuka-suka
+bersuka-sukaan
+bersuku-suku
+bersumpah-sumpahan
+bersungguh-sungguh
+bersungut-sungut
+bersunyi-sunyi
+bersusah-susah
+bersusuk-susuk
+bersusuk-susukan
+bersutan-sutan
+bertabur-tabur
+bertahu-tahu
+bertahun-tahun
+bertajuk-tajuk
+bertakik-takik
+bertala-tala
+bertali-tali
+bertalu-talu
+bertambah-tambah
+bertanda-tandaan
+bertangis-tangisan
+bertangkil-tangkil
+bertanya-tanya
+bertarik-tarikan
+bertatai-tatai
+bertatih-tatih
+bertawan-tawan
+bertawar-tawaran
+bertebu-tebu
+bertebu-tebukan
+berteguh-teguh
+berteguh-teguhan
+berteka-teki
+bertelau-telau
+bertele-tele
+bertempat-tempat
+bertempuh-tempuh
+bertenang-tenang
+bertenggang-tenggangan
+bertentu-tentu
+bertepek-tepek
+berterang-terang
+berterang-terangan
+bertikam-tikaman
+bertimbal-timbalan
+bertimbun-timbun
+bertimpa-timpa
+bertimpas-timpas
+bertingkah-tingkah
+bertingkat-tingkat
+bertinjau-tinjauan
+bertiras-tiras
+bertitar-titar
+bertoboh-toboh
+bertolak-tolak
+bertolak-tolakan
+bertolong-tolongan
+bertonjol-tonjol
+bertua-tua
+bertua-tuaan
+bertual-tual
+bertubi-tubi
+bertukar-tukar
+bertukar-tukaran
+bertukas-tukas
+bertumpak-tumpak
+bertunda-tunda
+bertunjuk-tunjukan
+bertura-tura
+berturut-turut
+bertutur-tutur
+beruas-ruas
+berubah-ubah
+berulang-alik
+berulang-ulang
+berumbai-rumbai
+berundung-undung
+berunggas-runggas
+berungkur-ungkuran
+beruntai-untai
+beruntun-runtun
+berunyai-unyai
+berupa-rupa
+berura-ura
+beruris-uris
+berurut-urutan
+berwarna-warna
+berwarna-warni
+berwindu-windu
+berwiru-wiru
+beryang-yang
+besar-besaran
+betak-betak
+beti-beti
+betul-betul
+biang-biang
+biar-biar
+biji-bijian
+bila-bila
+bilang-bilang
+bincang-bincut
+bini-binian
+biri-biri
+biru-biru
+bisik-bisik
+biti-biti
+bolak-balik
+bolang-baling
+bongkar-bangkir
+buah-buahan
+buat-buatan
+buaya-buaya
+bubun-bubun
+bugi-bugi
+built-in
+bukan-bukan
+bulan-bulan
+bulan-bulanan
+bulang-bulang
+bulat-bulat
+buli-buli
+bulu-bulu
+buluh-buluh
+bulus-bulus
+bunga-bungaan
+bunuh-membunuh
+bunyi-bunyian
+buru-buru
+burung-burungan
+bye-bye
+cabik-cabik
+caing-caing
+calar-balar
+cara-cara
+carut-marut
+cawi-cawi
+cebar-cebur
+celam-celum
+celangak-celinguk
+celas-celus
+celedang-celedok
+celengkak-celengkok
+cemas-cemas
+centang-perenang
+cepat-cepat
+cerai-berai
+ceruk-menceruk
+ceruk-meruk
+check-up
+chit-chat
+cirit-birit
+cita-cita
+close-up
+closed-circuit
+cobak-cabik
+cobar-cabir
+cola-cala
+compang-camping
+congak-cangit
+congkah-cangkih
+congkah-mangkih
+copak-capik
+corak-carik
+corat-coret
+coreng-moreng
+cuang-caing
+cubung-cubung
+culik-culik
+cuma-cuma
+cumi-cumi
+cungap-cangip
+cupu-cupu
+dahulu-mendahului
+dali-dali
+dapur-dapur
+dari-dari
+daru-daru
+datang-datang
+datang-mendatangi
+daun-daunan
+dawai-dawai
+dayang-dayang
+degap-degap
+dekak-dekak
+dekat-dekat
+dengar-dengaran
+desas-desus
+diam-diam
+do-it-yourself
+dokok-dokok
+dolak-dalik
+dorong-mendorong
+drive-in
+dua-dua
+dua-duanya
+duduk-duduk
+dulang-dulang
+ecek-ecek
+embuh-embuhan
+empek-empek
+empok-empok
+encal-encal
+endap-endap
+endut-endutan
+engah-engah
+enggan-enggan
+engkah-engkah
+entah-berentah
+erang-erot
+erong-erong
+fast-food
+fifty-fifty
+flip-flop
+follow-up
+foya-foya
+gaba-gaba
+gabai-gabai
+gada-gada
+gading-gading
+gado-gado
+gajah-gajahan
+gala-gala
+gali-galian
+galing-galing
+galu-galu
+gamit-gamitan
+gampang-gampangan
+ganal-ganal
+ganda-berganda
+gapah-gopoh
+gara-gara
+garah-garah
+gatal-gatal
+gawar-gawar
+gaya-gayanya
+gedebak-gedebuk
+gelang-gelang
+gelembung-gelembungan
+geli-geli
+geliang-geliut
+geliat-geliut
+gempul-gempul
+gendang-gendang
+genjang-genjot
+gerabak-gerubuk
+gerak-gerik
+gerbas-gerbus
+gerit-gerit
+geruh-gerah
+getak-getuk
+geti-geti
+gila-gila
+gila-gilaan
+gilang-gemilang
+gilap-gemilap
+gili-gili
+giling-giling
+ginang-ginang
+girik-girik
+giring-giring
+go-kart
+golak-galik
+gonta-ganti
+gotong-royong
+gual-gail
+gudu-gudu
+gula-gula
+gulang-gulang
+guna-guna
+guntang-guntang
+gunung-ganang
+gunung-gemunung
+gunung-gunungan
+habis-habis
+habis-habisan
+halai-balai
+half-time
+hampir-hampir
+harap-harapan
+harum-haruman
+hati-hati
+heavy-duty
+hebat-hebatan
+hidup-hidup
+hiru-biru
+hiruk-pikuk
+hubaya-hubaya
+hula-hula
+huru-hara
+ibar-ibar
+icak-icak
+igau-igauan
+ikut-ikut
+ikut-ikutan
+ilam-ilam
+imbang-imbangan
+inang-inang
+inca-binca
+incang-incut
+ingat-ingat
+ingat-ingatan
+ingau-ingauan
+inggang-inggung
+injak-injak
+iras-iras
+iring-iringan
+iseng-iseng
+jadi-jadian
+jala-jala
+jamah-jamahan
+jambu-jambu
+jangan-jangan
+jarang-jarang
+jari-jari
+jaring-jaring
+jarum-jarum
+jauh-jauh
+jawi-jawi
+jebat-jebatan
+jelur-jelir
+jendal-jendul
+jenggar-jenggur
+jentik-jentik
+jerah-jerih
+jolong-jolong
+jongkar-jangkir
+juak-juak
+juang-juang
+julung-julung
+jurai-jurai
+kabu-kabu
+kacang-kacang
+kacang-kacangan
+kacau-balau
+kadang-kadang
+kail-kail
+kait-kait
+kakek-kakek
+kalau-kalau
+kaleng-kalengan
+kalut-malut
+kambing-kambing
+kanak-kanak
+kapa-kapa
+kapan-kapan
+kapu-kapu
+karang-karangan
+karang-mengarang
+kareseh-peseh
+karut-marut
+katang-katang
+kawa-kawa
+kayu-kayuan
+keabu-abuan
+keasyik-asyikan
+kebarat-baratan
+kebasah-basahan
+kebat-kebit
+kebata-bataan
+kebelanda-belandaan
+kebiru-biruan
+kebudak-budakan
+kecil-kecilan
+kecil-mengecil
+kecuh-kecah
+kedek-kedek
+kegadis-gadisan
+kegelap-gelapan
+kegila-gilaan
+kegirang-girangan
+kehijau-hijauan
+kehitam-hitaman
+kejaga-jagaan
+kejingga-jinggaan
+kekabur-kaburan
+kekanak-kanakan
+kekoboi-koboian
+kekuning-kuningan
+kelak-kelik
+kelak-keluk
+kelaki-lakian
+kelang-kelok
+kelap-kelip
+kelek-kelek
+kelek-kelekan
+kelik-kelik
+kelip-kelip
+kelusuh-kelasah
+kelut-melut
+kemak-kemik
+kemalu-maluan
+kemanja-manjaan
+kemarah-marahan
+kemasam-masaman
+kemati-matian
+kemerah-merahan
+kempang-kempis
+kempas-kempis
+kemuda-mudaan
+kena-mengena
+kenal-mengenal
+kenang-kenangan
+kencang-kencung
+kendang-kendang
+kendang-kendangan
+kentung-kentung
+kenyat-kenyit
+kepandir-pandiran
+kepang-kepot
+keperak-perakan
+kepilu-piluan
+kepura-puraan
+keputih-putihan
+kerah-kerahan
+kerancak-rancakan
+kerang-kerangan
+kerang-keroh
+kerang-kerung
+kerap-kerap
+keras-mengerasi
+kercap-kercip
+kercap-kercup
+keriang-keriut
+kernyat-kernyut
+kerong-kerong
+keropas-kerapis
+kertak-kertuk
+keruntang-pungkang
+kesap-kesip
+kesenak-senakan
+kesewenang-wenangan
+kesia-siaan
+kesik-kesik
+kesipu-sipuan
+kesu-kesi
+kesuh-kesih
+kesuk-kesik
+ketergesa-gesaan
+keti-keti
+ketidur-tiduran
+ketiga-tiganya
+ketua-tuaan
+ketuan-tuanan
+keungu-unguan
+kia-kia
+kiak-kiak
+kial-kial
+kiang-kiut
+kibang-kibut
+kicang-kecoh
+kicang-kicu
+kida-kida
+kilau-mengilau
+kili-kili
+kira-kira
+kira-kiraan
+kisi-kisi
+kocah-kacih
+kodok-kodok
+kolang-kaling
+koleh-koleh
+kolong-kolong
+koma-koma
+komat-kamit
+kontal-kantil
+kontang-kanting
+kosak-kasik
+kotak-katik
+kotak-kotak
+kuat-kuat
+kucar-kacir
+kucing-kucing
+kucing-kucingan
+kuda-kuda
+kuda-kudaan
+kudap-kudap
+kulah-kulah
+kulak-kulak
+kulik-kulik
+kulum-kulum
+kumat-kamit
+kunang-kunang
+kupat-kapit
+kupu-kupu
+kura-kura
+kurang-kurang
+kusat-mesat
+kutat-kutet
+kuti-kuti
+labi-labi
+labu-labu
+lagi-lagi
+laguh-lagah
+laki-laki
+lalu-lalang
+lama-kelamaan
+lama-lama
+lamat-lamat
+lambat-lambat
+lancar-lancar
+langak-longok
+langit-langit
+lanja-lanjaan
+lapat-lapat
+large-scale
+lari-lari
+lauk-pauk
+lawah-lawah
+lawak-lawak
+lawi-lawi
+layang-layang
+layu-layuan
+lebih-lebih
+legak-legok
+lekak-lekuk
+lekap-lekup
+lekas-lekas
+lekuh-lekih
+lekup-lekap
+lenggak-lenggok
+lenggok-lenggok
+lengket-lengket
+lentam-lentum
+lentang-lentok
+lentang-lentung
+lepa-lepa
+lerang-lerang
+lereng-lereng
+letah-letai
+letup-letup
+liang-liuk
+lidah-lidah
+line-up
+liuk-liuk
+liung-liung
+lobi-lobi
+lock-up
+lopak-lapik
+lopak-lopak
+lumba-lumba
+lumi-lumi
+luntang-lantung
+lupa-lupa
+lupa-lupaan
+main-mainan
+makan-makanan
+make-up
+malai-malai
+malam-malam
+malar-malar
+mali-mali
+malu-malu
+mana-mana
+manik-manik
+manis-manisan
+mark-up
+masing-masing
+mata-mata
+mati-matian
+maya-maya
+megap-megap
+megrek-megrek
+melak-melak
+melambai-lambai
+melambai-lambaikan
+melambat-lambatkan
+melaun-laun
+melawak-lawak
+melayap-layap
+melayap-layapkan
+melebih-lebihi
+melebih-lebihkan
+melejang-lejangkan
+melengah-lengah
+melihat-lihat
+melimpah-limpah
+melincah-lincah
+meloncat-loncat
+melonco-lonco
+melonjak-lonjak
+memacak-macak
+memaki-maki
+memaksa-maksa
+memandai-mandai
+memanggil-manggil
+memanis-manis
+memanjut-manjut
+memasak-masak
+memata-matai
+mematah-matah
+mematut-matut
+memayah-mayahkan
+membagi-bagikan
+membalik-balik
+membangkit-bangkit
+membayang-bayangi
+membayang-bayangkan
+membelai-belai
+membenar-benar
+membenar-benari
+memberai-beraikan
+membesar-besarkan
+membolak-balikkan
+membuang-buang
+membuat-buat
+membunga-bungai
+memburu-buru
+memburu-burukan
+memburuk-burukkan
+memencak-mencak
+memencar-mencar
+memetak-metak
+memetang-metangkan
+memetir-metir
+memikir-mikirkan
+memilih-milih
+meminang-minang
+meminta-minta
+memisah-misahkan
+memontang-mantingkan
+memperamat-amat
+memperamat-amatkan
+memperbagai-bagaikan
+memperganda-gandakan
+memperganduh-ganduhkan
+mempermacam-macamkan
+memperolok-olokkan
+mempersama-samakan
+mempertubi-tubi
+mempertubi-tubikan
+memperturut-turutkan
+memuja-muja
+memukang-mukang
+memulun-mulun
+memundi-mundi
+memundi-mundikan
+memuyu-muyu
+menagak-nagak
+menakut-nakuti
+menanjur-nanjur
+menanti-nanti
+menari-nari
+mencabik-cabik
+mencabik-cabikkan
+mencaing-caing
+mencak-mencak
+mencakup-cakup
+mencapak-capak
+mencari-cari
+mencarik-carik
+mencarut-carut
+mencengis-cengis
+mencepak-cepak
+mencepuk-cepuk
+mencerai-beraikan
+mencetai-cetai
+menciap-ciap
+menciar-ciar
+mencita-citakan
+menciut-ciut
+mencoang-coang
+mencubit-cubit
+mencuri-curi
+mendecap-decap
+mendengking-dengking
+menderak-derakkan
+menderau-derau
+menderu-deru
+mendesas-desuskan
+mendesus-desus
+mendewa-dewakan
+mendudu-dudu
+menebu-nebu
+menegur-neguri
+mengabung-ngabung
+mengaci-acikan
+mengada-ada
+mengaduk-aduk
+mengagak-agak
+mengagak-agihkan
+mengagut-agut
+mengais-ngais
+mengali-ali
+mengalur-alur
+mengamang-amang
+mengamat-amati
+mengambai-ambaikan
+mengambang-ambang
+mengancak-ancak
+mengangan-angankan
+mengangguk-angguk
+mengangin-anginkan
+mengangkat-angkat
+mengap-mengap
+mengapa-apai
+mengapi-apikan
+mengarah-arahi
+mengata-ngatai
+mengaum-aumkan
+mengejan-ejan
+mengelai-ngelai
+mengelepik-ngelepik
+mengelus-elus
+mengembut-embut
+mengenap-enapkan
+mengenjak-enjak
+mengepak-ngepak
+mengepak-ngepakkan
+menggaba-gabai
+menggalur-galur
+menggamak-gamak
+menggapai-gapai
+menggapai-gapaikan
+menggelepar-gelepar
+menggelepar-geleparkan
+menggemak-gemak
+menggerecak-gerecak
+menggesa-gesakan
+menggili-gili
+menggorek-gorek
+menggosok-gosok
+mengguit-guit
+menghalai-balaikan
+menghinap-hinap
+mengiang-ngiang
+mengibas-ngibas
+mengidam-idamkan
+mengilah-ngilahkan
+mengilai-ilai
+mengilat-ngilatkan
+mengilik-ngilik
+mengimak-imak
+mengiming-iming
+menginjak-injak
+mengipas-ngipas
+mengira-ngira
+mengira-ngirakan
+mengiras-iras
+mengiras-irasi
+mengitar-ngitar
+mengitik-ngitik
+mengogok-ogok
+mengolak-alikkan
+mengoleng-oleng
+mengongkang-ongkang
+mengongkok-ongkok
+mengonyah-anyih
+mengotak-ngatikkan
+mengoyak-ngoyakkan
+mengoyak-oyak
+menguar-nguarkan
+menguar-uarkan
+menguber-uber
+mengubit-ubit
+mengubrak-abrik
+mengucar-ngacirkan
+mengucek-ngucek
+menguik-uik
+menguis-uis
+mengulit-ulit
+menguman-uman
+mengumbang-ambingkan
+mengumpak-umpak
+mengungkat-ungkat
+mengungkit-ungkit
+mengurik-urik
+mengutak-ngatikkan
+mengutik-ngutik
+menimang-nimang
+meningkat-ningkat
+meniru-niru
+meniup-niup
+menjadi-jadi
+menjengek-jengek
+menjengit-jengit
+menjilat-jilat
+mentah-mentah
+mentang-mentang
+menunda-nunda
+menusuk-nusuk
+menyama-nyama
+menyambar-nyambar
+menyanjung-nyanjung
+menyapu-nyapu
+menyarat-nyarat
+menyendi-nyendi
+menyeret-nyeret
+menyeru-nyerukan
+menyia-nyiakan
+menyungguh-nyungguhi
+meraba-raba
+merangkak-rangkak
+merasa-rasai
+meraung-raung
+meraung-raungkan
+merayau-rayau
+merayu-rayu
+mereka-reka
+merelap-relap
+meremah-remah
+meremeh-temehkan
+merempah-rempahi
+merengek-rengek
+merenik-renik
+merenta-renta
+merenyai-renyai
+merintang-rintang
+merintik-rintik
+merobek-robek
+meronta-ronta
+merungus-rungus
+merungut-rungut
+mewarna-warnikan
+meyakin-yakini
+miju-miju
+minta-minta
+moga-moga
+morat-marit
+muda-mudi
+mudah-mudahan
+muka-muka
+mula-mula
+muluk-muluk
+naga-naga
+nanti-nantian
+nasi-nasi
+nasib-nasiban
+nenek-nenek
+nyolong-nyolong
+ogah-ogahan
+ogak-ogak
+olak-alik
+olak-olak
+olang-aling
+olang-alingan
+oleh-oleh
+olok-olok
+olok-olokan
+olong-olong
+on-screen
+onde-onde
+one-to-one
+oneng-oneng
+ongkang-ongkang
+ongol-ongol
+onyah-anyih
+orak-arik
+orang-aring
+orang-orangan
+orok-orok
+orong-orong
+otak-otak
+otak-otakan
+padi-padian
+pagi-pagi
+palas-palas
+paling-paling
+palu-memalu
+panas-panas
+pandang-memandang
+panji-panji
+para-para
+paru-paru
+pasang-memasang
+pasu-pasu
+paya-paya
+pecah-pecah
+pelan-pelan
+pengundang-undang
+perang-perangan
+perintang-rintang
+perlahan-lahan
+perlip-perlipan
+pertama-tama
+perundang-undangan
+pesan-pesan
+piat-piut
+pick-up
+pijak-pijak
+pijar-pijar
+pijat-pijat
+pina-pina
+pisang-pisang
+play-off
+pohon-pohonan
+pokrol-pokrolan
+polang-paling
+poma-poma
+pontang-panting
+porak-parik
+porak-peranda
+potong-memotong
+puji-pujian
+pukang-pukang
+pukul-memukul
+pulang-pergi
+pulut-pulut
+pundi-pundi
+punggung-memunggung
+pura-pura
+pusar-pusar
+push-up
+pusing-pusing
+putus-putus
+rada-rada
+radio-frequency
+ragu-ragu
+rama-rama
+rambu-rambu
+rango-rango
+rasa-rasanya
+rata-rata
+real-time
+rebah-rebah
+rebah-rebahan
+redam-redam
+reka-reka
+reka-rekaan
+remah-remah
+remang-remang
+rembah-rembih
+remeh-temeh
+rempah-rempah
+repuh-repuh
+riang-riang
+ribu-ribu
+rigi-rigi
+robak-rabik
+robat-rabit
+role-play
+roll-on
+rombang-rambing
+ruak-ruak
+ruku-ruku
+rumah-rumah
+rumah-rumahan
+rumput-rumputan
+runding-merunding
+runggu-rangga
+runner-up
+rupa-rupa
+rupa-rupanya
+saban-saban
+sabung-menyabung
+saing-menyaing
+salah-salah
+sama-sama
+samar-samar
+sambar-menyambar
+sambung-bersambung
+sambung-menyambung
+sambut-menyambut
+sampai-sampai
+sandar-menyandar
+sangat-sangat
+sangkut-menyangkut
+sapa-menyapa
+sapu-sapu
+sarit-sarit
+satu-satu
+satu-satunya
+sayup-menyayup
+sayup-sayup
+sayur-mayur
+sayur-sayuran
+sci-fi
+seakal-akal
+seakan-akan
+sealak-alak
+sebaik-baiknya
+sebelah-menyebelah
+sebentar-sebentar
+seberang-menyeberang
+seboleh-bolehnya
+sedalam-dalamnya
+sedang-menyedang
+sedap-sedapan
+sedapat-dapatnya
+sedikit-dikitnya
+sedikit-sedikit
+sedikit-sedikitnya
+seelok-eloknya
+segala-galanya
+segan-menyegan
+segan-menyegani
+segan-segan
+sehari-hari
+sehari-harian
+sejadi-jadinya
+sekali-kali
+sekali-sekali
+sekira-kira
+sekonyong-konyong
+sekuasa-kuasanya
+sekurang-kurangnya
+sela-menyela
+sela-sela
+selama-lamanya
+selambat-lambatnya
+selang-seli
+selang-seling
+selar-belar
+selat-latnya
+selekas-lekasnya
+selepas-lepas
+self-esteem
+self-help
+sema-sema
+semah-semah
+semak-semak
+semalam-malaman
+semasa-masa
+semata-mata
+sembunyi-sembunyi
+sembunyi-sembunyian
+semena-mena
+semenda-menyemenda
+semengga-mengga
+sementang-mentang
+semu-semu
+semut-semutan
+sengal-sengal
+sengau-sengauan
+seolah-olah
+sepala-pala
+sepandai-pandai
+sepetang-petangan
+sepoi-sepoi
+sepuas-puasnya
+serang-menyerang
+seraya-menyeraya
+serba-serbi
+serbah-serbih
+serembah-serembih
+sering-sering
+serta-menyertai
+serta-serta
+sesal-menyesali
+sesudah-sudah
+sesudah-sudahnya
+sesuka-suka
+setempat-setempat
+setengah-setengah
+setidak-tidaknya
+seupaya-upaya
+seupaya-upayanya
+sewaktu-waktu
+sewenang-wenang
+short-term
+sia-sia
+siang-siang
+siapa-siapa
+sibar-sibar
+sibur-sibur
+sida-sida
+siku-siku
+silah-silah
+silang-menyilang
+silir-semilir
+sinar-seminar
+sindir-menyindir
+singgah-menyinggah
+sorak-sorai
+stand-by
+stand-up
+sudu-sudu
+sudung-sudung
+suka-suka
+sulang-menyulang
+sulur-suluran
+sumpah-sumpah
+sumpit-sumpit
+sungguh-sungguh
+sungut-sungut
+suram-suram
+surat-menyurat
+suruh-suruhan
+tabar-tabar
+tabir-mabir
+tabrak-tubruk
+tabuh-tabuhan
+tahu-menahu
+tahu-tahu
+takang-takik
+take-off
+takut-takut
+takut-takutan
+tali-bertali
+tali-tali
+tampak-tampak
+tanam-menanam
+tanam-tanaman
+tanda-tanda
+tangan-menangan
+tangan-tangan
+tanggung-menanggung
+tapa-tapa
+tapak-tapak
+tari-menari
+tari-tarian
+tarik-menarik
+tatah-tatah
+tawak-tawak
+tawang-tawang
+tawar-menawar
+tawar-tawar
+tayum-temayum
+tebu-tebu
+tegak-tegak
+teka-teki
+temas-temas
+tembak-menembak
+temut-temut
+tenggang-menenggang
+teraba-raba
+terambang-ambang
+terang-terang
+terang-terangan
+teranggar-anggar
+terangguk-angguk
+teranggul-anggul
+terangin-angin
+terangkup-angkup
+teranja-anja
+terapung-apung
+terayan-rayan
+terayap-rayap
+terbada-bada
+terbahak-bahak
+terbata-bata
+terbatuk-batuk
+terbayang-bayang
+terbengkil-bengkil
+terbirit-birit
+terbuai-buai
+terbuang-buang
+terburu-buru
+tercangak-cangak
+tercengang-cengang
+tercilap-cilap
+tercongget-congget
+tercungap-cungap
+terdangka-dangka
+terdengih-dengih
+terekeh-ekeh
+terembut-embut
+terembut-rembut
+terengah-engah
+teresak-esak
+tergagap-gagap
+tergagau-gagau
+tergaguk-gaguk
+tergapai-gapai
+tergegap-gegap
+tergegas-gegas
+tergelung-gelung
+tergerenyeng-gerenyeng
+tergesa-gesa
+tergila-gila
+tergontai-gontai
+tergudik-gudik
+terguling-guling
+tergulut-gulut
+terharak-harak
+terharap-harap
+terhengit-hengit
+terhinggut-hinggut
+terigau-igau
+terincut-incut
+teringa-inga
+teringat-ingat
+terinjak-injak
+terjembak-jembak
+terjerit-jerit
+terkadang-kadang
+terkakah-kakah
+terkakak-kakak
+terkanjar-kanjar
+terkapah-kapah
+terkapai-kapai
+terkapung-kapung
+terkatah-katah
+terkatung-katung
+terkecap-kecap
+terkedek-kedek
+terkedip-kedip
+terkejar-kejar
+terkekau-kekau
+terkekeh-kekeh
+terkekek-kekek
+terkelinjat-kelinjat
+terkelip-kelip
+terkempul-kempul
+terkemut-kemut
+terkencar-kencar
+terkepak-kepak
+terkesot-kesot
+terkesut-kesut
+terkial-kial
+terkincak-kincak
+terkindap-kindap
+terkinja-kinja
+terkirai-kirai
+terkitar-kitar
+terkocoh-kocoh
+terkokol-kokol
+terkosel-kosel
+terkoteng-koteng
+terkumpal-kumpal
+terlara-lara
+terlayang-layang
+terlebih-lebih
+terlincah-lincah
+terliuk-liuk
+terlolong-lolong
+terlongong-longong
+termangu-mangu
+termanja-manja
+termata-mata
+termengah-mengah
+termimpi-mimpi
+ternanti-nanti
+terngiang-ngiang
+teroleng-oleng
+terpandang-pandang
+terpecah-pecah
+terpekik-pekik
+terpereh-pereh
+terpikau-pikau
+terpinga-pinga
+terpingkal-pingkal
+terpontang-panting
+terpusing-pusing
+terputus-putus
+tersanga-sanga
+tersaruk-saruk
+tersedan-sedan
+tersedih-sedih
+tersedu-sedu
+tersendat-sendat
+tersendeng-sendeng
+tersengal-sengal
+tersengguk-sengguk
+tersengut-sengut
+tersera-sera
+terserak-serak
+tersetai-setai
+tersia-sia
+tersipu-sipu
+tersoja-soja
+tersungkuk-sungkuk
+tertagak-tagak
+tertahan-tahan
+tertatih-tatih
+tertegun-tegun
+tertekan-tekan
+terteleng-teleng
+terumbang-ambing
+terumbang-umbang
+terungkap-ungkap
+terus-menerus
+terus-terusan
+think-tank
+tiap-tiap
+tiba-tiba
+tidak-tidak
+tidur-tidur
+tie-dye
+tiga-tiganya
+tikam-menikam
+tilik-menilik
+timah-timah
+timang-timangan
+timbang-menimbang
+timu-timu
+tindih-bertindih
+tinjau-meninjau
+tip-off
+tiru-tiruan
+tiup-tiup
+tokak-takik
+tokok-menokok
+tolak-menolak
+tolong-menolong
+top-level
+trade-in
+tua-tua
+tuan-tuan
+tuang-tuang
+tuban-tuban
+tukang-menukang
+tukar-menukar
+tulang-tulangan
+tuli-tuli
+tulis-menulis
+tumbuh-tumbuhan
+tune-up
+tunggang-tunggit
+tupai-tupai
+turun-temurun
+turut-menurut
+turut-turutan
+two-tone
+uar-uar
+ubel-ubel
+ubun-ubun
+ubur-ubur
+uci-uci
+udap-udapan
+ugal-ugalan
+uir-uir
+ujar-ujar
+ukir-mengukir
+ula-ula
+ulak-ulak
+ulang-alik
+ulang-aling
+ulang-ulang
+ulap-ulap
+ular-ular
+ular-ularan
+ulung-ulung
+umang-umang
+umbang-ambing
+umbi-umbian
+umbul-umbul
+umbut-umbut
+uncang-uncit
+undak-undakan
+undang-undang
+unduk-unduk
+undung-undung
+undur-undur
+unggat-unggit
+ungkit-ungkit
+unting-unting
+untung-untung
+untung-untungan
+upside-down
+ura-ura
+uran-uran
+urat-urat
+uring-uringan
+urup-urup
+urup-urupan
+urus-urus
+user-user
+user-useran
+utar-utar
+voice-over
+walk-out
+wangi-wangian
+wanti-wanti
+wara-wara
+warna-warni
+water-cooled
+world-class
+yang-yang
+""".split()
+)
diff --git a/spacy/lang/ms/examples.py b/spacy/lang/ms/examples.py
new file mode 100644
index 000000000..97ab19b6e
--- /dev/null
+++ b/spacy/lang/ms/examples.py
@@ -0,0 +1,17 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ms.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+ "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
+ "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",
+ "Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.",
+ "Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir",
+ "Kuala Lumpur merupakan ibu negara Malaysia." "Kau berada di mana semalam?",
+ "Siapa yang akan memimpin projek itu?",
+ "Siapa perdana menteri Malaysia sekarang?",
+]
diff --git a/spacy/lang/ms/lex_attrs.py b/spacy/lang/ms/lex_attrs.py
new file mode 100644
index 000000000..2088c9955
--- /dev/null
+++ b/spacy/lang/ms/lex_attrs.py
@@ -0,0 +1,65 @@
+import unicodedata
+
+from ...attrs import IS_CURRENCY, LIKE_NUM
+from .punctuation import LIST_CURRENCY
+
+_num_words = [
+ "kosong",
+ "satu",
+ "dua",
+ "tiga",
+ "empat",
+ "lima",
+ "enam",
+ "tujuh",
+ "lapan",
+ "sembilan",
+ "sepuluh",
+ "sebelas",
+ "belas",
+ "puluh",
+ "ratus",
+ "ribu",
+ "juta",
+ "billion",
+ "trillion",
+ "kuadrilion",
+ "kuintilion",
+ "sekstilion",
+ "septilion",
+ "oktilion",
+ "nonilion",
+ "desilion",
+]
+
+
+def like_num(text):
+ if text.startswith(("+", "-", "±", "~")):
+ text = text[1:]
+ text = text.replace(",", "").replace(".", "")
+ if text.isdigit():
+ return True
+ if text.count("/") == 1:
+ num, denom = text.split("/")
+ if num.isdigit() and denom.isdigit():
+ return True
+ if text.lower() in _num_words:
+ return True
+ if text.count("-") == 1:
+ _, num = text.split("-")
+ if num.isdigit() or num in _num_words:
+ return True
+ return False
+
+
+def is_currency(text):
+ if text in LIST_CURRENCY:
+ return True
+
+ for char in text:
+ if unicodedata.category(char) != "Sc":
+ return False
+ return True
+
+
+LEX_ATTRS = {IS_CURRENCY: is_currency, LIKE_NUM: like_num}
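The getters are plain functions, so they can be sanity-checked directly; a small sketch with expected values in the comments:

```python
from spacy.lang.ms.lex_attrs import is_currency, like_num

print(like_num("2,000"))    # True: separators are stripped before isdigit()
print(like_num("3/4"))      # True: numerator and denominator are digits
print(like_num("sepuluh"))  # True: listed number word
print(like_num("ke-10"))    # True: hyphenated form with a numeric tail
print(is_currency("RM"))    # True: listed in LIST_CURRENCY
```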
diff --git a/spacy/lang/ms/punctuation.py b/spacy/lang/ms/punctuation.py
new file mode 100644
index 000000000..a8d6c2e8e
--- /dev/null
+++ b/spacy/lang/ms/punctuation.py
@@ -0,0 +1,60 @@
+from ..char_classes import ALPHA, _currency, _units, merge_chars, split_chars
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+
+_units = (
+ _units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
+ "Hz kHz MHz GHz mAh "
+ "ratus rb ribu ribuan "
+ "juta jt jutaan mill?iar million bil[l]?iun bilyun billion "
+)
+_currency = _currency + r" USD RM MYR Rp IDR RMB SGD S\$"
+_months = (
+ "Januari Februari Mac April Mei Jun Julai Ogos September "
+ "Oktober November Disember Januari Februari Mac Mei Jun "
+ "Julai Ogos Oktober Disember Jan Feb Mac Jun Julai Ogos Sept "
+ "Okt Nov Dis"
+)
+
+
+UNITS = merge_chars(_units)
+CURRENCY = merge_chars(_currency)
+HTML_PREFIX = r"<(b|strong|i|em|p|span|div|br)\s?/>|]+)>"
+HTML_SUFFIX = r"(b|strong|i|em|p|span|div|a)>"
+MONTHS = merge_chars(_months)
+LIST_CURRENCY = split_chars(_currency)
+
+_prefixes = list(TOKENIZER_PREFIXES)
+_prefixes.remove("#") # hashtag
+_prefixes = _prefixes + LIST_CURRENCY + [HTML_PREFIX] + ["/", "—"]
+
+_suffixes = (
+ TOKENIZER_SUFFIXES
+ + [r"\-[Nn]ya", "-[KkMm]u", "[—-]"]
+ + [
+ # disabled: variable width currency variable
+ # r"(?<={c})(?:[0-9]+)".format(c=CURRENCY),
+ r"(?<=[0-9])(?:{u})".format(u=UNITS),
+ r"(?<=[0-9])%",
+ # disabled: variable width HTML_SUFFIX variable
+ # r"(?<=[0-9{a}]{h})(?:[\.,:-])".format(a=ALPHA, h=HTML_SUFFIX),
+ r"(?<=[0-9{a}])(?:{h})".format(a=ALPHA, h=HTML_SUFFIX),
+ ]
+)
+
+_infixes = TOKENIZER_INFIXES + [
+ r"(?<=[0-9])[\\/](?=[0-9%-])",
+ r"(?<=[0-9])%(?=[{a}0-9/])".format(a=ALPHA),
+ # disabled: variable width units variable
+ # r"(?<={u})[\/-](?=[0-9])".format(u=UNITS),
+ # disabled: variable width months variable
+ # r"(?<={m})[\/-](?=[0-9])".format(m=MONTHS),
+ r'(?<=[0-9)][.,])"(?=[0-9])',
+ r'(?<=[{a})][.,\'])["—](?=[{a}])'.format(a=ALPHA),
+ r"(?<=[{a}])-(?=[0-9])".format(a=ALPHA),
+ r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}])[\/-](?={c}|[{a}])".format(a=ALPHA, c=CURRENCY),
+]
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
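`merge_chars` and `split_chars` come from `spacy.lang.char_classes`; as used above, the first builds a regex alternation from a space-separated string and the second a plain list. A tiny sketch:

```python
from spacy.lang.char_classes import merge_chars, split_chars

print(merge_chars("USD RM MYR"))  # 'USD|RM|MYR' -> usable inside a regex
print(split_chars("USD RM MYR"))  # ['USD', 'RM', 'MYR'] -> usable as prefixes
```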
diff --git a/spacy/lang/ms/stop_words.py b/spacy/lang/ms/stop_words.py
new file mode 100644
index 000000000..b1bfaea79
--- /dev/null
+++ b/spacy/lang/ms/stop_words.py
@@ -0,0 +1,118 @@
+STOP_WORDS = set(
+ """
+ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
+aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
+apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
+awalnya
+
+bagai bagaikan bagaimana bagaimanakah bagaimanapun bagi bagian bahkan bahwa
+bahwasanya baik bakal bakalan balik banyak bapak baru bawah beberapa begini
+beginian beginikah beginilah begitu begitukah begitulah begitupun bekerja
+belakang belakangan belum belumlah benar benarkah benarlah berada berakhir
+berakhirlah berakhirnya berapa berapakah berapalah berapapun berarti berawal
+berbagai berdatangan beri berikan berikut berikutnya berjumlah berkali-kali
+berkata berkehendak berkeinginan berkenaan berlainan berlalu berlangsung
+berlebihan bermacam bermacam-macam bermaksud bermula bersama bersama-sama
+bersiap bersiap-siap bertanya bertanya-tanya berturut berturut-turut bertutur
+berujar berupa besar betul betulkah biasa biasanya bila bilakah bisa bisakah
+boleh bolehkah bolehlah buat bukan bukankah bukanlah bukannya bulan bung
+
+cara caranya cukup cukupkah cukuplah cuma
+
+dahulu dalam dan dapat dari daripada datang dekat demi demikian demikianlah
+dengan depan di dia diakhiri diakhirinya dialah diantara diantaranya diberi
+diberikan diberikannya dibuat dibuatnya didapat didatangkan digunakan
+diibaratkan diibaratkannya diingat diingatkan diinginkan dijawab dijelaskan
+dijelaskannya dikarenakan dikatakan dikatakannya dikerjakan diketahui
+diketahuinya dikira dilakukan dilalui dilihat dimaksud dimaksudkan
+dimaksudkannya dimaksudnya diminta dimintai dimisalkan dimulai dimulailah
+dimulainya dimungkinkan dini dipastikan diperbuat diperbuatnya dipergunakan
+diperkirakan diperlihatkan diperlukan diperlukannya dipersoalkan dipertanyakan
+dipunyai diri dirinya disampaikan disebut disebutkan disebutkannya disini
+disinilah ditambahkan ditandaskan ditanya ditanyai ditanyakan ditegaskan
+ditujukan ditunjuk ditunjuki ditunjukkan ditunjukkannya ditunjuknya dituturkan
+dituturkannya diucapkan diucapkannya diungkapkan dong dua dulu
+
+empat enggak enggaknya entah entahlah
+
+guna gunakan
+
+hal hampir hanya hanyalah hari harus haruslah harusnya hendak hendaklah
+hendaknya hingga
+
+ia ialah ibarat ibaratkan ibaratnya ibu ikut ingat ingat-ingat ingin inginkah
+inginkan ini inikah inilah itu itukah itulah
+
+jadi jadilah jadinya jangan jangankan janganlah jauh jawab jawaban jawabnya
+jelas jelaskan jelaslah jelasnya jika jikalau juga jumlah jumlahnya justru
+
+kala kalau kalaulah kalaupun kalian kami kamilah kamu kamulah kan kapan
+kapankah kapanpun karena karenanya kasus kata katakan katakanlah katanya ke
+keadaan kebetulan kecil kedua keduanya keinginan kelamaan kelihatan
+kelihatannya kelima keluar kembali kemudian kemungkinan kemungkinannya kenapa
+kepada kepadanya kesampaian keseluruhan keseluruhannya keterlaluan ketika
+khususnya kini kinilah kira kira-kira kiranya kita kitalah kok kurang
+
+lagi lagian lah lain lainnya lalu lama lamanya lanjut lanjutnya lebih lewat
+lima luar
+
+macam maka makanya makin malah malahan mampu mampukah mana manakala manalagi
+masa masalah masalahnya masih masihkah masing masing-masing mau maupun
+melainkan melakukan melalui melihat melihatnya memang memastikan memberi
+memberikan membuat memerlukan memihak meminta memintakan memisalkan memperbuat
+mempergunakan memperkirakan memperlihatkan mempersiapkan mempersoalkan
+mempertanyakan mempunyai memulai memungkinkan menaiki menambahkan menandaskan
+menanti menanti-nanti menantikan menanya menanyai menanyakan mendapat
+mendapatkan mendatang mendatangi mendatangkan menegaskan mengakhiri mengapa
+mengatakan mengatakannya mengenai mengerjakan mengetahui menggunakan
+menghendaki mengibaratkan mengibaratkannya mengingat mengingatkan menginginkan
+mengira mengucapkan mengucapkannya mengungkapkan menjadi menjawab menjelaskan
+menuju menunjuk menunjuki menunjukkan menunjuknya menurut menuturkan
+menyampaikan menyangkut menyatakan menyebutkan menyeluruh menyiapkan merasa
+mereka merekalah merupakan meski meskipun meyakini meyakinkan minta mirip
+misal misalkan misalnya mula mulai mulailah mulanya mungkin mungkinkah
+
+nah naik namun nanti nantinya nyaris nyatanya
+
+oleh olehnya
+
+pada padahal padanya pak paling panjang pantas para pasti pastilah penting
+pentingnya per percuma perlu perlukah perlunya pernah persoalan pertama
+pertama-tama pertanyaan pertanyakan pihak pihaknya pukul pula pun punya
+
+rasa rasanya rata rupanya
+
+saat saatnya saja sajalah saling sama sama-sama sambil sampai sampai-sampai
+sampaikan sana sangat sangatlah satu saya sayalah se sebab sebabnya sebagai
+sebagaimana sebagainya sebagian sebaik sebaik-baiknya sebaiknya sebaliknya
+sebanyak sebegini sebegitu sebelum sebelumnya sebenarnya seberapa sebesar
+sebetulnya sebisanya sebuah sebut sebutlah sebutnya secara secukupnya sedang
+sedangkan sedemikian sedikit sedikitnya seenaknya segala segalanya segera
+seharusnya sehingga seingat sejak sejauh sejenak sejumlah sekadar sekadarnya
+sekali sekali-kali sekalian sekaligus sekalipun sekarang sekarang sekecil
+seketika sekiranya sekitar sekitarnya sekurang-kurangnya sekurangnya sela
+selain selaku selalu selama selama-lamanya selamanya selanjutnya seluruh
+seluruhnya semacam semakin semampu semampunya semasa semasih semata semata-mata
+semaunya sementara semisal semisalnya sempat semua semuanya semula sendiri
+sendirian sendirinya seolah seolah-olah seorang sepanjang sepantasnya
+sepantasnyalah seperlunya seperti sepertinya sepihak sering seringnya serta
+serupa sesaat sesama sesampai sesegera sesekali seseorang sesuatu sesuatunya
+sesudah sesudahnya setelah setempat setengah seterusnya setiap setiba setibanya
+setidak-tidaknya setidaknya setinggi seusai sewaktu siap siapa siapakah
+siapapun sini sinilah soal soalnya suatu sudah sudahkah sudahlah supaya
+
+tadi tadinya tahu tahun tak tambah tambahnya tampak tampaknya tandas tandasnya
+tanpa tanya tanyakan tanyanya tapi tegas tegasnya telah tempat tengah tentang
+tentu tentulah tentunya tepat terakhir terasa terbanyak terdahulu terdapat
+terdiri terhadap terhadapnya teringat teringat-ingat terjadi terjadilah
+terjadinya terkira terlalu terlebih terlihat termasuk ternyata tersampaikan
+tersebut tersebutlah tertentu tertuju terus terutama tetap tetapi tiap tiba
+tiba-tiba tidak tidakkah tidaklah tiga tinggi toh tunjuk turut tutur tuturnya
+
+ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
+
+waduh wah wahai waktu waktunya walau walaupun wong
+
+yaitu yakin yakni yang
+""".split()
+)
diff --git a/spacy/lang/ms/syntax_iterators.py b/spacy/lang/ms/syntax_iterators.py
new file mode 100644
index 000000000..027798687
--- /dev/null
+++ b/spacy/lang/ms/syntax_iterators.py
@@ -0,0 +1,41 @@
+from typing import Iterator, Tuple, Union
+
+from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ """
+ Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+ """
+ # fmt: off
+ labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+ # fmt: on
+ doc = doclike.doc # Ensure works on both Doc and Span.
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+ np_deps = [doc.vocab.strings[label] for label in labels]
+ conj = doc.vocab.strings.add("conj")
+ np_label = doc.vocab.strings.add("NP")
+ prev_end = -1
+ for i, word in enumerate(doclike):
+ if word.pos not in (NOUN, PROPN, PRON):
+ continue
+ # Prevent nested chunks from being produced
+ if word.left_edge.i <= prev_end:
+ continue
+ if word.dep in np_deps:
+ prev_end = word.right_edge.i
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+ elif word.dep == conj:
+ head = word.head
+ while head.dep == conj and head.head.i < head.i:
+ head = head.head
+ # If the head is an NP, and we're coordinated to it, we're an NP
+ if head.dep in np_deps:
+ prev_end = word.right_edge.i
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
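As with the Latin iterator, the behaviour is easiest to see on a hand-annotated toy parse; the sentence and labels below are illustrative assumptions:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("ms")
# Toy UD-style parse of "Ali dan Abu makan" ("Ali and Abu eat"):
# "Abu" is a conjunct of "Ali", "dan" its coordinator.
doc = Doc(
    nlp.vocab,
    words=["Ali", "dan", "Abu", "makan"],
    pos=["PROPN", "CCONJ", "PROPN", "VERB"],
    heads=[3, 2, 0, 3],
    deps=["nsubj", "cc", "conj", "ROOT"],
)
# "Ali"'s right edge spans the whole coordination, and the nested-chunk
# guard then skips "Abu", so the coordination surfaces as one chunk.
print([c.text for c in doc.noun_chunks])  # ['Ali dan Abu']
```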
diff --git a/spacy/lang/ms/tokenizer_exceptions.py b/spacy/lang/ms/tokenizer_exceptions.py
new file mode 100644
index 000000000..e8b53fed8
--- /dev/null
+++ b/spacy/lang/ms/tokenizer_exceptions.py
@@ -0,0 +1,1532 @@
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ._tokenizer_exceptions_list import MS_BASE_EXCEPTIONS
+
+# List of abbreviations and acronyms from:
+# https://ms.wiktionary.org/wiki/Wiktionary:Senarai_akronim_dan_singkatan
+
+_exc = {}
+
+for orth in MS_BASE_EXCEPTIONS:
+ _exc[orth] = [{ORTH: orth}]
+ orth_title = orth.title()
+ _exc[orth_title] = [{ORTH: orth_title}]
+ orth_caps = orth.upper()
+ _exc[orth_caps] = [{ORTH: orth_caps}]
+ orth_lower = orth.lower()
+ _exc[orth_lower] = [{ORTH: orth_lower}]
+ orth_first_upper = orth[0].upper() + orth[1:]
+ _exc[orth_first_upper] = [{ORTH: orth_first_upper}]
+ if "-" in orth:
+ orth_title = "-".join([part.title() for part in orth.split("-")])
+ _exc[orth_title] = [{ORTH: orth_title}]
+ orth_caps = "-".join([part.upper() for part in orth.split("-")])
+ _exc[orth_caps] = [{ORTH: orth_caps}]
+
+for exc_data in [
+ {ORTH: "Jan.", NORM: "Januari"},
+ {ORTH: "Feb.", NORM: "Februari"},
+ {ORTH: "Mac.", NORM: "Mac"},
+ {ORTH: "Apr.", NORM: "April"},
+ {ORTH: "Jun.", NORM: "Jun"},
+ {ORTH: "Jul.", NORM: "Julai"},
+ {ORTH: "Ogos.", NORM: "Ogos"},
+ {ORTH: "Sep.", NORM: "September"},
+ {ORTH: "Okt.", NORM: "Oktober"},
+ {ORTH: "Nov.", NORM: "November"},
+ {ORTH: "Dis.", NORM: "Disember"},
+]:
+ _exc[exc_data[ORTH]] = [exc_data]
+
+_other_exc = {
+ "do'a": [{ORTH: "do'a", NORM: "doa"}],
+ "jum'at": [{ORTH: "jum'at", NORM: "Jumat"}],
+ "Jum'at": [{ORTH: "Jum'at", NORM: "Jumat"}],
+ "la'nat": [{ORTH: "la'nat", NORM: "laknat"}],
+ "ma'af": [{ORTH: "ma'af", NORM: "maaf"}],
+ "mu'jizat": [{ORTH: "mu'jizat", NORM: "mukjizat"}],
+ "Mu'jizat": [{ORTH: "Mu'jizat", NORM: "mukjizat"}],
+ "ni'mat": [{ORTH: "ni'mat", NORM: "nikmat"}],
+ "raka'at": [{ORTH: "raka'at", NORM: "rakaat"}],
+ "ta'at": [{ORTH: "ta'at", NORM: "taat"}],
+}
+
+_exc.update(_other_exc)
+
+for orth in [
+ "1 Kor.",
+ "1 Ptr.",
+ "1 Raj.",
+ "1 Sam.",
+ "1 Taw.",
+ "1 Tes.",
+ "1 Tim.",
+ "1 Yoh.",
+ "1Ch.",
+ "1Co.",
+ "1Jo.",
+ "1Ki.",
+ "1Pe.",
+ "1Sa.",
+ "1Th.",
+ "1Ti.",
+ "2 Kor.",
+ "2 Ptr.",
+ "2 Raj.",
+ "2 Sam.",
+ "2 Taw.",
+ "2 Tes.",
+ "2 Tim.",
+ "2 Yoh.",
+ "2Ch.",
+ "2Co.",
+ "2Jo.",
+ "2Ki.",
+ "2Pe.",
+ "2Sa.",
+ "2Th.",
+ "2Ti.",
+ "3 Yoh.",
+ "3D",
+ "3F",
+ "3Jo.",
+ "3M",
+ "8MP",
+ "AA",
+ "AAAAAA",
+ "AB",
+ "Abd.",
+ "ABC",
+ "ABIM",
+ "ABM",
+ "ABMI",
+ "ABS",
+ "AC",
+ "Ac",
+ "ACAPLPL",
+ "Act.",
+ "AD",
+ "AD LIB",
+ "ADAM",
+ "ADB",
+ "ADD",
+ "ADIL",
+ "ADN",
+ "ADR",
+ "ADRI",
+ "ADSL",
+ "ADUN",
+ "AFAS",
+ "AFTA",
+ "Ag",
+ "AGMARIS",
+ "AH",
+ "AI",
+ "AIA",
+ "AIDS",
+ "AIJV",
+ "AIM",
+ "a/k",
+ "ak",
+ "AKN",
+ "Al",
+ "a/l",
+ "AM",
+ "Am",
+ "Am.",
+ "AMN",
+ "Amo.",
+ "AMPS",
+ "Ams.",
+ "AMWA",
+ "AN",
+ "a.n.",
+ "ANGKASA",
+ "ANM",
+ "ANSI",
+ "Ant.",
+ "AOL",
+ "AP",
+ "a/p",
+ "APD",
+ "APEC",
+ "API",
+ "APIK",
+ "APM",
+ "APN",
+ "APP",
+ "Apr.",
+ "APRI",
+ "Ar",
+ "Ar.",
+ "ark.",
+ "A.S.",
+ "As",
+ "a.s.",
+ "ASA",
+ "ASAS 50",
+ "ASB",
+ "ASCII",
+ "ASEAN",
+ "ASEAN+3",
+ "ASEM",
+ "a.s.f.",
+ "ASN",
+ "a.s.o.",
+ "ASP",
+ "Ast.",
+ "A.T.",
+ "At",
+ "ATM",
+ "a.t.r.",
+ "ATUR",
+ "Au",
+ "AURI",
+ "Aug.",
+ "AWOL",
+ "Ayb.",
+ "B",
+ "BA",
+ "Ba",
+ "BAC",
+ "BAFIA",
+ "BAM",
+ "BANANA",
+ "BAPP",
+ "BASF",
+ "BATA",
+ "BB",
+ "BBC",
+ "BBE",
+ "BBS",
+ "BC",
+ "BCG",
+ "BCIC",
+ "b.d.",
+ "BDSSHAM",
+ "Be",
+ "BEER",
+ "BERNAMA",
+ "Bh",
+ "b.h.",
+ "Bhd.",
+ "Bi",
+ "BIDS",
+ "Bil.",
+ "bil.",
+ "BIMP-EAGA",
+ "Bio.",
+ "BIOS",
+ "BITMB",
+ "BJ",
+ "Bk",
+ "b.k.",
+ "BKAL",
+ "bkn.",
+ "BKP",
+ "BL",
+ "BLR",
+ "BM",
+ "BMI",
+ "BMW",
+ "BN",
+ "BNM",
+ "BO",
+ "BOJ",
+ "BOO",
+ "BOP",
+ "BOT",
+ "BP",
+ "b.p.",
+ "BPA",
+ "BPAs",
+ "bpd.",
+ "BPIMB",
+ "BPM",
+ "BPO",
+ "BPPH",
+ "Br",
+ "Br.",
+ "BSA",
+ "B.Sc.",
+ "B.Sh.",
+ "b.s.j.",
+ "BSN",
+ "Bt.",
+ "bt.",
+ "BWT",
+ "BYOB",
+ "C",
+ "C.",
+ "C/E",
+ "Ca",
+ "CAAM",
+ "CAD",
+ "CAM",
+ "CATV",
+ "CBS",
+ "CBT",
+ "CC",
+ "CCD",
+ "CCM",
+ "CCR",
+ "cct-km",
+ "CCTV",
+ "CCU",
+ "CD",
+ "Cd",
+ "CD-ROM",
+ "CD-RW",
+ "CDRC",
+ "Ce",
+ "CEO",
+ "CEPT",
+ "Cetak",
+ "Cf",
+ "CFO",
+ "CFTC",
+ "CGC",
+ "CGI",
+ "CH",
+ "CIA",
+ "CIAST",
+ "CID",
+ "CIDB",
+ "CIQ",
+ "CKD",
+ "CL",
+ "Cl",
+ "c.l.",
+ "CLI",
+ "CLOB",
+ "CM",
+ "Cm",
+ "cm.",
+ "CMAG",
+ "CMI",
+ "CMP",
+ "CNN",
+ "Co",
+ "COD",
+ "Col.",
+ "COLA",
+ "COMDEX",
+ "CP",
+ "CPI",
+ "CPO",
+ "CPR",
+ "CPU",
+ "Cr",
+ "CRDF",
+ "Cs",
+ "CST",
+ "CT",
+ "CTIP",
+ "CTRM",
+ "Cu",
+ "CUEPACS",
+ "D-8",
+ "d/a",
+ "DAGS",
+ "Dan.",
+ "DANCED",
+ "DAP",
+ "DARA",
+ "Db",
+ "DBKL",
+ "DBP",
+ "DBR",
+ "DC",
+ "DDA",
+ "DDT",
+ "DEB",
+ "Dec.",
+ "Deu.",
+ "DFIs",
+ "dgn.",
+ "DHL",
+ "DIBML",
+ "DIN",
+ "Dis.",
+ "DJ",
+ "d.l.l.",
+ "dlm.",
+ "dng.",
+ "DNS",
+ "DO",
+ "DOA",
+ "DOE",
+ "DOF",
+ "DOSH",
+ "doz.",
+ "DPPS",
+ "Dr.",
+ "dr.",
+ "drp.",
+ "drpd.",
+ "Ds",
+ "d.sb.",
+ "d.st.",
+ "DSTN2",
+ "Dt.",
+ "DTAs",
+ "DTMF",
+ "DTP",
+ "DTV",
+ "DUBES",
+ "DUNHILL",
+ "DV8",
+ "DVD",
+ "DVE",
+ "DVS",
+ "dw.t.",
+ "Dy",
+ "DYMM",
+ "E",
+ "E-Commerce",
+ "E-Dagang",
+ "E&E",
+ "E-Faraid",
+ "E-Government",
+ "E-Kerajaan",
+ "E-Mail",
+ "E-Services",
+ "E-Village",
+ "E-Zine",
+ "EALAF",
+ "EBI",
+ "EBP",
+ "EC",
+ "ECAFE",
+ "Ecc.",
+ "ECI",
+ "ECM",
+ "ECOSOC",
+ "ECP",
+ "ECR",
+ "EDI",
+ "EE",
+ "EEC",
+ "Ef.",
+ "EG",
+ "Eko.",
+ "EKS",
+ "ELWS",
+ "ELX",
+ "EMI",
+ "EMUs",
+ "En.",
+ "EP",
+ "EPF",
+ "Eph.",
+ "EPP",
+ "EPS",
+ "EPU",
+ "ER",
+ "Er",
+ "ERL",
+ "ERT",
+ "Es",
+ "ESCAP",
+ "ESOS",
+ "ESP",
+ "EST",
+ "Est.",
+ "ET",
+ "ETA",
+ "ETACS",
+ "ETC",
+ "ETD",
+ "EU",
+ "Eu",
+ "EVIAN",
+ "Exim Bank",
+ "Exo.",
+ "Eze.",
+ "Ezr.",
+ "F",
+ "FAM",
+ "FAMA",
+ "FAO",
+ "FAQ",
+ "FAX",
+ "FBI",
+ "FC",
+ "FCA",
+ "FCC",
+ "FDI",
+ "FE",
+ "Fe",
+ "f.e.",
+ "Feb.",
+ "FELCRA",
+ "FELDA",
+ "FI",
+ "FIA 1993",
+ "FIAT",
+ "FIC",
+ "FIDA",
+ "FIFA",
+ "FIMA",
+ "Fiz.",
+ "Flm.",
+ "Flp.",
+ "FM",
+ "Fm",
+ "FMUTM",
+ "FO",
+ "FOA",
+ "FOB",
+ "FOC",
+ "FOMCA",
+ "FORD",
+ "Fr",
+ "FRIM",
+ "FRTI",
+ "FSMP",
+ "FTA",
+ "FTE",
+ "FTP",
+ "G",
+ "g.",
+ "G15",
+ "G77",
+ "Ga",
+ "GAC",
+ "GACM",
+ "Gal.",
+ "GAPENA",
+ "GATS",
+ "GATT",
+ "GB",
+ "Gbps.",
+ "Gd",
+ "GDP",
+ "Ge",
+ "GEC",
+ "Gen.",
+ "Geo.",
+ "Geog.",
+ "Gerakan",
+ "GH",
+ "GIF",
+ "GII",
+ "GIS",
+ "GITIC",
+ "GITN",
+ "GJ",
+ "GLCs",
+ "GM",
+ "GMBH",
+ "GMI",
+ "GMT",
+ "GNP",
+ "GNS",
+ "GOLD",
+ "GP",
+ "GPC",
+ "GPIM",
+ "GPMS",
+ "GPO",
+ "GPP",
+ "GPS",
+ "GRO",
+ "GRS",
+ "GSMC",
+ "GST",
+ "GTZ",
+ "GUI",
+ "GWh.",
+ "H",
+ "Ha",
+ "Hab.",
+ "Hag.",
+ "Hak.",
+ "ham",
+ "hb.",
+ "HCI",
+ "HDTV",
+ "He",
+ "Heb.",
+ "Hf",
+ "Hg",
+ "HI-FI",
+ "HIS",
+ "HIV",
+ "Hj.",
+ "HMS",
+ "Ho",
+ "Hos.",
+ "HP",
+ "HRDC",
+ "HRDF",
+ "HRMIS",
+ "Hs",
+ "Hut.",
+ "I",
+ "I/O",
+ "IA",
+ "IAA",
+ "IADPs",
+ "IB",
+ "i.b.",
+ "IBA",
+ "IBFIM",
+ "IBG",
+ "Ibr.",
+ "IBRD",
+ "IBS",
+ "IC",
+ "ICA",
+ "ICBM",
+ "ICFM",
+ "ICI",
+ "ICM",
+ "ICOR",
+ "ICP",
+ "ICT",
+ "ICU",
+ "ID",
+ "Id.",
+ "IDB",
+ "IDFR",
+ "IE",
+ "i.e.",
+ "IFSB",
+ "IGAs",
+ "IGS",
+ "IHP",
+ "IHPG",
+ "IIM",
+ "IINA",
+ "IKKL",
+ "IKP",
+ "IKPH",
+ "IKS",
+ "Im.",
+ "IMD",
+ "IMF",
+ "IMP2",
+ "IMR",
+ "IMS-GT",
+ "IMT-GT",
+ "In",
+ "in.",
+ "INFRA",
+ "INSEP",
+ "INSPEN",
+ "INTAN",
+ "IOFC",
+ "IOU",
+ "IP",
+ "IPA",
+ "IPBA",
+ "IPCs",
+ "IPEBP",
+ "IPI",
+ "IPKIM",
+ "IPKPM",
+ "IPO",
+ "IPP",
+ "IPPM",
+ "IPPPM",
+ "i.pt.",
+ "IPTAR",
+ "IPTNM",
+ "IQR",
+ "Ir",
+ "IRA",
+ "IRPA",
+ "IRS",
+ "i.s.",
+ "ISA",
+ "Isa.",
+ "ISDN",
+ "ISMM",
+ "ISO",
+ "ISP",
+ "ist.",
+ "IT",
+ "i.t.",
+ "ITA",
+ "ITAF",
+ "ITEX",
+ "ITK",
+ "ITM",
+ "ITO",
+ "ITRCo",
+ "ITTA",
+ "ITU",
+ "JAK",
+ "JAKIM",
+ "Jam.",
+ "Jan.",
+ "Jb.",
+ "JBIC",
+ "JD",
+ "JDA",
+ "Jdg.",
+ "Jer.",
+ "Jh.",
+ "JICA",
+ "JJ",
+ "Jk.",
+ "JKKK",
+ "jkps.",
+ "JKR",
+ "JMTI",
+ "JOA",
+ "Joe.",
+ "Joh.",
+ "Jon.",
+ "Jos.",
+ "JP",
+ "JPA",
+ "JPEG",
+ "JPH",
+ "JPJ",
+ "JPSHK",
+ "JPS",
+ "JPT",
+ "JRDA",
+ "JSM",
+ "JT",
+ "Jud.",
+ "Jul.",
+ "Jun.",
+ "JVC",
+ "Jw.",
+ "K",
+ "K-Economy",
+ "KADA",
+ "KBE",
+ "KBIA",
+ "KBPA",
+ "KBSM",
+ "KD",
+ "Kd.",
+ "KDI",
+ "KDN",
+ "KDNK",
+ "KE",
+ "KEAP",
+ "Kej.",
+ "Kel.",
+ "KEM",
+ "KEMLU",
+ "kep.",
+ "Kg.",
+ "kg.",
+ "KGB",
+ "KGK",
+ "KH",
+ "ki.",
+ "Kid.",
+ "KIK",
+ "KIKMTT",
+ "KIM",
+ "Kim.",
+ "Kis.",
+ "KIX",
+ "KKGSK",
+ "KKK",
+ "KKPPA",
+ "KL",
+ "Kl.",
+ "KLCI",
+ "KLIA",
+ "KLIBOR",
+ "KLIM",
+ "KLM",
+ "KLSE",
+ "KM",
+ "KMM",
+ "KNK",
+ "KO",
+ "Kol.",
+ "Kom.",
+ "Komp.",
+ "KOMSAS",
+ "KPAI",
+ "KPB",
+ "KPBA",
+ "KPC",
+ "kpd.",
+ "KPE",
+ "KPIs",
+ "KPPL",
+ "KPPMS",
+ "KPWM",
+ "Kr",
+ "KRM",
+ "KSTI",
+ "KT",
+ "KTA",
+ "KTABKL",
+ "KTM",
+ "KTMB",
+ "kV",
+ "kW",
+ "kWh",
+ "kWj",
+ "KWSP",
+ "LA",
+ "La",
+ "LABOR",
+ "Lam.",
+ "LAN",
+ "LAPD",
+ "LASER",
+ "LAX",
+ "lb.",
+ "LC",
+ "LCD",
+ "LCHRF",
+ "LCLY",
+ "LED",
+ "Lev.",
+ "LFPR",
+ "LFS",
+ "LFX",
+ "LGM",
+ "Li",
+ "LID",
+ "Lin.",
+ "LKN",
+ "LKPM",
+ "LKPP",
+ "LKTP",
+ "LKWJ",
+ "LLB",
+ "LLC",
+ "LLN",
+ "LLS",
+ "LMSM",
+ "LNG",
+ "LOA",
+ "LOBATA",
+ "LOFSA",
+ "LPG",
+ "LPIP",
+ "LPKI",
+ "LPKLPL",
+ "LPKN",
+ "LPN",
+ "LPP",
+ "LPPK",
+ "LPPM",
+ "LPPP",
+ "LPPTP",
+ "Lr",
+ "LRs",
+ "LRT",
+ "LS",
+ "LTAKL",
+ "LTD",
+ "LTK",
+ "Lu",
+ "LUAS",
+ "Luk.",
+ "lw.",
+ "lwn.",
+ "M\n",
+ "m",
+ "M&A",
+ "MAB",
+ "MACRES",
+ "MAD",
+ "MADA",
+ "MAGERAN",
+ "MAHA",
+ "MAHSURI",
+ "Mal.",
+ "MALINDO",
+ "MAMPU",
+ "Mar.",
+ "MARA",
+ "MARC",
+ "MARDI",
+ "MARLBORO",
+ "MAS",
+ "MASSA",
+ "MASSCORP",
+ "Mat.",
+ "MATRADE",
+ "MAVCAP",
+ "MB",
+ "MBA",
+ "MBBS",
+ "MBM",
+ "MBO",
+ "MBS",
+ "MBTU",
+ "MC",
+ "MCA",
+ "MCB",
+ "MCSL",
+ "MCSv5",
+ "MD",
+ "Md",
+ "MDB",
+ "MDC",
+ "MDG",
+ "MDV",
+ "MEASAT",
+ "MEATJ",
+ "MECIB",
+ "MEMO",
+ "MENLU",
+ "MEPS",
+ "MES",
+ "MESDAQ",
+ "METEOR",
+ "MFI",
+ "MFIs",
+ "MG",
+ "Mg",
+ "MGM",
+ "MGR",
+ "MGS",
+ "MHA",
+ "Mi.",
+ "MIA",
+ "MIB",
+ "MIC",
+ "Mic.",
+ "MICE",
+ "MIDA",
+ "MIDF",
+ "MIDI",
+ "MIG",
+ "MIGHT",
+ "MII",
+ "MIMOS",
+ "MINDEF",
+ "MINT",
+ "mis.",
+ "MIT",
+ "MITC",
+ "MITI",
+ "Ml.",
+ "MLNG",
+ "mlpd.",
+ "MM",
+ "mm",
+ "MMN",
+ "mmscfd.",
+ "MMU",
+ "MMX",
+ "Mn",
+ "Mn.",
+ "MNA",
+ "MNCs",
+ "MO",
+ "Mo",
+ "MOA",
+ "MOD",
+ "MODEM",
+ "MOE",
+ "MOH",
+ "MOSTE",
+ "MOSTI",
+ "MOU",
+ "MP",
+ "MPB",
+ "MPEG",
+ "MPOB",
+ "MPP",
+ "mppa.",
+ "MPPJ",
+ "MPS",
+ "MPTM",
+ "MR",
+ "m.r.",
+ "MRB",
+ "MRELB",
+ "Mrk.",
+ "MRRDB",
+ "MS",
+ "MS-DOS",
+ "MSC",
+ "MSG",
+ "MSM",
+ "Mt",
+ "MTC",
+ "MTCP",
+ "MTD",
+ "MTDC",
+ "MTPB",
+ "MTV",
+ "Muz.",
+ "MV",
+ "MW",
+ "MY",
+ "MyKe",
+ "Mzm.",
+ "N",
+ "N/A",
+ "Na",
+ "NAB",
+ "NACIWID",
+ "Nah.",
+ "NAP",
+ "NASA",
+ "NATO",
+ "NAV",
+ "NB",
+ "Nb",
+ "NBA",
+ "NBC",
+ "NCR",
+ "Nd",
+ "NDP",
+ "Ne",
+ "NEAC",
+ "NEC",
+ "NEF",
+ "Neh.",
+ "NEP",
+ "NEqO",
+ "NERP",
+ "NF",
+ "NFPEs",
+ "NG",
+ "NGOs",
+ "NGV",
+ "NHEF",
+ "NHHES",
+ "NHK",
+ "Ni",
+ "NIDC",
+ "NIH",
+ "NIP",
+ "NIPA",
+ "NIS",
+ "NISIR",
+ "NITA",
+ "NITC",
+ "NITP",
+ "NIV",
+ "NLAC",
+ "NMPBSP",
+ "NMU",
+ "No",
+ "No.",
+ "no.",
+ "NOSS",
+ "Nov.",
+ "Np",
+ "NPC",
+ "NPCS",
+ "NPL",
+ "NRCC",
+ "NRW",
+ "NS",
+ "Ns",
+ "NSB",
+ "NTA",
+ "NTHRDC",
+ "NTMP",
+ "NTSC",
+ "Num.",
+ "NUTF",
+ "NVP",
+ "NVTC",
+ "NWRC",
+ "O",
+ "Ob.",
+ "Oba.",
+ "OC",
+ "OCPD",
+ "Oct.",
+ "OD",
+ "ODA",
+ "OECD",
+ "OEM",
+ "Ogo.",
+ "OHQs",
+ "OIC",
+ "Okt.",
+ "OPEC",
+ "OPP",
+ "OPP3",
+ "OPR",
+ "OS",
+ "Os",
+ "OSA",
+ "OT",
+ "OUG",
+ "oz.",
+ "P",
+ "P&P",
+ "PA",
+ "Pa",
+ "PABK",
+ "PABX",
+ "PAK",
+ "PAKSI",
+ "PAL",
+ "PALL MALL",
+ "PAS",
+ "PATA",
+ "PAWS",
+ "Pb",
+ "PBA",
+ "PBB",
+ "PBM",
+ "PBP",
+ "PBSM",
+ "PBT",
+ "PC",
+ "PC(s)",
+ "PCB",
+ "PCIRITA",
+ "PCM",
+ "PCMCIA",
+ "PCN",
+ "PD",
+ "Pd",
+ "pd.",
+ "PDS",
+ "PE",
+ "PEKEMAS",
+ "PEMADAM",
+ "PENA",
+ "PENIS",
+ "PERDANA",
+ "PERKESO",
+ "PERKIM",
+ "PERNAS",
+ "PERTAMA",
+ "PERTIWI",
+ "PESAKA",
+ "PETA",
+ "PETRONAS",
+ "PGU",
+ "Ph.",
+ "PHD",
+ "Phi.",
+ "Phm.",
+ "PIK",
+ "PIKOM",
+ "PIN",
+ "PINTAS",
+ "PIPM",
+ "PISK",
+ "PITA",
+ "PIXEL",
+ "PJ",
+ "PJK",
+ "PJKB",
+ "PJP",
+ "PKBM",
+ "PKBTA",
+ "PKEN",
+ "Pkh.",
+ "PKKM",
+ "PKLPA",
+ "PKM",
+ "PKNS",
+ "PKPIM",
+ "PKPM",
+ "PKR",
+ "PKS",
+ "Pl.",
+ "p.l.",
+ "PLA",
+ "PLC",
+ "PLCHP",
+ "PLCs",
+ "PLI",
+ "PLT",
+ "PLUS",
+ "PLWS",
+ "PM",
+ "Pm",
+ "PMM",
+ "PMP",
+ "PMR",
+ "PMS",
+ "Pn.",
+ "PNAT",
+ "PNS",
+ "PO",
+ "Po",
+ "POCPA",
+ "POKEMON",
+ "Pol.",
+ "POP",
+ "PORIM",
+ "PORLA",
+ "PORTAFOAM",
+ "PP",
+ "PPA",
+ "PPBE",
+ "PPBK",
+ "ppd.",
+ "PPGM",
+ "PPI",
+ "PPK",
+ "PPL",
+ "PPM",
+ "PPP",
+ "PPPB",
+ "PPPLM",
+ "PPPM",
+ "PPR",
+ "PPRT",
+ "PPS",
+ "PPTM",
+ "PPU",
+ "PR",
+ "Pr",
+ "Pr.",
+ "prb.",
+ "PRI",
+ "PRO",
+ "Pro.",
+ "Prof.",
+ "PROSPER",
+ "PROSTAR",
+ "PROTON",
+ "PS",
+ "PSA",
+ "Psa.",
+ "PSCs",
+ "PSDC",
+ "PSDH",
+ "Psi.",
+ "PSKE",
+ "PSRM",
+ "PST",
+ "PT",
+ "Pt",
+ "PTD",
+ "PTP",
+ "Pu",
+ "PUNB",
+ "QA",
+ "QC",
+ "QCC",
+ "R&D",
+ "RA",
+ "Ra",
+ "RAM",
+ "RAPP",
+ "Rat.",
+ "Rb",
+ "RCA",
+ "RDA",
+ "RDAs",
+ "RDCs",
+ "RE",
+ "Re",
+ "REHDA",
+ "Rev.",
+ "Rf",
+ "Rg",
+ "RGB",
+ "Rh",
+ "RI",
+ "RIDA",
+ "RIP",
+ "RISDA",
+ "r.l.",
+ "RM",
+ "Rm.",
+ "RMKe-8",
+ "Rn",
+ "ROC",
+ "ROM",
+ "Rom.",
+ "RPG",
+ "RPS",
+ "RRI",
+ "RRIM",
+ "RRJP",
+ "RRP",
+ "RSGC",
+ "RSS",
+ "RSVP",
+ "Rt.",
+ "RTA",
+ "RTM",
+ "Ru",
+ "Rut.",
+ "RWCR",
+ "RX",
+ "S",
+ "S/N",
+ "S&T",
+ "S-VHS",
+ "SA",
+ "SAC",
+ "SADCs",
+ "SAGA",
+ "SALCRA",
+ "SALM",
+ "SALT",
+ "SAM",
+ "SAP",
+ "SARS",
+ "Sas.",
+ "s.a.w.",
+ "SB",
+ "Sb",
+ "Sb.",
+ "SBA",
+ "SBB",
+ "sbg.",
+ "SBK",
+ "SC",
+ "Sc",
+ "SCA",
+ "SCADA",
+ "SCANS",
+ "SCSI",
+ "SCuM",
+ "SDCs",
+ "Sdn. Bhd.",
+ "sdr.",
+ "SDRC",
+ "Se",
+ "SEATO",
+ "SEB",
+ "SECAM",
+ "SEDCs",
+ "SEFF",
+ "Sej.",
+ "SEMS",
+ "Sep.",
+ "Sept.",
+ "SESB",
+ "SESCo",
+ "s.f.",
+ "Sg",
+ "SGPCA",
+ "SGPPI",
+ "SGPPKRM",
+ "SGX",
+ "Si",
+ "Si.",
+ "SIA 1983",
+ "SIC",
+ "SIM",
+ "SING",
+ "SIRIM",
+ "SITTDEC",
+ "sj.",
+ "SKDTP",
+ "SKM",
+ "SKSM",
+ "SL",
+ "Sl.",
+ "sl.",
+ "SLMCH",
+ "SLR",
+ "SM",
+ "Sm",
+ "SMART",
+ "SMEs",
+ "SMEt",
+ "SMIs",
+ "SMIDEC",
+ "SMIDP",
+ "SMJK",
+ "SMR",
+ "SMS",
+ "SMT",
+ "SMTP",
+ "SN",
+ "Sn",
+ "SOB",
+ "SOCSO",
+ "SOHO",
+ "Son.",
+ "SOS",
+ "Sos.",
+ "SP",
+ "SPA",
+ "SPAM",
+ "SPCA",
+ "SPKR",
+ "SPLAM",
+ "SPM",
+ "SPNB",
+ "SPSP",
+ "t.",
+ "Ta",
+ "Tadb.",
+ "TAF",
+ "TAF-W",
+ "Tani",
+ "TAP",
+ "TAR",
+ "TARBI",
+ "TB",
+ "Tb",
+ "TBA",
+ "TBTP",
+ "Tc",
+ "TCPD",
+ "TDCs",
+ "Te",
+ "TEKUN",
+ "TELCO",
+ "TELEX",
+ "TEUs",
+ "TFP",
+ "TGV",
+ "TH",
+ "Th",
+ "THIS",
+ "Ti",
+ "TICAD",
+ "Tit.",
+ "TKA",
+ "Tks.",
+ "Tl",
+ "TLDM",
+ "TM",
+ "Tm",
+ "TMB",
+ "TMK",
+ "TNB",
+ "TNSB",
+ "TNT",
+ "TOEFL",
+ "TP",
+ "TPIM",
+ "TPK",
+ "TPPP",
+ "TPPT",
+ "TPSM",
+ "TPUB",
+ "TQM",
+ "Tr.",
+ "TRIPs",
+ "tsb.",
+ "tscf.",
+ "t.sh.",
+ "t.s.t.",
+ "TT",
+ "t.t.",
+ "TUDM",
+ "TV",
+ "TVSMR",
+ "TWAIN",
+ "TX",
+ "TYPHIrapid",
+ "U",
+ "Ubat",
+ "UDA",
+ "Udg.",
+ "UFO",
+ "UH",
+ "UIA",
+ "UiTM",
+ "UK",
+ "UKM",
+ "UL",
+ "Ul.",
+ "ULC",
+ "UM",
+ "UMNO",
+ "UMS",
+ "UN",
+ "UN/OSCAL",
+ "UNCLE",
+ "UNCTAD",
+ "UNDP",
+ "UNESCO",
+ "UNFCCC",
+ "UNFPA",
+ "UNHCR",
+ "UNICEF",
+ "UNIMAS",
+ "UNTAET",
+ "UPE",
+ "UPM",
+ "UPS",
+ "UPSR",
+ "URL",
+ "US",
+ "USAINS",
+ "USD",
+ "USM",
+ "USNO",
+ "USS",
+ "USSR",
+ "UTC",
+ "UTF",
+ "utk.",
+ "UTM",
+ "V",
+ "VAT",
+ "VCC",
+ "VCD",
+ "VCR",
+ "VD",
+ "VDSC",
+ "VGA",
+ "VHF",
+ "VHS",
+ "VIP",
+ "VMS",
+ "VO",
+ "VOA",
+ "VoIP",
+ "VR",
+ "VSOP",
+ "VW",
+ "W",
+ "W/O",
+ "WAP",
+ "WAY",
+ "WC",
+ "WDDM",
+ "WDM",
+ "WHO",
+ "Why.",
+ "WIM",
+ "WPG",
+ "WTO",
+ "WWF",
+ "WWW",
+ "WYSIWYG",
+ "Xe",
+ "XO",
+ "XXL",
+ "Y",
+ "Y2K",
+ "YAB",
+ "Yak.",
+ "YAM",
+ "YAS",
+ "YB",
+ "Yb",
+ "Yeh.",
+ "Yer.",
+ "Yes.",
+ "yg.",
+ "Yl.",
+ "YM",
+ "YMCA",
+ "Yoh.",
+ "Yos.",
+ "Y.Th.",
+ "YTM",
+ "Yud.",
+ "Yun.",
+ "Za.",
+ "Zec.",
+ "Zef.",
+ "Zep.",
+ "ZIP",
+ "Zn",
+ "Zr",
+]:
+ _exc[orth] = [{ORTH: orth}]
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
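
Each entry maps an exact surface form to a list of token dicts, so strings like `"Dr."` and `"Jan."` survive tokenization as single tokens, and `NORM` supplies the expanded form where one is defined. A rough illustration of the resulting behavior (assuming the blank `ms` pipeline; the output is indicative, not taken from the test suite):

```python
import spacy

nlp = spacy.blank("ms")
doc = nlp("Dr. Ahmad tiba pada Jan. 2023.")

# "Dr." and "Jan." should remain single tokens via the exceptions above;
# "Jan." carries NORM "Januari" from the month abbreviation table.
print([(t.text, t.norm_) for t in doc])
```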
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index e079236fd..ef4665ccc 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -1,12 +1,13 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
+
+from ...language import BaseDefaults, Language
+from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
-from ...pipeline import Lemmatizer
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class NorwegianDefaults(BaseDefaults):
diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py
index 8f2933670..a1fdb872a 100644
--- a/spacy/lang/nb/punctuation.py
+++ b/spacy/lang/nb/punctuation.py
@@ -1,7 +1,17 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-from ..char_classes import CURRENCY, PUNCT, UNITS, LIST_CURRENCY
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
# Punctuation adapted from Danish
_quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index d86662693..89a8f5edf 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py
index 0be436ae4..9b99a1d65 100644
--- a/spacy/lang/nb/tokenizer_exceptions.py
+++ b/spacy/lang/nb/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py
index 0028d1b0b..5c9e6870e 100644
--- a/spacy/lang/ne/__init__.py
+++ b/spacy/lang/ne/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class NepaliDefaults(BaseDefaults):
diff --git a/spacy/lang/ne/lex_attrs.py b/spacy/lang/ne/lex_attrs.py
index 7cb01c515..91d5b0eb5 100644
--- a/spacy/lang/ne/lex_attrs.py
+++ b/spacy/lang/ne/lex_attrs.py
@@ -1,6 +1,5 @@
+from ...attrs import LIKE_NUM, NORM
from ..norm_exceptions import BASE_NORMS
-from ...attrs import NORM, LIKE_NUM
-
# fmt: off
_stem_suffixes = [
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index ad2205a0b..213041a85 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,15 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
+from ...language import BaseDefaults, Language
from .lemmatizer import DutchLemmatizer
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class DutchDefaults(BaseDefaults):
diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py
index f1acaefeb..488224c2f 100644
--- a/spacy/lang/nl/lex_attrs.py
+++ b/spacy/lang/nl/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
diff --git a/spacy/lang/nl/punctuation.py b/spacy/lang/nl/punctuation.py
index d9dd2a6e3..c9a4c9eeb 100644
--- a/spacy/lang/nl/punctuation.py
+++ b/spacy/lang/nl/punctuation.py
@@ -1,10 +1,19 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_UNITS, merge_chars
-from ..char_classes import LIST_PUNCT, LIST_QUOTES, CURRENCY, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ LIST_UNITS,
+ PUNCT,
+ merge_chars,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
_prefixes = [",,"] + BASE_TOKENIZER_PREFIXES
diff --git a/spacy/lang/nl/syntax_iterators.py b/spacy/lang/nl/syntax_iterators.py
index be9beabe6..d7388a333 100644
--- a/spacy/lang/nl/syntax_iterators.py
+++ b/spacy/lang/nl/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON
from ...tokens import Doc, Span
diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py
index 489d10d71..85ad49f14 100644
--- a/spacy/lang/nl/tokenizer_exceptions.py
+++ b/spacy/lang/nl/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
# Extensive list of both common and uncommon dutch abbreviations copied from
# github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 02c96799b..50a3a8e4c 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -1,15 +1,13 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .lemmatizer import PolishLemmatizer
+from ...language import BaseDefaults, Language
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-
+from .lemmatizer import PolishLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
index 059d0609a..d1d2a9c54 100644
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/pl/lex_attrs.py b/spacy/lang/pl/lex_attrs.py
index ce56e28a8..398f52a3c 100644
--- a/spacy/lang/pl/lex_attrs.py
+++ b/spacy/lang/pl/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"zero",
"jeden",
diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py
index 31e56b9ae..84ff239ed 100644
--- a/spacy/lang/pl/punctuation.py
+++ b/spacy/lang/pl/punctuation.py
@@ -1,6 +1,17 @@
-from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS
-from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_HYPHENS,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 454002491..be4041f8e 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class PortugueseDefaults(BaseDefaults):
diff --git a/spacy/lang/pt/lex_attrs.py b/spacy/lang/pt/lex_attrs.py
index 3c6979ab4..de6a67f14 100644
--- a/spacy/lang/pt/lex_attrs.py
+++ b/spacy/lang/pt/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"zero",
"um",
diff --git a/spacy/lang/pt/punctuation.py b/spacy/lang/pt/punctuation.py
index 08e31f9d0..b2d63cb3d 100644
--- a/spacy/lang/pt/punctuation.py
+++ b/spacy/lang/pt/punctuation.py
@@ -1,6 +1,6 @@
+from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES
-from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
_prefixes = [r"\w{1,3}\$"] + BASE_TOKENIZER_PREFIXES
diff --git a/spacy/lang/pt/syntax_iterators.py b/spacy/lang/pt/syntax_iterators.py
index 62661f5e4..11017aace 100644
--- a/spacy/lang/pt/syntax_iterators.py
+++ b/spacy/lang/pt/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py
index 187fc65ea..e369eda80 100644
--- a/spacy/lang/pt/tokenizer_exceptions.py
+++ b/spacy/lang/pt/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index a1cfe6224..e4a6392c8 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -1,7 +1,19 @@
-from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
-from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-
+from .char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ COMBINING_DIACRITICS,
+ CONCAT_QUOTES,
+ CURRENCY,
+ HYPHENS,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
TOKENIZER_PREFIXES = (
["§", "%", "=", "—", "–", r"\+(?![0-9])"]
diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py
index 50027ffd2..441fefbb6 100644
--- a/spacy/lang/ro/__init__.py
+++ b/spacy/lang/ro/__init__.py
@@ -1,9 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py
index 0f86f53cd..736aa911a 100644
--- a/spacy/lang/ro/lex_attrs.py
+++ b/spacy/lang/ro/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
zero unu doi două trei patru cinci șase șapte opt nouă zece
diff --git a/spacy/lang/ro/punctuation.py b/spacy/lang/ro/punctuation.py
index 529e1c977..7259f9ae7 100644
--- a/spacy/lang/ro/punctuation.py
+++ b/spacy/lang/ro/punctuation.py
@@ -1,9 +1,18 @@
import itertools
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+)
_list_icons = [x for x in LIST_ICONS if x != "°"]
_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py
index b8af0b1d6..a397b2754 100644
--- a/spacy/lang/ro/tokenizer_exceptions.py
+++ b/spacy/lang/ro/tokenizer_exceptions.py
@@ -1,9 +1,8 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import _make_ro_variants
-
_exc = {}
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 7d17628c4..880965b70 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,13 +1,16 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
+from ...language import BaseDefaults, Language
+from ..punctuation import (
+ COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+ COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from .lemmatizer import RussianLemmatizer
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from .lemmatizer import RussianLemmatizer
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
class RussianDefaults(BaseDefaults):
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index f4a35de38..1e41220f3 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Dict, Tuple, Callable
+from typing import Callable, Dict, List, Optional, Tuple
from thinc.api import Model
@@ -8,7 +8,6 @@ from ...symbols import POS
from ...tokens import Token
from ...vocab import Vocab
-
PUNCT_RULES = {"«": '"', "»": '"'}
diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py
index 2afe47623..e0b35bdc0 100644
--- a/spacy/lang/ru/lex_attrs.py
+++ b/spacy/lang/ru/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = list(
set(
"""
diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py
index e1889f785..0a8c476b1 100644
--- a/spacy/lang/ru/tokenizer_exceptions.py
+++ b/spacy/lang/ru/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/sa/__init__.py b/spacy/lang/sa/__init__.py
index 61398af6c..c7c0e98e6 100644
--- a/spacy/lang/sa/__init__.py
+++ b/spacy/lang/sa/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SanskritDefaults(BaseDefaults):
diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py
index 971cee3c6..08d0937b1 100644
--- a/spacy/lang/si/__init__.py
+++ b/spacy/lang/si/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SinhalaDefaults(BaseDefaults):
diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py
index da6e3048e..2ed7448d2 100644
--- a/spacy/lang/sk/__init__.py
+++ b/spacy/lang/sk/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SlovakDefaults(BaseDefaults):
diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py
index 0070e9fa1..cd3d70fc9 100644
--- a/spacy/lang/sl/__init__.py
+++ b/spacy/lang/sl/__init__.py
@@ -1,8 +1,8 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class SlovenianDefaults(BaseDefaults):
diff --git a/spacy/lang/sl/lex_attrs.py b/spacy/lang/sl/lex_attrs.py
index 958152e37..3c1493050 100644
--- a/spacy/lang/sl/lex_attrs.py
+++ b/spacy/lang/sl/lex_attrs.py
@@ -1,7 +1,6 @@
-from ...attrs import LIKE_NUM
-from ...attrs import IS_CURRENCY
import unicodedata
+from ...attrs import IS_CURRENCY, LIKE_NUM
_num_words = set(
"""
diff --git a/spacy/lang/sl/punctuation.py b/spacy/lang/sl/punctuation.py
index b6ca1830e..dadb54d31 100644
--- a/spacy/lang/sl/punctuation.py
+++ b/spacy/lang/sl/punctuation.py
@@ -1,20 +1,21 @@
from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ HYPHENS,
+ LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
- HYPHENS,
LIST_PUNCT,
LIST_QUOTES,
- CURRENCY,
- UNITS,
PUNCT,
- LIST_CURRENCY,
- CONCAT_QUOTES,
+ UNITS,
+ merge_chars,
)
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import merge_chars
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
INCLUDE_SPECIAL = ["\\+", "\\/", "\\•", "\\¯", "\\=", "\\×"] + HYPHENS.split("|")
_prefixes = INCLUDE_SPECIAL + BASE_TOKENIZER_PREFIXES
diff --git a/spacy/lang/sl/tokenizer_exceptions.py b/spacy/lang/sl/tokenizer_exceptions.py
index 3d4109228..ec4ea9e41 100644
--- a/spacy/lang/sl/tokenizer_exceptions.py
+++ b/spacy/lang/sl/tokenizer_exceptions.py
@@ -1,7 +1,8 @@
from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc: Dict[str, List[Dict]] = {}
diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py
index 5e32a0cbe..1c8a5acf8 100644
--- a/spacy/lang/sq/__init__.py
+++ b/spacy/lang/sq/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class AlbanianDefaults(BaseDefaults):
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index fd0c8c832..5f121d79e 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -1,11 +1,14 @@
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
class SerbianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ infixes = TOKENIZER_INFIXES
+ suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py
index dc48909bc..696b9fd74 100644
--- a/spacy/lang/sr/lex_attrs.py
+++ b/spacy/lang/sr/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"нула",
"један",
diff --git a/spacy/lang/sr/punctuation.py b/spacy/lang/sr/punctuation.py
new file mode 100644
index 000000000..cafb0f68f
--- /dev/null
+++ b/spacy/lang/sr/punctuation.py
@@ -0,0 +1,45 @@
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
+
+_infixes = (
+ LIST_ELLIPSES
+ + LIST_ICONS
+ + [
+ r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+ r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+ al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+ ),
+ r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+ ]
+)
+
+_suffixes = (
+ LIST_PUNCT
+ + LIST_ELLIPSES
+ + LIST_QUOTES
+ + LIST_ICONS
+ + [
+ r"(?<=[0-9])\+",
+ r"(?<=°[FfCcKk])\.",
+ r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+ r"(?<=[0-9])(?:{u})".format(u=UNITS),
+ r"(?<=[{a}{e}{p}(?:{q})])\.".format(
+ a=ALPHA, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+ ),
+ ]
+)
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
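
The new infix rules split arithmetic operators between digits and certain punctuation between letters, while the suffix rules peel units, currency symbols and trailing periods off token ends. A small sketch of the intended effect (illustrative inputs, not from the test suite):

```python
import spacy

nlp = spacy.blank("sr")

# r"(?<=[0-9])[+\-\*^](?=[0-9-])" should split "2+3" into three tokens,
# and the UNITS suffix rule should detach "km" from "10km".
print([t.text for t in nlp("2+3")])   # expected: ['2', '+', '3']
print([t.text for t in nlp("10km")])  # expected: ['10', 'km']
```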
diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py
index dcaa3e239..b7db0aadc 100755
--- a/spacy/lang/sr/tokenizer_exceptions.py
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 28e5085a8..bb4ee1702 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class SwedishDefaults(BaseDefaults):
diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py
index f8ada9e2e..8eeafede8 100644
--- a/spacy/lang/sv/lex_attrs.py
+++ b/spacy/lang/sv/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"noll",
"en",
diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py
index 67f1bcdc4..64f1da989 100644
--- a/spacy/lang/sv/punctuation.py
+++ b/spacy/lang/sv/punctuation.py
@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 06ad016ac..09153a8ec 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index ce7db895a..8fd3afbe3 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index 4929a4b97..7fd29371a 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class TamilDefaults(BaseDefaults):
diff --git a/spacy/lang/ta/lex_attrs.py b/spacy/lang/ta/lex_attrs.py
index f830f4ac9..d66125552 100644
--- a/spacy/lang/ta/lex_attrs.py
+++ b/spacy/lang/ta/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_numeral_suffixes = {"பத்து": "பது", "ற்று": "று", "ரத்து": "ரம்", "சத்து": "சம்"}
_num_words = [
"பூச்சியம்",
diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py
index 77cc2fe9b..611e9746a 100644
--- a/spacy/lang/te/__init__.py
+++ b/spacy/lang/te/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class TeluguDefaults(BaseDefaults):
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 12b1527e0..bd29d32a4 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -1,10 +1,9 @@
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from ...tokens import Doc
-from ...util import DummyTokenizer, registry, load_config_from_str
+from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
-
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py
index bc4e5293e..80f6ccbe8 100644
--- a/spacy/lang/th/lex_attrs.py
+++ b/spacy/lang/th/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"ศูนย์",
"หนึ่ง",
diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py
index 92116d474..954766d28 100644
--- a/spacy/lang/th/tokenizer_exceptions.py
+++ b/spacy/lang/th/tokenizer_exceptions.py
@@ -1,6 +1,5 @@
from ...symbols import ORTH
-
_exc = {
# หน่วยงานรัฐ / government agency
"กกต.": [{ORTH: "กกต."}],
diff --git a/spacy/lang/ti/__init__.py b/spacy/lang/ti/__init__.py
index c74c081b5..510999f67 100644
--- a/spacy/lang/ti/__init__.py
+++ b/spacy/lang/ti/__init__.py
@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-
+from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
class TigrinyaDefaults(BaseDefaults):
diff --git a/spacy/lang/ti/punctuation.py b/spacy/lang/ti/punctuation.py
index aa884c2ba..f29f30e26 100644
--- a/spacy/lang/ti/punctuation.py
+++ b/spacy/lang/ti/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
diff --git a/spacy/lang/ti/tokenizer_exceptions.py b/spacy/lang/ti/tokenizer_exceptions.py
index 3d79cd84b..711e4b406 100644
--- a/spacy/lang/ti/tokenizer_exceptions.py
+++ b/spacy/lang/ti/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py
index 30838890a..6849810ef 100644
--- a/spacy/lang/tl/__init__.py
+++ b/spacy/lang/tl/__init__.py
@@ -1,7 +1,7 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class TagalogDefaults(BaseDefaults):
diff --git a/spacy/lang/tl/lex_attrs.py b/spacy/lang/tl/lex_attrs.py
index 60bdc923b..8866453a0 100644
--- a/spacy/lang/tl/lex_attrs.py
+++ b/spacy/lang/tl/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"sero",
"isa",
diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py
index 51ad12d9f..b10c90437 100644
--- a/spacy/lang/tl/tokenizer_exceptions.py
+++ b/spacy/lang/tl/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"tayo'y": [{ORTH: "tayo"}, {ORTH: "'y", NORM: "ay"}],
diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py
index 28e887eea..4cb8a1635 100644
--- a/spacy/lang/tn/__init__.py
+++ b/spacy/lang/tn/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SetswanaDefaults(BaseDefaults):
diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py
index a52755564..54d76fbaf 100644
--- a/spacy/lang/tn/punctuation.py
+++ b/spacy/lang/tn/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index d76fe4262..dbf9aab49 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -1,8 +1,7 @@
import re
+from ..symbols import NORM, ORTH
from .char_classes import ALPHA_LOWER
-from ..symbols import ORTH, NORM
-
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index 02b5c7bf4..9aa752168 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class TurkishDefaults(BaseDefaults):
diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py
index 6d9f4f388..2189932b6 100644
--- a/spacy/lang/tr/lex_attrs.py
+++ b/spacy/lang/tr/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# Thirteen, fifteen etc. are written separate: on üç
_num_words = [
diff --git a/spacy/lang/tr/syntax_iterators.py b/spacy/lang/tr/syntax_iterators.py
index 769af1223..ed588424a 100644
--- a/spacy/lang/tr/syntax_iterators.py
+++ b/spacy/lang/tr/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py
index 22fa9f09e..d095a3d0e 100644
--- a/spacy/lang/tr/tokenizer_exceptions.py
+++ b/spacy/lang/tr/tokenizer_exceptions.py
@@ -1,8 +1,7 @@
import re
-from ..punctuation import ALPHA_LOWER, ALPHA
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
+from ..punctuation import ALPHA, ALPHA_LOWER
_exc = {}
diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py
index d5e1e87ef..ce04d09c2 100644
--- a/spacy/lang/tt/__init__.py
+++ b/spacy/lang/tt/__init__.py
@@ -1,8 +1,8 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class TatarDefaults(BaseDefaults):
diff --git a/spacy/lang/tt/punctuation.py b/spacy/lang/tt/punctuation.py
index f644a8ccb..5c233df7c 100644
--- a/spacy/lang/tt/punctuation.py
+++ b/spacy/lang/tt/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "")
_infixes = (
diff --git a/spacy/lang/tt/tokenizer_exceptions.py b/spacy/lang/tt/tokenizer_exceptions.py
index 3b8cc86b5..280b9f866 100644
--- a/spacy/lang/tt/tokenizer_exceptions.py
+++ b/spacy/lang/tt/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index bfea9ff69..5dd75a2a4 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -1,14 +1,16 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+from ...language import BaseDefaults, Language
+from ..punctuation import (
+ COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+ COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
from .lemmatizer import UkrainianLemmatizer
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class UkrainianDefaults(BaseDefaults):
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 37015cc2a..9ec582b76 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -1,10 +1,10 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from ..ru.lemmatizer import RussianLemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...vocab import Vocab
+from ..ru.lemmatizer import RussianLemmatizer
class UkrainianLemmatizer(RussianLemmatizer):
diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py
index 7e168a27c..07dd941af 100644
--- a/spacy/lang/uk/tokenizer_exceptions.py
+++ b/spacy/lang/uk/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index 266c5a73d..4f20ac92f 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class UrduDefaults(BaseDefaults):
diff --git a/spacy/lang/ur/punctuation.py b/spacy/lang/ur/punctuation.py
index 5d35d0a25..382bfc75c 100644
--- a/spacy/lang/ur/punctuation.py
+++ b/spacy/lang/ur/punctuation.py
@@ -1,4 +1,3 @@
from ..punctuation import TOKENIZER_SUFFIXES
-
_suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 822dc348c..a621b8bfe 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,17 +1,17 @@
-from typing import Any, Dict, Union
-from pathlib import Path
import re
-import srsly
import string
+from pathlib import Path
+from typing import Any, Dict, Union
+
+import srsly
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
-from ...tokens import Doc
-from ...util import DummyTokenizer, registry, load_config_from_str
-from ...vocab import Vocab
from ... import util
-
+from ...language import BaseDefaults, Language
+from ...tokens import Doc
+from ...util import DummyTokenizer, load_config_from_str, registry
+from ...vocab import Vocab
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py
index 0cbda4ffb..82997a133 100644
--- a/spacy/lang/vi/lex_attrs.py
+++ b/spacy/lang/vi/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"không", # Zero
"một", # One
diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py
index 6c38ec8af..93c4ca493 100644
--- a/spacy/lang/yo/__init__.py
+++ b/spacy/lang/yo/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class YorubaDefaults(BaseDefaults):
diff --git a/spacy/lang/yo/lex_attrs.py b/spacy/lang/yo/lex_attrs.py
index ead68ced2..5f33e06a5 100644
--- a/spacy/lang/yo/lex_attrs.py
+++ b/spacy/lang/yo/lex_attrs.py
@@ -2,7 +2,6 @@ import unicodedata
from ...attrs import LIKE_NUM
-
_num_words = [
"ení",
"oókàn",
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index fdf6776e2..f7bb09277 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,21 +1,21 @@
-from typing import Optional, List, Dict, Any, Callable, Iterable
-from enum import Enum
import tempfile
-import srsly
import warnings
+from enum import Enum
from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, List, Optional
-from ...errors import Warnings, Errors
-from ...language import Language, BaseDefaults
+import srsly
+
+from ... import util
+from ...errors import Errors, Warnings
+from ...language import BaseDefaults, Language
from ...scorer import Scorer
from ...tokens import Doc
-from ...training import validate_examples, Example
-from ...util import DummyTokenizer, registry, load_config_from_str
+from ...training import Example, validate_examples
+from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
-from ... import util
-
# fmt: off
_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install \"spacy-pkuseg>=0.0.27,<0.1.0\"` or `conda install -c conda-forge \"spacy-pkuseg>=0.0.27,<0.1.0\"`"
diff --git a/spacy/lang/zh/lex_attrs.py b/spacy/lang/zh/lex_attrs.py
index 08c8e3160..36fa7310a 100644
--- a/spacy/lang/zh/lex_attrs.py
+++ b/spacy/lang/zh/lex_attrs.py
@@ -2,7 +2,6 @@ import re
from ...attrs import LIKE_NUM
-
_single_num_words = [
"〇",
"一",
diff --git a/spacy/language.py b/spacy/language.py
index 9fdcf6328..fd616483b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,47 +1,70 @@
-from typing import Iterator, Optional, Any, Dict, Callable, Iterable
-from typing import Union, Tuple, List, Set, Pattern, Sequence
-from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
-
-from dataclasses import dataclass
-import random
-import itertools
import functools
+import itertools
+import multiprocessing as mp
+import random
+import traceback
+import warnings
from contextlib import contextmanager
from copy import deepcopy
-from pathlib import Path
-import warnings
-
-from thinc.api import get_current_ops, Config, CupyOps, Optimizer
-import srsly
-import multiprocessing as mp
+from dataclasses import dataclass
from itertools import chain, cycle
+from pathlib import Path
from timeit import default_timer as timer
-import traceback
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ Iterator,
+ List,
+ NoReturn,
+ Optional,
+ Pattern,
+ Sequence,
+ Set,
+ Tuple,
+ TypeVar,
+ Union,
+ cast,
+ overload,
+)
-from . import ty
-from .tokens.underscore import Underscore
-from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples
-from .training.initialize import init_vocab, init_tok2vec
-from .scorer import Scorer
-from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
-from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
-from .util import warn_if_jupyter_cupy
-from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
-from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.punctuation import TOKENIZER_INFIXES
-from .tokens import Doc
-from .tokenizer import Tokenizer
-from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
-from .schemas import ConfigSchemaPretrain, validate_init_settings
-from .git_info import GIT_VERSION
-from . import util
-from . import about
-from .lookups import load_lookups
+import srsly
+from thinc.api import Config, CupyOps, Optimizer, get_current_ops
+
+from . import about, ty, util
from .compat import Literal
-
+from .errors import Errors, Warnings
+from .git_info import GIT_VERSION
+from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
+from .lookups import load_lookups
+from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
+from .schemas import (
+ ConfigSchema,
+ ConfigSchemaInit,
+ ConfigSchemaNlp,
+ ConfigSchemaPretrain,
+ validate_init_settings,
+)
+from .scorer import Scorer
+from .tokenizer import Tokenizer
+from .tokens import Doc
+from .tokens.underscore import Underscore
+from .training import Example, validate_examples
+from .training.initialize import init_tok2vec, init_vocab
+from .util import (
+ _DEFAULT_EMPTY_PIPES,
+ CONFIG_SECTION_ORDER,
+ SimpleFrozenDict,
+ SimpleFrozenList,
+ _pipe,
+ combine_score_weights,
+ raise_error,
+ registry,
+ warn_if_jupyter_cupy,
+)
+from .vocab import Vocab, create_vocab
PipeCallable = Callable[[Doc], Doc]
@@ -716,6 +739,11 @@ class Language:
)
)
pipe = source.get_pipe(source_name)
+ # There is no actual solution here. Either the component has the right
+ # name for the source pipeline or the component has the right name for
+ # the current pipeline. This prioritizes the current pipeline.
+ if hasattr(pipe, "name"):
+ pipe.name = name
# Make sure the source config is interpolated so we don't end up with
# orphaned variables in our final config
source_config = source.config.interpolate()
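The `pipe.name = name` assignment above matters when a component is sourced under a new name: previously the component object kept the source pipeline's name. A minimal sketch, assuming `en_core_web_sm` is installed:

    import spacy

    source_nlp = spacy.load("en_core_web_sm")
    nlp = spacy.blank("en")
    nlp.add_pipe("ner", source=source_nlp, name="ner_copy")
    # With this change, the component's internal name follows the current
    # pipeline rather than the source pipeline.
    assert nlp.get_pipe("ner_copy").name == "ner_copy"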
@@ -793,6 +821,7 @@ class Language:
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
self._components.insert(pipe_index, (name, pipe_component))
+ self._link_components()
return pipe_component
def _get_pipe_index(
@@ -928,6 +957,7 @@ class Language:
if old_name in self._config["initialize"]["components"]:
init_cfg = self._config["initialize"]["components"].pop(old_name)
self._config["initialize"]["components"][new_name] = init_cfg
+ self._link_components()
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
"""Remove a component from the pipeline.
@@ -951,6 +981,7 @@ class Language:
# Make sure the name is also removed from the set of disabled components
if name in self.disabled:
self._disabled.remove(name)
+ self._link_components()
return removed
def disable_pipe(self, name: str) -> None:
@@ -1269,7 +1300,10 @@ class Language:
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
- get_examples = lambda: [Example.from_dict(doc, {})]
+
+ def get_examples():
+ return [Example.from_dict(doc, {})]
+
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(
method="Language.initialize", obj=type(get_examples)
@@ -1372,6 +1406,7 @@ class Language:
scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
scorer_cfg: Optional[Dict[str, Any]] = None,
+ per_component: bool = False,
) -> Dict[str, Any]:
"""Evaluate a model's pipeline components.
@@ -1383,6 +1418,8 @@ class Language:
arguments for specific components.
scorer_cfg (dict): An optional dictionary with extra keyword arguments
for the scorer.
+ per_component (bool): Whether to return the scores keyed by component
+ name. Defaults to False.
        RETURNS (Dict[str, Any]): A dictionary of evaluation scores.
@@ -1415,7 +1452,7 @@ class Language:
for eg, doc in zip(examples, docs):
eg.predicted = doc
end_time = timer()
- results = scorer.score(examples)
+ results = scorer.score(examples, per_component=per_component)
n_words = sum(len(eg.predicted) for eg in examples)
results["speed"] = n_words / (end_time - start_time)
return results
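Usage sketch for the new flag, assuming `en_core_web_sm` is installed; the dev example is illustrative:

    import spacy
    from spacy.training import Example

    nlp = spacy.load("en_core_web_sm")
    doc = nlp.make_doc("Apple is looking at buying a U.K. startup")
    example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})
    # per_component=True keys the results by component name instead of
    # merging everything into one flat score dict.
    scores = nlp.evaluate([example], per_component=True)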
@@ -1673,8 +1710,16 @@ class Language:
# The problem is we need to do it during deserialization...And the
# components don't receive the pipeline then. So this does have to be
# here :(
+ # First, fix up all the internal component names in case they have
+ # gotten out of sync due to sourcing components from different
+ # pipelines, since find_listeners uses proc2.name for the listener
+ # map.
+ for name, proc in self.pipeline:
+ if hasattr(proc, "name"):
+ proc.name = name
for i, (name1, proc1) in enumerate(self.pipeline):
if isinstance(proc1, ty.ListenedToComponent):
+ proc1.listener_map = {}
for name2, proc2 in self.pipeline[i + 1 :]:
proc1.find_listeners(proc2)
@@ -1808,6 +1853,7 @@ class Language:
raw_config=raw_config,
)
else:
+ assert "source" in pipe_cfg
# We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform
@@ -1827,6 +1873,10 @@ class Language:
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
+ # Make sure that the listened-to component has the
+ # state of the source pipeline listener map so that the
+ # replace_listeners method below works as intended.
+ source_nlps[model]._link_components()
for name, proc in source_nlps[model].pipeline:
if source_name in getattr(proc, "listening_components", []):
source_nlps[model].replace_listeners(
@@ -1838,6 +1888,8 @@ class Language:
nlp.add_pipe(
source_name, source=source_nlps[model], name=pipe_name
)
+                    # After this add_pipe call, the listener map
+                    # corresponds to the new pipeline.
if model not in source_nlp_vectors_hashes:
source_nlp_vectors_hashes[model] = hash(
source_nlps[model].vocab.vectors.to_bytes(
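The `_link_components()` call above rebuilds the source pipeline's listener map so that `replace_listeners` declared in a config resolves correctly. A typical config stanza that exercises this path (the source model is an example):

    [components.tagger]
    source = "en_core_web_sm"
    replace_listeners = ["model.tok2vec"]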
@@ -1892,27 +1944,6 @@ class Language:
raise ValueError(
Errors.E942.format(name="pipeline_creation", value=type(nlp))
)
- # Detect components with listeners that are not frozen consistently
- for name, proc in nlp.pipeline:
- if isinstance(proc, ty.ListenedToComponent):
- # Remove listeners not in the pipeline
- listener_names = proc.listening_components
- unused_listener_names = [
- ll for ll in listener_names if ll not in nlp.pipe_names
- ]
- for listener_name in unused_listener_names:
- for listener in proc.listener_map.get(listener_name, []):
- proc.remove_listener(listener, listener_name)
-
- for listener_name in proc.listening_components:
- # e.g. tok2vec/transformer
- # If it's a component sourced from another pipeline, we check if
- # the tok2vec listeners should be replaced with standalone tok2vec
- # models (e.g. so component can be frozen without its performance
- # degrading when other components/tok2vec are updated)
- paths = sourced.get(listener_name, {}).get("replace_listeners", [])
- if paths:
- nlp.replace_listeners(name, listener_name, paths)
return nlp
def replace_listeners(
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 8dea0d6a2..ff2e4f92e 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,11 +1,20 @@
from numpy cimport ndarray
-from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
-from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
-
-from .structs cimport LexemeC
+from .attrs cimport (
+ ID,
+ LANG,
+ LENGTH,
+ LOWER,
+ NORM,
+ ORTH,
+ PREFIX,
+ SHAPE,
+ SUFFIX,
+ attr_id_t,
+)
from .strings cimport StringStore
+from .structs cimport LexemeC
+from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
from .vocab cimport Vocab
diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi
index 9b7a6156a..9980b9fce 100644
--- a/spacy/lexeme.pyi
+++ b/spacy/lexeme.pyi
@@ -1,8 +1,7 @@
-from typing import (
- Union,
- Any,
-)
+from typing import Any, Union
+
from thinc.types import Floats1d
+
from .tokens import Doc, Span, Token
from .vocab import Vocab
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index e70feaf9a..00e2c6258 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,24 +1,40 @@
# cython: embedsignature=True
# Compiler crashes on memory view coercion without this. Should report bug.
+cimport numpy as np
from cython.view cimport array as cvarray
from libc.string cimport memset
-cimport numpy as np
+
np.import_array()
+import warnings
+
import numpy
from thinc.api import get_array_module
-import warnings
+from .attrs cimport (
+ IS_ALPHA,
+ IS_ASCII,
+ IS_BRACKET,
+ IS_CURRENCY,
+ IS_DIGIT,
+ IS_LEFT_PUNCT,
+ IS_LOWER,
+ IS_PUNCT,
+ IS_QUOTE,
+ IS_RIGHT_PUNCT,
+ IS_SPACE,
+ IS_STOP,
+ IS_TITLE,
+ IS_UPPER,
+ LIKE_EMAIL,
+ LIKE_NUM,
+ LIKE_URL,
+)
from .typedefs cimport attr_t, flags_t
-from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-from .attrs cimport IS_CURRENCY
from .attrs import intify_attrs
from .errors import Errors, Warnings
-
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK
diff --git a/spacy/lookups.py b/spacy/lookups.py
index d7cc44fb3..1a2c44bfa 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,13 +1,13 @@
-from typing import Any, List, Union, Optional, Dict
+from collections import OrderedDict
from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
import srsly
from preshed.bloom import BloomFilter
-from collections import OrderedDict
from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
from .strings import get_string_id
-
+from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
UNSET = object()
diff --git a/spacy/matcher/__init__.py b/spacy/matcher/__init__.py
index a4f164847..f671f2e35 100644
--- a/spacy/matcher/__init__.py
+++ b/spacy/matcher/__init__.py
@@ -1,6 +1,6 @@
-from .matcher import Matcher
-from .phrasematcher import PhraseMatcher
from .dependencymatcher import DependencyMatcher
from .levenshtein import levenshtein
+from .matcher import Matcher
+from .phrasematcher import PhraseMatcher
__all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"]
diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi
index c19d3a71c..b9fbabda7 100644
--- a/spacy/matcher/dependencymatcher.pyi
+++ b/spacy/matcher/dependencymatcher.pyi
@@ -1,8 +1,9 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-from .matcher import Matcher
-from ..vocab import Vocab
+
from ..tokens.doc import Doc
from ..tokens.span import Span
+from ..vocab import Vocab
+from .matcher import Matcher
class DependencyMatcher:
"""Match dependency parse tree based on pattern rules."""
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index adf96702b..a214c0668 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,18 +1,16 @@
# cython: infer_types=True, profile=True
-from typing import List
+import warnings
from collections import defaultdict
from itertools import product
+from typing import List
-import warnings
-
-from .matcher cimport Matcher
-from ..vocab cimport Vocab
from ..tokens.doc cimport Doc
+from ..vocab cimport Vocab
+from .matcher cimport Matcher
from ..errors import Errors, Warnings
from ..tokens import Span
-
DELIMITER = "||"
INDEX_HEAD = 1
INDEX_RELOP = 0
@@ -432,22 +430,22 @@ cdef class DependencyMatcher:
return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _imm_right_child(self, doc, node):
- for child in doc[node].children:
+ for child in doc[node].rights:
if child.i == node + 1:
return [doc[child.i]]
return []
def _imm_left_child(self, doc, node):
- for child in doc[node].children:
+ for child in doc[node].lefts:
if child.i == node - 1:
return [doc[child.i]]
return []
def _right_child(self, doc, node):
- return [doc[child.i] for child in doc[node].children if child.i > node]
+ return [child for child in doc[node].rights]
def _left_child(self, doc, node):
- return [doc[child.i] for child in doc[node].children if child.i < node]
+ return [child for child in doc[node].lefts]
def _imm_right_parent(self, doc, node):
if doc[node].head.i == node + 1:
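Using `Token.rights` and `Token.lefts` makes these directional helpers self-documenting instead of filtering `Token.children` by index. The helpers back relational operators such as `>` in patterns; a sketch, assuming `en_core_web_sm` is installed:

    import spacy
    from spacy.matcher import DependencyMatcher

    nlp = spacy.load("en_core_web_sm")
    matcher = DependencyMatcher(nlp.vocab)
    pattern = [
        {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
        # ">" matches an immediate syntactic child of the anchor token
        {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "object",
         "RIGHT_ATTRS": {"DEP": "dobj"}},
    ]
    matcher.add("VERB_OBJ", [pattern])
    matches = matcher(nlp("She bought a car"))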
diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index 51854d562..2c82cea1d 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -1,11 +1,11 @@
+from cymem.cymem cimport Pool
from libc.stdint cimport int32_t
from libcpp.vector cimport vector
-from cymem.cymem cimport Pool
-from ..vocab cimport Vocab
-from ..typedefs cimport attr_t, hash_t
-from ..structs cimport TokenC
from ..lexeme cimport attr_id_t
+from ..structs cimport TokenC
+from ..typedefs cimport attr_t, hash_t
+from ..vocab cimport Vocab
cdef enum action_t:
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index 48922865b..c33b534cb 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,8 +1,19 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union
-from typing import Iterator, Iterable, overload
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ Iterator,
+ List,
+ Optional,
+ Tuple,
+ Union,
+ overload,
+)
+
from ..compat import Literal
-from ..vocab import Vocab
from ..tokens import Doc, Span
+from ..vocab import Vocab
class Matcher:
def __init__(
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index b886bd2ec..3d03f37ae 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,32 +1,43 @@
# cython: binding=True, infer_types=True, profile=True
-from typing import List, Iterable
+from typing import Iterable, List
-from libcpp.vector cimport vector
-from libc.stdint cimport int32_t, int8_t
-from libc.string cimport memset, memcmp
from cymem.cymem cimport Pool
+from libc.stdint cimport int8_t, int32_t
+from libc.string cimport memcmp, memset
+from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
import re
-import srsly
import warnings
-from ..typedefs cimport attr_t
+import srsly
+
+from ..attrs cimport (
+ DEP,
+ ENT_IOB,
+ ID,
+ LEMMA,
+ MORPH,
+ NULL_ATTR,
+ ORTH,
+ POS,
+ TAG,
+ attr_id_t,
+)
from ..structs cimport TokenC
-from ..vocab cimport Vocab
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
+from ..tokens.morphanalysis cimport MorphAnalysis
from ..tokens.span cimport Span
from ..tokens.token cimport Token
-from ..tokens.morphanalysis cimport MorphAnalysis
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
+from ..typedefs cimport attr_t
+from ..vocab cimport Vocab
-from .levenshtein import levenshtein_compare
-from ..schemas import validate_token_pattern
-from ..errors import Errors, MatchPatternError, Warnings
-from ..strings import get_string_id
from ..attrs import IDS
+from ..errors import Errors, MatchPatternError, Warnings
+from ..schemas import validate_token_pattern
+from ..strings import get_string_id
from ..util import registry
-
+from .levenshtein import levenshtein_compare
DEF PADDING = 5
diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd
index 1bdc19012..bffc1ac97 100644
--- a/spacy/matcher/phrasematcher.pxd
+++ b/spacy/matcher/phrasematcher.pxd
@@ -1,6 +1,6 @@
-from libcpp.vector cimport vector
from cymem.cymem cimport Pool
-from preshed.maps cimport key_t, MapStruct
+from libcpp.vector cimport vector
+from preshed.maps cimport MapStruct, key_t
from ..attrs cimport attr_id_t
from ..structs cimport SpanC
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 68e3386e4..27f6ba373 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,12 +1,13 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload
+
from ..compat import Literal
-from .matcher import Matcher
-from ..vocab import Vocab
from ..tokens import Doc, Span
+from ..vocab import Vocab
+from .matcher import Matcher
class PhraseMatcher:
def __init__(
- self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ...
+ self, vocab: Vocab, attr: Optional[Union[int, str]] = ..., validate: bool = ...
) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
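The stub now matches the runtime signature, where `attr` defaults to `"ORTH"`:

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    matcher = PhraseMatcher(nlp.vocab)  # attr may now be omitted per the stub
    matcher.add("PRODUCT", [nlp.make_doc("spaCy")])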
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 382029872..c407cf1cc 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,18 +1,20 @@
# cython: infer_types=True, profile=True
from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
+from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
import warnings
-from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
+
from ..attrs import IDS
+
from ..structs cimport TokenC
-from ..tokens.token cimport Token
from ..tokens.span cimport Span
+from ..tokens.token cimport Token
from ..typedefs cimport attr_t
-from ..schemas import TokenPattern
from ..errors import Errors, Warnings
+from ..schemas import TokenPattern
cdef class PhraseMatcher:
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py
index e46735102..89c836144 100644
--- a/spacy/ml/_character_embed.py
+++ b/spacy/ml/_character_embed.py
@@ -1,4 +1,5 @@
from typing import List
+
from thinc.api import Model
from thinc.types import Floats2d
diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py
index 3b60ec2ab..e2378a7ba 100644
--- a/spacy/ml/callbacks.py
+++ b/spacy/ml/callbacks.py
@@ -1,8 +1,8 @@
-from typing import Type, Callable, Dict, TYPE_CHECKING, List, Optional, Set
import functools
import inspect
import types
import warnings
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Type
from thinc.layers import with_nvtx_range
from thinc.model import Model, wrap_model_recursive
diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py
index c9c82f369..ce7c585cc 100644
--- a/spacy/ml/extract_ngrams.py
+++ b/spacy/ml/extract_ngrams.py
@@ -1,7 +1,7 @@
from thinc.api import Model
-from ..util import registry
from ..attrs import LOWER
+from ..util import registry
@registry.layers("spacy.extract_ngrams.v1")
diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py
index d5e9bc07c..ac0f5fa1b 100644
--- a/spacy/ml/extract_spans.py
+++ b/spacy/ml/extract_spans.py
@@ -1,6 +1,7 @@
-from typing import Tuple, Callable
+from typing import Callable, List, Tuple
+
from thinc.api import Model, to_numpy
-from thinc.types import Ragged, Ints1d
+from thinc.types import Ints1d, Ragged
from ..util import registry
@@ -52,14 +53,14 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
indices will be [5, 6, 7, 8, 8, 9].
"""
spans, lengths = _ensure_cpu(spans, lengths)
- indices = []
+ indices: List[int] = []
offset = 0
for i, length in enumerate(lengths):
spans_i = spans[i].dataXd + offset
for j in range(spans_i.shape[0]):
- indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index]
+ indices.extend(range(spans_i[j, 0], spans_i[j, 1])) # type: ignore[arg-type, call-overload]
offset += length
- return ops.flatten(indices, dtype="i", ndim_if_empty=1)
+ return ops.asarray1i(indices)
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
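A pure-Python sketch of the index computation, matching the docstring's example above (the real code additionally offsets indices per document):

    spans = [(5, 9), (8, 10)]
    indices = [i for start, end in spans for i in range(start, end)]
    assert indices == [5, 6, 7, 8, 8, 9]  # overlapping spans repeat indices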
diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py
index ed2918f02..06f1ff51a 100644
--- a/spacy/ml/featureextractor.py
+++ b/spacy/ml/featureextractor.py
@@ -1,6 +1,7 @@
-from typing import List, Union, Callable, Tuple
-from thinc.types import Ints2d
+from typing import Callable, List, Tuple, Union
+
from thinc.api import Model, registry
+from thinc.types import Ints2d
from ..tokens import Doc
diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py
index 9b7628f0e..5125018e5 100644
--- a/spacy/ml/models/__init__.py
+++ b/spacy/ml/models/__init__.py
@@ -1,6 +1,7 @@
from .entity_linker import * # noqa
from .multi_task import * # noqa
from .parser import * # noqa
+from .span_finder import * # noqa
from .spancat import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 7332ca199..b7100c00a 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,16 +1,31 @@
from pathlib import Path
-from typing import Optional, Callable, Iterable, List, Tuple
-from thinc.types import Floats2d
-from thinc.api import chain, list2ragged, reduce_mean, residual
-from thinc.api import Model, Maxout, Linear, tuplify, Ragged
+from typing import Callable, Iterable, List, Optional, Tuple
+
+from thinc.api import (
+ Linear,
+ Maxout,
+ Model,
+ Ragged,
+ chain,
+ list2ragged,
+ reduce_mean,
+ residual,
+ tuplify,
+)
+from thinc.types import Floats2d
-from ...util import registry
-from ...kb import KnowledgeBase, InMemoryLookupKB
-from ...kb import Candidate, get_candidates, get_candidates_batch
-from ...vocab import Vocab
-from ...tokens import Span, Doc
-from ..extract_spans import extract_spans
from ...errors import Errors
+from ...kb import (
+ Candidate,
+ InMemoryLookupKB,
+ KnowledgeBase,
+ get_candidates,
+ get_candidates_batch,
+)
+from ...tokens import Doc, Span
+from ...util import registry
+from ...vocab import Vocab
+from ..extract_spans import extract_spans
@registry.architectures("spacy.EntityLinker.v2")
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index 826fddd4f..b7faf1cd7 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -1,22 +1,33 @@
-from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
-from thinc.types import Floats2d
-from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
-from thinc.api import MultiSoftmax, list2array
-from thinc.api import to_categorical, CosineDistance, L2Distance
-from thinc.loss import Loss
-
-from ...util import registry, OOV_RANK
-from ...errors import Errors
-from ...attrs import ID
-from ...vectors import Mode as VectorsMode
+from functools import partial
+from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Tuple, cast
import numpy
-from functools import partial
+from thinc.api import (
+ CosineDistance,
+ L2Distance,
+ LayerNorm,
+ Linear,
+ Maxout,
+ Model,
+ MultiSoftmax,
+ Softmax,
+ chain,
+ list2array,
+ to_categorical,
+ zero_init,
+)
+from thinc.loss import Loss
+from thinc.types import Floats2d, Ints1d
+
+from ...attrs import ID, ORTH
+from ...errors import Errors
+from ...util import OOV_RANK, registry
+from ...vectors import Mode as VectorsMode
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
- from ...vocab import Vocab # noqa: F401
from ...tokens.doc import Doc # noqa: F401
+ from ...vocab import Vocab # noqa: F401
@registry.architectures("spacy.PretrainVectors.v1")
@@ -24,8 +35,6 @@ def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
- if vocab.vectors.mode != VectorsMode.default:
- raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
if vocab.vectors.shape[1] == 0:
raise ValueError(Errors.E875)
model = build_cloze_multi_task_model(
@@ -70,14 +79,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
"""Compute a loss based on a distance between the documents' vectors and
the prediction.
"""
- # The simplest way to implement this would be to vstack the
- # token.vector values, but that's a bit inefficient, especially on GPU.
- # Instead we fetch the index into the vectors table for each of our tokens,
- # and look them up all at once. This prevents data copying.
- ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
- target = docs[0].vocab.vectors.data[ids]
- target[ids == OOV_RANK] = 0
- d_target, loss = distance(prediction, target)
+ vocab = docs[0].vocab
+ if vocab.vectors.mode == VectorsMode.default:
+ # The simplest way to implement this would be to vstack the
+ # token.vector values, but that's a bit inefficient, especially on GPU.
+ # Instead we fetch the index into the vectors table for each of our
+ # tokens, and look them up all at once. This prevents data copying.
+ ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+ target = docs[0].vocab.vectors.data[ids]
+ target[ids == OOV_RANK] = 0
+ d_target, loss = distance(prediction, target)
+ elif vocab.vectors.mode == VectorsMode.floret:
+ keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
+ target = vocab.vectors.get_batch(keys)
+ target = ops.as_contig(target)
+ d_target, loss = distance(prediction, target)
+ else:
+ raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
return loss, d_target
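Moving the mode check into `get_vectors_loss` lets the vectors objective work with floret vectors as well as the default mode. A pretraining config sketch; the hyperparameter values are illustrative:

    [pretraining.objective]
    @architectures = "spacy.PretrainVectors.v1"
    maxout_pieces = 3
    hidden_size = 300
    loss = "cosine"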
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index a70d84dea..f6c0e565d 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,13 +1,14 @@
-from typing import Optional, List, cast
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from typing import List, Optional, cast
+
+from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
from thinc.types import Floats2d
-from ...errors import Errors
from ...compat import Literal
+from ...errors import Errors
+from ...tokens import Doc
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
-from ...tokens import Doc
@registry.architectures("spacy.TransitionBasedParser.v2")
diff --git a/spacy/ml/models/span_finder.py b/spacy/ml/models/span_finder.py
new file mode 100644
index 000000000..d327fc761
--- /dev/null
+++ b/spacy/ml/models/span_finder.py
@@ -0,0 +1,41 @@
+from typing import Callable, List, Tuple
+
+from thinc.api import Model, chain, with_array
+from thinc.types import Floats2d
+
+from ...tokens import Doc
+from ...util import registry
+
+InT = List[Doc]
+OutT = Floats2d
+
+
+@registry.architectures("spacy.SpanFinder.v1")
+def build_finder_model(
+ tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
+) -> Model[InT, OutT]:
+
+ logistic_layer: Model[List[Floats2d], List[Floats2d]] = with_array(scorer)
+ model: Model[InT, OutT] = chain(tok2vec, logistic_layer, flattener())
+ model.set_ref("tok2vec", tok2vec)
+ model.set_ref("scorer", scorer)
+ model.set_ref("logistic_layer", logistic_layer)
+
+ return model
+
+
+def flattener() -> Model[List[Floats2d], Floats2d]:
+ """Flattens the input to a 1-dimensional list of scores"""
+
+ def forward(
+        model: Model[List[Floats2d], Floats2d], X: List[Floats2d], is_train: bool
+ ) -> Tuple[Floats2d, Callable[[Floats2d], List[Floats2d]]]:
+ lens = model.ops.asarray1i([len(doc) for doc in X])
+ Y = model.ops.flatten(X)
+
+ def backprop(dY: Floats2d) -> List[Floats2d]:
+ return model.ops.unflatten(dY, lens)
+
+ return Y, backprop
+
+ return Model("Flattener", forward=forward)
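A hedged sketch of how the new architecture might be referenced from a training config; the required tok2vec and scorer sub-blocks are elided and would need to be filled in:

    [components.span_finder.model]
    @architectures = "spacy.SpanFinder.v1"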
diff --git a/spacy/ml/models/spancat.py b/spacy/ml/models/spancat.py
index 893db2e6d..140ec553a 100644
--- a/spacy/ml/models/spancat.py
+++ b/spacy/ml/models/spancat.py
@@ -1,11 +1,24 @@
from typing import List, Tuple, cast
-from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
-from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init
-from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
-from thinc.types import Ragged, Floats2d
-from ...util import registry
+from thinc.api import (
+ Linear,
+ Logistic,
+ Maxout,
+ Model,
+ chain,
+ concatenate,
+ glorot_uniform_init,
+ list2ragged,
+ reduce_first,
+ reduce_last,
+ reduce_max,
+ reduce_mean,
+ with_getitem,
+)
+from thinc.types import Floats2d, Ragged
+
from ...tokens import Doc
+from ...util import registry
from ..extract_spans import extract_spans
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index 9f8ef7b2b..8f1554fab 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -1,9 +1,10 @@
-from typing import Optional, List
-from thinc.api import zero_init, with_array, Softmax_v2, chain, Model
+from typing import List, Optional
+
+from thinc.api import Model, Softmax_v2, chain, with_array, zero_init
from thinc.types import Floats2d
-from ...util import registry
from ...tokens import Doc
+from ...util import registry
@registry.architectures("spacy.Tagger.v2")
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 9c7e607fe..ab14110d2 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -1,22 +1,39 @@
-from typing import Optional, List, cast
from functools import partial
+from typing import List, Optional, cast
-from thinc.types import Floats2d
-from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
-from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
-from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
-from thinc.api import with_cpu, Relu, residual, LayerNorm, resizable
+from thinc.api import (
+ Dropout,
+ LayerNorm,
+ Linear,
+ Logistic,
+ Maxout,
+ Model,
+ ParametricAttention,
+ Relu,
+ Softmax,
+ SparseLinear,
+ chain,
+ clone,
+ concatenate,
+ list2ragged,
+ reduce_mean,
+ reduce_sum,
+ residual,
+ resizable,
+ softmax_activation,
+ with_cpu,
+)
from thinc.layers.chain import init as init_chain
-from thinc.layers.resizable import resize_model, resize_linear_weighted
+from thinc.layers.resizable import resize_linear_weighted, resize_model
+from thinc.types import Floats2d
from ...attrs import ORTH
+from ...tokens import Doc
from ...util import registry
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
-from ...tokens import Doc
from .tok2vec import get_tok2vec_width
-
NEG_VALUE = -5000
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 30c7360ff..2e9d21ef4 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,17 +1,32 @@
-from typing import Optional, List, Union, cast
-from thinc.types import Floats2d, Ints2d, Ragged, Ints1d
-from thinc.api import chain, clone, concatenate, with_array, with_padded
-from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
-from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
+from typing import List, Optional, Union, cast
-from ...tokens import Doc
-from ...util import registry
+from thinc.api import (
+ HashEmbed,
+ Maxout,
+ Mish,
+ Model,
+ PyTorchLSTM,
+ chain,
+ clone,
+ concatenate,
+ expand_window,
+ list2ragged,
+ noop,
+ ragged2list,
+ residual,
+ with_array,
+ with_padded,
+)
+from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
+
+from ...attrs import intify_attr
from ...errors import Errors
from ...ml import _character_embed
-from ..staticvectors import StaticVectors
-from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import intify_attr
+from ...tokens import Doc
+from ...util import registry
+from ..featureextractor import FeatureExtractor
+from ..staticvectors import StaticVectors
@registry.architectures("spacy.Tok2VecListener.v1")
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index 8def6cea5..ca31c1699 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -1,7 +1,8 @@
-from libc.string cimport memset, memcpy
+from libc.string cimport memcpy, memset
from thinc.backends.cblas cimport CBlas
-from ..typedefs cimport weight_t, hash_t
+
from ..pipeline._parser_internals._state cimport StateC
+from ..typedefs cimport hash_t, weight_t
cdef struct SizesC:
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 961bf4d70..5cffc4c2d 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -1,19 +1,20 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
cimport numpy as np
from libc.math cimport exp
-from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
-from thinc.backends.linalg cimport Vec, VecVec
+from libc.string cimport memcpy, memset
from thinc.backends.cblas cimport saxpy, sgemm
+from thinc.backends.linalg cimport Vec, VecVec
import numpy
import numpy.random
-from thinc.api import Model, CupyOps, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps, get_ops
from .. import util
from ..errors import Errors
-from ..typedefs cimport weight_t, class_t, hash_t
+
from ..pipeline._parser_internals.stateclass cimport StateClass
+from ..typedefs cimport class_t, hash_t, weight_t
cdef WeightsC get_c_weights(model) except *:
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 04cfe912d..b75240c5d 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -1,11 +1,14 @@
-from typing import List, Tuple, Callable, Optional, Sequence, cast
-from thinc.initializers import glorot_uniform_init
-from thinc.util import partial
-from thinc.types import Ragged, Floats2d, Floats1d, Ints1d
-from thinc.api import Model, Ops, registry
+import warnings
+from typing import Callable, List, Optional, Sequence, Tuple, cast
+from thinc.api import Model, Ops, registry
+from thinc.initializers import glorot_uniform_init
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from thinc.util import partial
+
+from ..attrs import ORTH
+from ..errors import Errors, Warnings
from ..tokens import Doc
-from ..errors import Errors
from ..vectors import Mode
from ..vocab import Vocab
@@ -23,6 +26,8 @@ def StaticVectors(
linear projection to control the dimensionality. If a dropout rate is
specified, the dropout is applied per dimension over the whole batch.
"""
+ if key_attr != "ORTH":
+ warnings.warn(Warnings.W125, DeprecationWarning)
return Model(
"static_vectors",
forward,
@@ -39,9 +44,9 @@ def forward(
token_count = sum(len(doc) for doc in docs)
if not token_count:
return _handle_empty(model.ops, model.get_dim("nO"))
- key_attr: int = model.attrs["key_attr"]
- keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
vocab: Vocab = docs[0].vocab
+ key_attr: int = getattr(vocab.vectors, "attr", ORTH)
+ keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
if vocab.vectors.mode == Mode.default:
V = model.ops.asarray(vocab.vectors.data)
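With the key attribute now read from the vectors table, passing a custom `key_attr` still works but is deprecated. A sketch, assuming the factory's usual keyword arguments:

    import warnings
    from spacy.ml.staticvectors import StaticVectors

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        StaticVectors(nO=96, nM=300, key_attr="LOWER")  # now emits W125
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)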
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index ab4a969e2..e351ad4e5 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -1,6 +1,7 @@
from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+
from ..util import registry
+from .parser_model import ParserStepModel
@registry.layers("spacy.TransitionModel.v1")
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 8d449d065..968764b82 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,10 +1,10 @@
-from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMap
cimport numpy as np
+from cymem.cymem cimport Pool
from libc.stdint cimport uint64_t
+from preshed.maps cimport PreshMap
-from .structs cimport MorphAnalysisC
from .strings cimport StringStore
+from .structs cimport MorphAnalysisC
from .typedefs cimport attr_t, hash_t
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index c3ffc46a1..1062fff09 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,12 +1,13 @@
# cython: infer_types
-import numpy
import warnings
+import numpy
+
from .attrs cimport POS
-from .parts_of_speech import IDS as POS_IDS
-from .errors import Warnings
from . import symbols
+from .errors import Warnings
+from .parts_of_speech import IDS as POS_IDS
cdef class Morphology:
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 0bf5b4789..a0b2567f1 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -1,5 +1,6 @@
from . cimport symbols
+
cpdef enum univ_pos_t:
NO_TAG = 0
ADJ = symbols.ADJ
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py
index 245747061..d26884487 100644
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@@ -1,8 +1,9 @@
-from typing import List, Set, Dict, Iterable, ItemsView, Union, TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict, ItemsView, Iterable, List, Set, Union
+
from wasabi import msg
-from .tokens import Doc, Token, Span
from .errors import Errors
+from .tokens import Doc, Span, Token
from .util import dot_to_dict
if TYPE_CHECKING:
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 26931606b..40e3fd638 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -2,21 +2,22 @@ from .attributeruler import AttributeRuler
from .dep_parser import DependencyParser
from .edit_tree_lemmatizer import EditTreeLemmatizer
from .entity_linker import EntityLinker
-from .ner import EntityRecognizer
from .entityruler import EntityRuler
+from .functions import merge_entities, merge_noun_chunks, merge_subtokens
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
+from .ner import EntityRecognizer
from .pipe import Pipe
-from .trainable_pipe import TrainablePipe
-from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
+from .senter import SentenceRecognizer
+from .span_finder import SpanFinder
+from .span_ruler import SpanRuler
+from .spancat import SpanCategorizer
from .tagger import Tagger
from .textcat import TextCategorizer
-from .spancat import SpanCategorizer
-from .span_ruler import SpanRuler
from .textcat_multilabel import MultiLabel_TextCategorizer
from .tok2vec import Tok2Vec
-from .functions import merge_entities, merge_noun_chunks, merge_subtokens
+from .trainable_pipe import TrainablePipe
__all__ = [
"AttributeRuler",
@@ -31,6 +32,7 @@ __all__ = [
"SentenceRecognizer",
"Sentencizer",
"SpanCategorizer",
+ "SpanFinder",
"SpanRuler",
"Tagger",
"TextCategorizer",
diff --git a/spacy/pipeline/_edit_tree_internals/edit_trees.pxd b/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
index dc4289f37..3d63af921 100644
--- a/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
+++ b/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
@@ -2,8 +2,9 @@ from libc.stdint cimport uint32_t, uint64_t
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
-from ...typedefs cimport attr_t, hash_t, len_t
from ...strings cimport StringStore
+from ...typedefs cimport attr_t, hash_t, len_t
+
cdef extern from "
- {Children.toArray(children).flat().filter(isRelevant)}
+
+
+ {Children.toArray(children).flat().filter(isRelevant)}
+