Merge branch 'master' into feature/visualisation

This commit is contained in:
richardpaulhudson 2023-01-25 15:18:47 +01:00
commit fd66bce8c1
522 changed files with 47564 additions and 38729 deletions

View File

@ -4,11 +4,13 @@ about: Use this template if you came across a bug or unexpected behaviour differ
---
<!-- NOTE: For questions or install related issues, please open a Discussion instead. -->
## How to reproduce the behaviour
<!-- Include a code example or the steps that led to the problem. Please try to be as specific as possible. -->
## Your Environment
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
<!-- Include details of your environment. You can also type `python -m spacy info --markdown` and copy-paste the result here.-->
* Operating System:
* Python Version Used:
* spaCy Version Used:

View File

@ -1,8 +1,5 @@
blank_issues_enabled: false
contact_links:
- name: ⚠️ Python 3.10 Support
url: https://github.com/explosion/spaCy/discussions/9418
about: Python 3.10 wheels haven't been released yet, see the link for details.
- name: 🗯 Discussions Forum
url: https://github.com/explosion/spaCy/discussions
about: Install issues, usage questions, general discussion and anything else that isn't a bug report.

View File

@ -1,68 +1,56 @@
parameters:
python_version: ''
architecture: ''
prefix: ''
gpu: false
num_build_jobs: 1
architecture: 'x64'
num_build_jobs: 2
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
allowUnstable: true
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
displayName: 'Set variables'
- script: |
${{ parameters.prefix }} python -m pip install -U pip setuptools
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
displayName: "Install dependencies"
- script: |
${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
${{ parameters.prefix }} python setup.py sdist --formats=gztar
displayName: "Compile and build sdist"
python -m build --sdist
displayName: "Build sdist"
- script: python -m mypy spacy
- script: |
python -m mypy spacy
displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.10')
condition: ne(variables['python_version'], '3.6')
- task: DeleteFiles@1
inputs:
contents: "spacy"
displayName: "Delete source directory"
- task: DeleteFiles@1
inputs:
contents: "*.egg-info"
displayName: "Delete egg-info directory"
- script: |
${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
python -m pip freeze > installed.txt
python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- bash: |
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
${{ parameters.prefix }} python -m pip install dist/$SDIST
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
displayName: "Install GPU requirements"
condition: eq(${{ parameters.gpu }}, true)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy
displayName: "Run CPU tests"
condition: eq(${{ parameters.gpu }}, false)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true)
python -W error -c "import spacy"
displayName: "Test import"
- script: |
python -m spacy download ca_core_news_sm
@ -71,6 +59,11 @@ steps:
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
displayName: 'Test no warnings on load (#11713)'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'
@ -105,13 +98,22 @@ steps:
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
- script: |
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
- script: |
python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Test website/meta/universe.json'
condition: eq(variables['python_version'], '3.8')
- script: |
${{ parameters.prefix }} python -m pip install thinc-apple-ops
${{ parameters.prefix }} python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))

106
.github/contributors/Lucaterre.md vendored Normal file
View File

@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- |---------------|
| Name | Lucas Terriel |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2022-06-20 |
| GitHub username | Lucaterre |
| Website (optional) | |

106
.github/contributors/fonfonx.md vendored Normal file
View File

@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Xavier Fontaine |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2022-04-13 |
| GitHub username | fonfonx |
| Website (optional) | |

View File

@ -1,13 +0,0 @@
# Configuration for probot-no-response - https://github.com/probot/no-response
# Number of days of inactivity before an Issue is closed for lack of response
daysUntilClose: 14
# Label requiring a response
responseRequiredLabel: more-info-needed
# Comment to post when closing an Issue for lack of response. Set to `false` to disable
closeComment: >
This issue has been automatically closed because there has been no response
to a request for more information from the original author. With only the
information that is currently in the issue, there's not enough information
to take action. If you're the original author, feel free to reopen the issue
if you have or find the answers needed to investigate further.

67
.github/spacy_universe_alert.py vendored Normal file
View File

@ -0,0 +1,67 @@
import os
import sys
import json
from datetime import datetime
from slack_sdk.web.client import WebClient
CHANNEL = "#alerts-universe"
SLACK_TOKEN = os.environ.get("SLACK_BOT_TOKEN", "ENV VAR not available!")
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
client = WebClient(SLACK_TOKEN)
github_context = json.loads(sys.argv[1])
event = github_context['event']
pr_title = event['pull_request']["title"]
pr_link = event['pull_request']["patch_url"].replace(".patch", "")
pr_author_url = event['sender']["html_url"]
pr_author_name = pr_author_url.rsplit('/')[-1]
pr_created_at_dt = datetime.strptime(
event['pull_request']["created_at"],
DATETIME_FORMAT
)
pr_created_at = pr_created_at_dt.strftime("%c")
pr_updated_at_dt = datetime.strptime(
event['pull_request']["updated_at"],
DATETIME_FORMAT
)
pr_updated_at = pr_updated_at_dt.strftime("%c")
blocks = [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "📣 New spaCy Universe Project Alert ✨"
}
},
{
"type": "section",
"fields": [
{
"type": "mrkdwn",
"text": f"*Pull Request:*\n<{pr_link}|{pr_title}>"
},
{
"type": "mrkdwn",
"text": f"*Author:*\n<{pr_author_url}|{pr_author_name}>"
},
{
"type": "mrkdwn",
"text": f"*Created at:*\n {pr_created_at}"
},
{
"type": "mrkdwn",
"text": f"*Last Updated:*\n {pr_updated_at}"
}
]
}
]
client.chat_postMessage(
channel=CHANNEL,
text="spaCy universe project PR alert",
blocks=blocks
)
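For reference, the script above only touches a handful of fields from the `pull_request` event payload. A minimal sketch of the structure it expects (all values here are hypothetical):

```python
# Hypothetical payload containing only the fields spacy_universe_alert.py reads.
github_context = {
    "event": {
        "pull_request": {
            "title": "Add an example project to the universe",
            "patch_url": "https://github.com/explosion/spaCy/pull/0000.patch",
            "created_at": "2023-01-25T12:00:00Z",
            "updated_at": "2023-01-25T13:30:00Z",
        },
        "sender": {"html_url": "https://github.com/example-user"},
    }
}
# The workflow below serializes the real context with toJson(github) and passes
# it to the script as its single command-line argument.
```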

View File

@ -12,10 +12,10 @@ jobs:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v2
- uses: actions/setup-python@v4
- run: pip install black
- name: Auto-format code if needed
run: black spacy
@ -23,10 +23,11 @@ jobs:
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
uses: peter-evans/create-pull-request@v3
uses: peter-evans/create-pull-request@v4
with:
title: Auto-format code with black
labels: meta

View File

@ -8,14 +8,14 @@ on:
jobs:
explosion-bot:
runs-on: ubuntu-18.04
runs-on: ubuntu-latest
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1
- uses: actions/setup-python@v1
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- name: Install and run explosion-bot
run: |
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
@ -23,5 +23,5 @@ jobs:
env:
INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
ENABLED_COMMANDS: "test_gpu,test_slow"
ENABLED_COMMANDS: "test_gpu,test_slow,test_slow_gpu"
ALLOWED_TEAMS: "spaCy"

View File

@ -10,6 +10,7 @@ jobs:
fail-fast: false
matrix:
branch: [master, v4]
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Trigger buildkite build

View File

@ -15,7 +15,7 @@ jobs:
issue-manager:
runs-on: ubuntu-latest
steps:
- uses: tiangolo/issue-manager@0.2.1
- uses: tiangolo/issue-manager@0.4.0
with:
token: ${{ secrets.GITHUB_TOKEN }}
config: >
@ -25,5 +25,11 @@ jobs:
"message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
"remove_label_on_comment": true,
"remove_label_on_close": true
},
"more-info-needed": {
"delay": "P7D",
"message": "This issue has been automatically closed because there has been no response to a request for more information from the original author. With only the information that is currently in the issue, there's not enough information to take action. If you're the original author, feel free to reopen the issue if you have or find the answers needed to investigate further.",
"remove_label_on_comment": true,
"remove_label_on_close": true
}
}

View File

@ -15,7 +15,7 @@ jobs:
action:
runs-on: ubuntu-latest
steps:
- uses: dessant/lock-threads@v3
- uses: dessant/lock-threads@v4
with:
process-only: 'issues'
issue-inactive-days: '30'

View File

@ -10,10 +10,11 @@ jobs:
fail-fast: false
matrix:
branch: [master, v4]
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
with:
ref: ${{ matrix.branch }}
- name: Get commits from past 24 hours
@ -22,9 +23,9 @@ jobs:
today=$(date '+%Y-%m-%d %H:%M:%S')
yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
if git log --after="$yesterday" --before="$today" | grep commit ; then
echo "::set-output name=run_tests::true"
echo run_tests=true >> $GITHUB_OUTPUT
else
echo "::set-output name=run_tests::false"
echo run_tests=false >> $GITHUB_OUTPUT
fi
- name: Trigger buildkite build

View File

@ -0,0 +1,32 @@
name: spaCy universe project alert
on:
pull_request_target:
paths:
- "website/meta/universe.json"
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
PR_NUMBER: ${{github.event.number}}
run: |
echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install Bernadette app dependency and send an alert
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
GITHUB_CONTEXT: ${{ toJson(github) }}
CHANNEL: "#alerts-universe"
run: |
pip install slack-sdk==3.17.2 aiohttp==3.8.1
echo "$CHANNEL"
python .github/spacy_universe_alert.py "$GITHUB_CONTEXT"

11
.gitignore vendored
View File

@ -10,20 +10,11 @@ spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
# Website
website/.cache/
website/public/
website/node_modules
website/.npm
website/logs
*.log
npm-debug.log*
quickstart-training-generator.js
# Cython / C extensions
cythonize.json
spacy/*.html
*.cpp
*.c
*.so
# Vim / VSCode / editors

View File

@ -1,11 +1,12 @@
repos:
- repo: https://github.com/ambv/black
rev: 21.6b0
rev: 22.3.0
hooks:
- id: black
language_version: python3.7
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
additional_dependencies: ['click==8.0.4']
- repo: https://github.com/pycqa/flake8
rev: 5.0.4
hooks:
- id: flake8
args:

View File

@ -144,7 +144,7 @@ Changes to `.py` files will be effective immediately.
When fixing a bug, first create an
[issue](https://github.com/explosion/spaCy/issues) if one does not already
exist. The description text can be very short we don't want to make this too
exist. The description text can be very short we don't want to make this too
bureaucratic.
Next, add a test to the relevant file in the
@ -233,7 +233,7 @@ also want to keep an eye on unused declared variables or repeated
(i.e. overwritten) dictionary keys. If your code was formatted with `black`
(see above), you shouldn't see any formatting-related warnings.
The [`.flake8`](.flake8) config defines the configuration we use for this
The `flake8` section in [`setup.cfg`](setup.cfg) defines the configuration we use for this
codebase. For example, we're not super strict about the line length, and we're
excluding very large files like lemmatization and tokenizer exception tables.
@ -271,7 +271,8 @@ except: # noqa: E722
### Python conventions
All Python code must be written **compatible with Python 3.6+**.
All Python code must be written **compatible with Python 3.6+**. More detailed
code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
#### I/O and handling paths

View File

@ -1,4 +1,4 @@
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh
include LICENSE
include README.md
include pyproject.toml

View File

@ -8,15 +8,15 @@ be used in real products.
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and
currently supports tokenization and training for **60+ languages**. It features
currently supports tokenization and training for **70+ languages**. It features
state-of-the-art speed and **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💫 **Version 3.2 out now!**
💫 **Version 3.5 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@ -46,6 +46,7 @@ open-source software, released under the MIT license.
| 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3
@ -59,6 +60,7 @@ open-source software, released under the MIT license.
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
@ -79,7 +81,7 @@ more people can benefit from it.
## Features
- Support for **60+ languages**
- Support for **70+ languages**
- **Trained pipelines** for different languages and tasks
- Multi-task learning with pretrained **transformers** like BERT
- Support for pretrained **word vectors** and embeddings

View File

@ -31,8 +31,8 @@ jobs:
inputs:
versionSpec: "3.7"
- script: |
pip install flake8==3.9.2
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
displayName: "flake8"
- job: "Test"
@ -41,7 +41,7 @@ jobs:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
imageName: "ubuntu-latest"
imageName: "ubuntu-20.04"
python.version: "3.6"
# Python36Windows:
# imageName: "windows-latest"
@ -50,7 +50,7 @@ jobs:
# imageName: "macos-latest"
# python.version: "3.6"
# Python37Linux:
# imageName: "ubuntu-latest"
# imageName: "ubuntu-20.04"
# python.version: "3.7"
Python37Windows:
imageName: "windows-latest"
@ -76,15 +76,24 @@ jobs:
# Python39Mac:
# imageName: "macos-latest"
# python.version: "3.9"
Python310Linux:
imageName: "ubuntu-latest"
python.version: "3.10"
# Python310Linux:
# imageName: "ubuntu-latest"
# python.version: "3.10"
Python310Windows:
imageName: "windows-latest"
python.version: "3.10"
Python310Mac:
imageName: "macos-latest"
python.version: "3.10"
# Python310Mac:
# imageName: "macos-latest"
# python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
python.version: '3.11'
Python311Windows:
imageName: 'windows-latest'
python.version: '3.11'
Python311Mac:
imageName: 'macos-latest'
python.version: '3.11'
maxParallel: 4
pool:
vmImage: $(imageName)
@ -92,20 +101,3 @@ jobs:
- template: .github/azure-steps.yml
parameters:
python_version: '$(python.version)'
architecture: 'x64'
# - job: "TestGPU"
# dependsOn: "Validate"
# strategy:
# matrix:
# Python38LinuxX64_GPU:
# python.version: '3.8'
# pool:
# name: "LinuxX64_GPU"
# steps:
# - template: .github/azure-steps.yml
# parameters:
# python_version: '$(python.version)'
# architecture: 'x64'
# gpu: true
# num_build_jobs: 24

View File

@ -1,6 +1,9 @@
# build version constraints for use with wheelwright + multibuild
numpy==1.15.0; python_version<='3.7'
numpy==1.17.3; python_version=='3.8'
numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'
numpy==1.23.2; python_version=='3.11'
numpy; python_version>='3.12'

View File

@ -137,7 +137,7 @@ If any of the TODOs you've added are important and should be fixed soon, you sho
## Type hints
We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation.
We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. Ideally when developing, run `mypy spacy` on the code base to inspect any issues.
If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` - although you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return values.
@ -155,6 +155,13 @@ def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]:
return callback
```
For typing variables, we prefer the explicit format.
```diff
- var = value # type: Type
+ var: Type = value
```
For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type).
```python
@ -184,6 +191,8 @@ def load_model(name: str) -> "Language":
...
```
Note that we typically put the `from typing` import statements on the first line(s) of the Python module.
## Structuring logic
### Positional and keyword arguments
@ -268,6 +277,27 @@ If you have to use `try`/`except`, make sure to only include what's **absolutely
+ return [v.strip() for v in value.split(",")]
```
### Numeric comparisons
For numeric comparisons, as a general rule we always use `<` and `>=` and avoid the usage of `<=` and `>`. This is to ensure we consistently
apply inclusive lower bounds and exclusive upper bounds, helping to prevent off-by-one errors.
One exception to this rule is the ternary case. With a chain like
```python
if value >= 0 and value < max:
...
```
it's fine to rewrite this to the shorter form
```python
if 0 <= value < max:
...
```
even though this requires the usage of the `<=` operator.
### Iteration and comprehensions
We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
@ -448,6 +478,10 @@ Regression tests are tests that refer to bugs reported in specific issues. They
The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
### Testing Cython Code
If you're developing Cython code (`.pyx` files), those extensions will need to be built before the test runner can test that code - otherwise it's going to run the tests with stale code from the last time the extension was built. You can build the extensions locally with `python setup.py build_ext -i`.
### Constructing objects and state
Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation.
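A minimal sketch of that shape, using the `en_tokenizer` fixture mentioned above (the expected tokens assume the default English tokenizer):

```python
# Sketch only: en_tokenizer is one of the conftest.py fixtures referenced above.
def test_en_tokenizer_splits_trailing_punct(en_tokenizer):
    doc = en_tokenizer("This is a sentence.")  # set up state / perform the operation
    assert [t.text for t in doc] == ["This", "is", "a", "sentence", "."]
```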

View File

@ -0,0 +1,56 @@
# Explosion-bot
Explosion-bot is a robot that can be invoked to help with running particular test commands.
## Permissions
Only maintainers have permissions to summon explosion-bot. Each of the open source repos that use explosion-bot has its own team(s) of maintainers, and only github users who are members of those teams can successfully run bot commands.
## Running robot commands
To summon the robot, write a github comment on the issue/PR you wish to test. The comment must be in the following format:
```
@explosion-bot please test_gpu
```
Some things to note:
- The `@explosion-bot please` must be the beginning of the command - you cannot add anything in front of this or else the robot won't know how to parse it. Adding anything at the end aside from the test name will also confuse the robot, so keep it simple!
- The command name (such as `test_gpu`) must be one of the tests that the bot knows how to run. The available commands are documented in the bot's [workflow config](https://github.com/explosion/spaCy/blob/master/.github/workflows/explosionbot.yml#L26) and must match exactly one of the commands listed there.
- The robot can't do multiple things at once, so if you want it to run multiple tests, you'll have to summon it with one comment per test.
### Examples
- Execute spaCy slow GPU tests with a custom thinc branch from a spaCy PR:
```
@explosion-bot please test_slow_gpu --thinc-branch <branch_name>
```
`branch_name` can either be a named branch, e.g: `develop`, or an unmerged PR, e.g: `refs/pull/<pr_number>/head`.
- Execute spaCy Transformers GPU tests from a spaCy PR:
```
@explosion-bot please test_gpu --run-on spacy-transformers --run-on-branch master --spacy-branch current_pr
```
This will launch the GPU pipeline for the `spacy-transformers` repo on its `master` branch, using the current spaCy PR's branch to build spaCy. The name of the repository passed to `--run-on` is case-sensitive, e.g: use `spaCy` instead of `spacy`.
- General info about supported commands.
```
@explosion-bot please info
```
- Help text for a specific command
```
@explosion-bot please <command> --help
```
## Troubleshooting
If the robot isn't responding to commands as expected, you can check its logs in the [Github Action](https://github.com/explosion/spaCy/actions/workflows/explosionbot.yml).
For each command sent to the bot, there should be a run of the `explosion-bot` workflow. In the `Install and run explosion-bot` step, towards the end of the logs you should see info about the configuration that the bot was run with, as well as any errors that the bot encountered.

View File

@ -0,0 +1,82 @@
# spaCy Satellite Packages
This is a list of all the active repos relevant to spaCy besides the main one, with short descriptions, history, and current status. Archived repos will not be covered.
## Always Included in spaCy
These packages are always pulled in when you install spaCy. Most of them are direct dependencies, but some are transitive dependencies through other packages.
- [spacy-legacy](https://github.com/explosion/spacy-legacy): When an architecture in spaCy changes enough to get a new version, the old version is frozen and moved to spacy-legacy. This allows us to keep the core library slim while also preserving backwards compatibility.
- [thinc](https://github.com/explosion/thinc): Thinc is the machine learning library that powers trainable components in spaCy. It wraps backends like NumPy, PyTorch, and TensorFlow to provide a functional interface for specifying architectures.
- [catalogue](https://github.com/explosion/catalogue): Small library for adding function registries, like those used for model architectures in spaCy.
- [confection](https://github.com/explosion/confection): This library contains the functionality for config parsing that was formerly contained directly in Thinc.
- [spacy-loggers](https://github.com/explosion/spacy-loggers): Contains loggers beyond the default logger available in spaCy's core code base. This includes loggers integrated with third-party services, which may differ in release cadence from spaCy itself.
- [wasabi](https://github.com/explosion/wasabi): A command line formatting library, used for terminal output in spaCy.
- [srsly](https://github.com/explosion/srsly): A wrapper that vendors several serialization libraries for spaCy. Includes parsers for JSON, JSONL, MessagePack, (extended) Pickle, and YAML.
- [preshed](https://github.com/explosion/preshed): A Cython library for low-level data structures like hash maps, used for memory efficient data storage.
- [cython-blis](https://github.com/explosion/cython-blis): Fast matrix multiplication using BLIS without depending on system libraries. Required by Thinc, rather than spaCy directly.
- [murmurhash](https://github.com/explosion/murmurhash): A wrapper library for a C++ murmurhash implementation, used for string IDs in spaCy and preshed.
- [cymem](https://github.com/explosion/cymem): A small library for RAII-style memory management in Cython.
## Optional Extensions for spaCy
These are repos that can be used by spaCy but aren't part of a default installation. Many of these are wrappers to integrate various kinds of third-party libraries.
- [spacy-transformers](https://github.com/explosion/spacy-transformers): A wrapper for the [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) library that handles the extensive conversion necessary to coordinate spaCy's powerful `Doc` representation, training pipeline, and the Transformer embeddings. When released, this was known as `spacy-pytorch-transformers`, but it changed to the current name when HuggingFace updated the name of their library as well.
- [spacy-huggingface-hub](https://github.com/explosion/spacy-huggingface-hub): This package has a CLI script for uploading a packaged spaCy pipeline (created with `spacy package`) to the [Hugging Face Hub](https://huggingface.co/models).
- [spacy-alignments](https://github.com/explosion/spacy-alignments): A wrapper for the tokenizations library (mentioned below) with a modified build system to simplify cross-platform wheel creation. Used in spacy-transformers for aligning spaCy and HuggingFace tokenizations.
- [spacy-experimental](https://github.com/explosion/spacy-experimental): Experimental components that are not quite ready for inclusion in the main spaCy library. Usually there are unresolved questions around their APIs, so the experimental library allows us to expose them to the community for feedback before fully integrating them.
- [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data): A repository of linguistic data, such as lemmas, that takes up a lot of disk space. Originally created to reduce the size of the spaCy core library. This is mainly useful if you want the data included but aren't using a pretrained pipeline; for the affected languages, the relevant data is included in pretrained pipelines directly.
- [coreferee](https://github.com/explosion/coreferee): Coreference resolution for English, French, German and Polish, optimised for limited training data and easily extensible for further languages. Used as a spaCy pipeline component.
- [spacy-stanza](https://github.com/explosion/spacy-stanza): This is a wrapper that allows the use of Stanford's Stanza library in spaCy.
- [spacy-streamlit](https://github.com/explosion/spacy-streamlit): A wrapper for the Streamlit dashboard building library to help with integrating [displaCy](https://spacy.io/api/top-level/#displacy).
- [spacymoji](https://github.com/explosion/spacymoji): A library to add extra support for emoji to spaCy, such as including character names.
- [thinc-apple-ops](https://github.com/explosion/thinc-apple-ops): A special backend for OSX that uses Apple's native libraries for improved performance.
- [os-signpost](https://github.com/explosion/os-signpost): A Python package that allows you to use the `OSSignposter` API in OSX for performance analysis.
- [spacy-ray](https://github.com/explosion/spacy-ray): A wrapper to integrate spaCy with Ray, a distributed training framework. Currently a work in progress.
## Prodigy
[Prodigy](https://prodi.gy) is Explosion's easy to use and highly customizable tool for annotating data. Prodigy itself requires a license, but the repos below contain documentation, examples, and editor or notebook integrations.
- [prodigy-recipes](https://github.com/explosion/prodigy-recipes): Sample recipes for Prodigy, along with notebooks and other examples of usage.
- [vscode-prodigy](https://github.com/explosion/vscode-prodigy): A VS Code extension that lets you run Prodigy inside VS Code.
- [jupyterlab-prodigy](https://github.com/explosion/jupyterlab-prodigy): An extension for JupyterLab that lets you run Prodigy inside JupyterLab.
## Independent Tools or Projects
These are tools that may be related to or use spaCy, but are functionally independent projects in their own right as well.
- [floret](https://github.com/explosion/floret): A modification of fastText to use Bloom Embeddings. Can be used to add vectors with subword features to spaCy, and also works independently in the same manner as fastText.
- [sense2vec](https://github.com/explosion/sense2vec): A library to make embeddings of noun phrases or words coupled with their part of speech. This library uses spaCy.
- [spacy-vectors-builder](https://github.com/explosion/spacy-vectors-builder): This is a spaCy project that builds vectors using floret and a lot of input text. It handles downloading the input data as well as the actual building of vectors.
- [holmes-extractor](https://github.com/explosion/holmes-extractor): Information extraction from English and German texts based on predicate logic. Uses spaCy.
- [healthsea](https://github.com/explosion/healthsea): Healthsea is a project to extract information from comments about health supplements. Structurally, it's a self-contained, large spaCy project.
- [spacy-pkuseg](https://github.com/explosion/spacy-pkuseg): A fork of the pkuseg Chinese tokenizer. Used for Chinese support in spaCy, but also works independently.
- [ml-datasets](https://github.com/explosion/ml-datasets): This repo includes loaders for several standard machine learning datasets, like MNIST or WikiNER, and has historically been used in spaCy example code and documentation.
## Documentation and Informational Repos
These repos are used to support the spaCy docs or otherwise present information about spaCy or other Explosion projects.
- [projects](https://github.com/explosion/projects): The projects repo is used to show detailed examples of spaCy usage. Individual projects can be checked out using the spaCy command line tool, rather than checking out the projects repo directly.
- [spacy-course](https://github.com/explosion/spacy-course): Home to the interactive spaCy course for learning about how to use the library and some basic NLP principles.
- [spacy-io-binder](https://github.com/explosion/spacy-io-binder): Home to the notebooks used for interactive examples in the documentation.
## Organizational / Meta
These repos are used for organizing data around spaCy, but are not something an end user would need to install as part of using the library.
- [spacy-models](https://github.com/explosion/spacy-models): This repo contains metadata (but not training data) for all the spaCy models. This includes information about where their training data came from, version compatibility, and performance information. It also includes tests for the model packages, and the built models are hosted as releases of this repo.
- [wheelwright](https://github.com/explosion/wheelwright): A tool for automating our PyPI builds and releases.
- [ec2buildwheel](https://github.com/explosion/ec2buildwheel): A small project that allows you to build Python packages in the manner of cibuildwheel, but on any EC2 image. Used by wheelwright.
## Other
Repos that don't fit in any of the above categories.
- [blis](https://github.com/explosion/blis): A fork of the official BLIS library. The main branch is not updated, but work continues in various branches. This is used for cython-blis.
- [tokenizations](https://github.com/explosion/tokenizations): A library originally by Yohei Tamura to align strings with tolerance to some variations in features like case and diacritics, used for aligning tokens and wordpieces. Adopted and maintained by Explosion, but usually spacy-alignments is used instead.
- [conll-2012](https://github.com/explosion/conll-2012): A repo to hold some slightly cleaned up versions of the official scripts for the CoNLL 2012 shared task involving coreference resolution. Used in the coref project.
- [fastapi-explosion-extras](https://github.com/explosion/fastapi-explosion-extras): Some small tweaks to FastAPI used at Explosion.

View File

@ -127,3 +127,34 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
polyleven
---------
* Files: spacy/matcher/polyleven.c
MIT License
Copyright (c) 2021 Fujimoto Seiji <fujimoto@ceptord.net>
Copyright (c) 2021 Max Bachmann <kontakt@maxbachmann.de>
Copyright (c) 2022 Nick Mazuk
Copyright (c) 2022 Michael Weiss <code@mweiss.ch>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -5,9 +5,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.12,<8.1.0",
"blis>=0.4.0,<0.8.0",
"pathy",
"thinc>=8.1.0,<8.2.0",
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"

View File

@ -1,38 +1,40 @@
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.12,<8.1.0
blis>=0.4.0,<0.8.0
thinc>=8.1.0,<8.2.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.0,<1.1.0
srsly>=2.4.1,<3.0.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0
pathy>=0.3.5
typer>=0.3.0,<0.8.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0
pytest>=5.2.0
pytest>=5.2.0,!=7.1.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
mypy==0.910
mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black>=22.0,<23.0

View File

@ -22,6 +22,7 @@ classifiers =
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
@ -38,31 +39,31 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.12,<8.1.0
thinc>=8.1.0,<8.2.0
install_requires =
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.12,<8.1.0
blis>=0.4.0,<0.8.0
wasabi>=0.9.0,<1.1.0
srsly>=2.4.1,<3.0.0
thinc>=8.1.0,<8.2.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0
pathy>=0.3.5
# Third-party dependencies
typer>=0.3.0,<0.8.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]
@ -73,45 +74,53 @@ console_scripts =
lookups =
spacy_lookups_data>=1.0.3,<1.1.0
transformers =
spacy_transformers>=1.1.2,<1.2.0
spacy_transformers>=1.1.2,<1.3.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
cupy>=5.0.0b4,<11.0.0
cupy>=5.0.0b4,<12.0.0
cuda80 =
cupy-cuda80>=5.0.0b4,<11.0.0
cupy-cuda80>=5.0.0b4,<12.0.0
cuda90 =
cupy-cuda90>=5.0.0b4,<11.0.0
cupy-cuda90>=5.0.0b4,<12.0.0
cuda91 =
cupy-cuda91>=5.0.0b4,<11.0.0
cupy-cuda91>=5.0.0b4,<12.0.0
cuda92 =
cupy-cuda92>=5.0.0b4,<11.0.0
cupy-cuda92>=5.0.0b4,<12.0.0
cuda100 =
cupy-cuda100>=5.0.0b4,<11.0.0
cupy-cuda100>=5.0.0b4,<12.0.0
cuda101 =
cupy-cuda101>=5.0.0b4,<11.0.0
cupy-cuda101>=5.0.0b4,<12.0.0
cuda102 =
cupy-cuda102>=5.0.0b4,<11.0.0
cupy-cuda102>=5.0.0b4,<12.0.0
cuda110 =
cupy-cuda110>=5.0.0b4,<11.0.0
cupy-cuda110>=5.0.0b4,<12.0.0
cuda111 =
cupy-cuda111>=5.0.0b4,<11.0.0
cupy-cuda111>=5.0.0b4,<12.0.0
cuda112 =
cupy-cuda112>=5.0.0b4,<11.0.0
cupy-cuda112>=5.0.0b4,<12.0.0
cuda113 =
cupy-cuda113>=5.0.0b4,<11.0.0
cupy-cuda113>=5.0.0b4,<12.0.0
cuda114 =
cupy-cuda114>=5.0.0b4,<11.0.0
cupy-cuda114>=5.0.0b4,<12.0.0
cuda115 =
cupy-cuda115>=5.0.0b4,<11.0.0
cupy-cuda115>=5.0.0b4,<12.0.0
cuda116 =
cupy-cuda116>=5.0.0b4,<12.0.0
cuda117 =
cupy-cuda117>=5.0.0b4,<12.0.0
cuda11x =
cupy-cuda11x>=11.0.0,<12.0.0
cuda-autodetect =
cupy-wheel>=11.0.0,<12.0.0
apple =
thinc-apple-ops>=0.0.4,<1.0.0
thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py==0.9.0
natto-py>=0.9.0
th =
pythainlp>=2.0

View File

@ -23,16 +23,20 @@ Options.docstrings = True
PACKAGES = find_packages()
MOD_NAMES = [
"spacy.training.alignment_array",
"spacy.training.example",
"spacy.parts_of_speech",
"spacy.strings",
"spacy.lexeme",
"spacy.vocab",
"spacy.attrs",
"spacy.kb",
"spacy.kb.candidate",
"spacy.kb.kb",
"spacy.kb.kb_in_memory",
"spacy.ml.parser_model",
"spacy.morphology",
"spacy.pipeline.dep_parser",
"spacy.pipeline._edit_tree_internals.edit_trees",
"spacy.pipeline.morphologizer",
"spacy.pipeline.multitask",
"spacy.pipeline.ner",
@ -124,6 +128,8 @@ class build_ext_options:
class build_ext_subclass(build_ext, build_ext_options):
def build_extensions(self):
if self.parallel is None and os.environ.get("SPACY_NUM_BUILD_JOBS") is not None:
self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS"))
build_ext_options.build_options(self)
build_ext.build_extensions(self)
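With this override, installing from the sdist can pick up a `SPACY_NUM_BUILD_JOBS` environment variable (as the CI step above does) whenever no explicit `--parallel`/`-j` value is passed to `build_ext`. A rough standalone sketch of that fallback, using a hypothetical helper name:

```python
import os
from typing import Optional


def resolve_build_jobs(cli_parallel: Optional[int] = None) -> Optional[int]:
    # Hypothetical helper mirroring the override above: an explicit
    # -j/--parallel value wins, then SPACY_NUM_BUILD_JOBS, then the
    # default (a serial build).
    if cli_parallel is not None:
        return cli_parallel
    env_value = os.environ.get("SPACY_NUM_BUILD_JOBS")
    return int(env_value) if env_value is not None else None
```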
@ -201,10 +207,25 @@ def setup_package():
get_python_inc(plat_specific=True),
]
ext_modules = []
ext_modules.append(
Extension(
"spacy.matcher.levenshtein",
[
"spacy/matcher/levenshtein.pyx",
"spacy/matcher/polyleven.c",
],
language="c",
include_dirs=include_dirs,
)
)
for name in MOD_NAMES:
mod_path = name.replace(".", "/") + ".pyx"
ext = Extension(
name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"]
name,
[mod_path],
language="c++",
include_dirs=include_dirs,
extra_compile_args=["-std=c++11"],
)
ext_modules.append(ext)
print("Cythonizing sources")

View File

@ -31,25 +31,33 @@ def load(
name: Union[str, Path],
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = util.SimpleFrozenList(),
exclude: Iterable[str] = util.SimpleFrozenList(),
disable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
"""Load a spaCy model from an installed package or a local path.
name (str): Package name or model path.
vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
pipes will be disabled (but can be enabled later using nlp.enable_pipe).
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
return util.load_model(
name, vocab=vocab, disable=disable, exclude=exclude, config=config
name,
vocab=vocab,
disable=disable,
enable=enable,
exclude=exclude,
config=config,
)
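A short usage sketch of the `disable`/`enable`/`exclude` semantics described in the docstring (assumes a pipeline such as `en_core_web_sm` is installed; `enable` is the newly added option here):

```python
import spacy

# disable: components are loaded but switched off, and can be re-enabled later.
nlp = spacy.load("en_core_web_sm", disable=["ner"])
nlp.enable_pipe("ner")

# enable: every component *not* listed is disabled instead.
nlp_lean = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"])

# exclude: components are not loaded at all and cannot be re-enabled.
nlp_no_ner = spacy.load("en_core_web_sm", exclude=["ner"])
```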

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.2.2"
__version__ = "3.5.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -4,6 +4,7 @@ from ._util import app, setup_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .benchmark_speed import benchmark_speed_cli # noqa: F401
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
@ -14,7 +15,9 @@ from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
from .apply import apply # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401
@ -26,6 +29,7 @@ from .project.dvc import project_update_dvc # noqa: F401
from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.document import project_document # noqa: F401
from .find_threshold import find_threshold # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

View File

@ -12,7 +12,7 @@ from click.parser import split_arg_string
from typer.main import get_command
from contextlib import contextmanager
from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import has_cupy, gpu_is_available
from thinc.util import gpu_is_available
from configparser import InterpolationError
import os
@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from .. import about
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401
SDIST_SUFFIX = ".tar.gz"
@ -46,6 +46,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
BENCHMARK_HELP = """Commands for benchmarking pipelines."""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
# Wrappers for Typer's annotations. Initially created to set defaults and to
@ -54,12 +55,14 @@ Arg = typer.Argument
Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP)
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
app.add_typer(project_cli)
app.add_typer(debug_cli)
app.add_typer(benchmark_cli)
app.add_typer(init_cli)
@ -158,15 +161,15 @@ def load_project_config(
sys.exit(1)
validate_project_version(config)
validate_project_commands(config)
if interpolate:
err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config, overrides)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
dir_path = path / subdir
if not dir_path.exists():
dir_path.mkdir(parents=True)
if interpolate:
err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config, overrides)
return config
@ -331,7 +334,7 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
"""Upload a file.
src (Path): The source path.
@ -339,13 +342,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
"""
import smart_open
# Create parent directories for local paths
if isinstance(dest, Path):
if not dest.parent.exists():
dest.parent.mkdir(parents=True)
dest = str(dest)
with smart_open.open(dest, mode="wb") as output_file:
with src.open(mode="rb") as input_file:
output_file.write(input_file.read())
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
def download_file(
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
) -> None:
"""Download a file using smart_open.
src (str / FluidPath): The URL or local path of the file.
@ -358,9 +368,9 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
if dest.exists() and not force:
return None
src = str(src)
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
with smart_open.open(src, mode="rb", compression="disable") as input_file:
with dest.open(mode="wb") as output_file:
output_file.write(input_file.read())
shutil.copyfileobj(input_file, output_file)
def ensure_pathy(path):
@ -368,7 +378,7 @@ def ensure_pathy(path):
slow and annoying Google Cloud warning)."""
from pathy import Pathy # noqa: F811
return Pathy(path)
return Pathy.fluid(path)
def git_checkout(
@ -462,6 +472,23 @@ def git_sparse_checkout(repo, subpath, dest, branch):
shutil.move(str(source_path), str(dest))
def git_repo_branch_exists(repo: str, branch: str) -> bool:
"""Uses 'git ls-remote' to check if a repository and branch exists
repo (str): URL to get repo.
branch (str): Branch on repo to check.
RETURNS (bool): True if repo:branch exists.
"""
get_git_version()
cmd = f"git ls-remote {repo} {branch}"
# We might be tempted to use `--exit-code` with `git ls-remote`, but
# `run_command` handles the `returncode` for us, so we'll rely on
# the fact that stdout returns '' if the requested branch doesn't exist
ret = run_command(cmd, capture=True)
exists = ret.stdout != ""
return exists
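A rough usage sketch of the helper above; the repository URL and branch name are illustrative:

if git_repo_branch_exists("https://github.com/explosion/projects", "v3"):
    print("Repository and branch exist")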
def get_git_version(
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]:
@ -554,5 +581,41 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
require_gpu(use_gpu)
else:
local_msg.info("Using CPU")
if has_cupy and gpu_is_available():
if gpu_is_available():
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
"""Given a directory and a suffix, recursively find all files matching the suffix.
Directories or files with names beginning with a . are ignored, but hidden flags on
filesystems are not checked.
If the suffix is `None`, no suffix-based filtering is applied."""
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif suffix is not None and not path.parts[-1].endswith(suffix):
continue
else:
locs.append(path)
# It's good to sort these, in case the ordering messes up caching.
locs.sort()
return locs
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
as happens with `round(number, ndigits)`"""
if isinstance(number, float):
return f"{number:.{ndigits}f}"
else:
return str(number)
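A small sketch of the two new helpers; the directory path is hypothetical:

from pathlib import Path

jsonl_files = walk_directory(Path("corpus/raw"), suffix=".jsonl")  # hidden entries are skipped
print(f"Found {len(jsonl_files)} .jsonl files")
print(_format_number(0.5))  # "0.50" - trailing zeros are kept, unlike round(0.5, 2)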

spacy/cli/apply.py Normal file
View File

@ -0,0 +1,143 @@
import tqdm
import srsly
from itertools import chain
from pathlib import Path
from typing import Optional, List, Iterable, cast, Union
from wasabi import msg
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model
path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
Files with other extensions are treated as single plain text documents.
If a directory is provided it is traversed recursively to grab
all files to be processed.
The files can be a mixture of .spacy, .jsonl and text files.
If a .jsonl file is provided, the text is read from the
specified field ("text" by default)."""
out_help = "Path to save the resulting .spacy file"
code_help = (
"Path to Python file with additional " "code (registered functions) to be imported"
)
gold_help = "Use gold preprocessing provided in the .spacy files"
force_msg = (
"The provided output file already exists. "
"To force overwriting the output file, set the --force or -F flag."
)
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
"""
Stream Doc objects from DocBin.
"""
docbin = DocBin().from_disk(path)
for doc in docbin.get_docs(vocab):
yield doc
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
"""
Stream "text" field from JSONL. If the field "text" is
not found it raises error.
"""
for entry in srsly.read_jsonl(path):
if field not in entry:
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
else:
yield entry[field]
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
"""
Yields strings from text files in paths.
"""
for path in paths:
with open(path, "r") as fin:
text = fin.read()
yield text
@app.command("apply")
def apply_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help=path_help, exists=True),
output_file: Path = Arg(..., help=out_help, dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
n_process: int = Opt(1, "--n-process", "-n", help="Number of processes to use.")
):
"""
Apply a trained pipeline to documents to get predictions.
Expects a loadable spaCy pipeline and a path to the data, which
can be a directory or a file.
The data files can be provided in multiple formats:
1. .spacy files
2. .jsonl files with a specified "field" to read the text from.
3. Files with any other extension are assumed to contain
a single plain-text document.
DOCS: https://spacy.io/api/cli#apply
"""
data_path = ensure_path(data_path)
output_file = ensure_path(output_file)
code_path = ensure_path(code_path)
if output_file.exists() and not force_overwrite:
msg.fail(force_msg, exits=1)
if not data_path.exists():
msg.fail(f"Couldn't find data path: {data_path}", exits=1)
import_code(code_path)
setup_gpu(use_gpu)
apply(data_path, output_file, model, text_key, batch_size, n_process)
def apply(
data_path: Path,
output_file: Path,
model: str,
json_field: str,
batch_size: int,
n_process: int,
):
docbin = DocBin(store_user_data=True)
paths = walk_directory(data_path)
if len(paths) == 0:
docbin.to_disk(output_file)
msg.warn(
"Did not find data to process,"
f" {data_path} seems to be an empty directory."
)
return
nlp = load_model(model)
msg.good(f"Loaded model {model}")
vocab = nlp.vocab
streams: List[DocOrStrStream] = []
text_files = []
for path in paths:
if path.suffix == ".spacy":
streams.append(_stream_docbin(path, vocab))
elif path.suffix == ".jsonl":
streams.append(_stream_jsonl(path, json_field))
else:
text_files.append(path)
if len(text_files) > 0:
streams.append(_stream_texts(text_files))
datagen = cast(DocOrStrStream, chain(*streams))
for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
docbin.add(doc)
if output_file.suffix == "":
output_file = output_file.with_suffix(".spacy")
docbin.to_disk(output_file)
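A rough sketch of calling the helper directly rather than via the CLI; the paths and pipeline name are hypothetical:

from pathlib import Path

apply(
    data_path=Path("texts/"),           # directory mixing .spacy, .jsonl and plain-text files
    output_file=Path("predictions.spacy"),
    model="en_core_web_sm",             # any loadable pipeline package or path
    json_field="text",
    batch_size=16,
    n_process=1,
)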

View File

@ -0,0 +1,174 @@
from typing import Iterable, List, Optional
import random
from itertools import islice
import numpy
from pathlib import Path
import time
from tqdm import tqdm
import typer
from wasabi import msg
from .. import util
from ..language import Language
from ..tokens import Doc
from ..training import Corpus
from ._util import Arg, Opt, benchmark_cli, setup_gpu
@benchmark_cli.command(
"speed",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def benchmark_speed_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
# fmt: on
):
"""
Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
data in the binary .spacy format.
"""
setup_gpu(use_gpu=use_gpu, silent=False)
nlp = util.load_model(model)
batch_size = batch_size if batch_size is not None else nlp.batch_size
corpus = Corpus(data_path)
docs = [eg.predicted for eg in corpus(nlp)]
if len(docs) == 0:
msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
print(f"Warming up for {warmup_epochs} epochs...")
warmup(nlp, docs, warmup_epochs, batch_size)
print()
print(f"Benchmarking {n_batches} batches...")
wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
print()
print_outliers(wps)
print_mean_with_ci(wps)
# Lowercased, behaves as a context manager function.
class time_context:
"""Register the running time of a context."""
def __enter__(self):
self.start = time.perf_counter()
return self
def __exit__(self, type, value, traceback):
self.elapsed = time.perf_counter() - self.start
class Quartiles:
"""Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
of a sample."""
q1: float
q2: float
q3: float
iqr: float
def __init__(self, sample: numpy.ndarray) -> None:
self.q1 = numpy.quantile(sample, 0.25)
self.q2 = numpy.quantile(sample, 0.5)
self.q3 = numpy.quantile(sample, 0.75)
self.iqr = self.q3 - self.q1
def annotate(
nlp: Language, docs: List[Doc], batch_size: Optional[int]
) -> numpy.ndarray:
docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
wps = []
while True:
with time_context() as elapsed:
batch_docs = list(
islice(docs, batch_size if batch_size else nlp.batch_size)
)
if len(batch_docs) == 0:
break
n_tokens = count_tokens(batch_docs)
wps.append(n_tokens / elapsed.elapsed)
return numpy.array(wps)
def benchmark(
nlp: Language,
docs: List[Doc],
n_batches: int,
batch_size: int,
shuffle: bool,
) -> numpy.ndarray:
if shuffle:
bench_docs = [
nlp.make_doc(random.choice(docs).text)
for _ in range(n_batches * batch_size)
]
else:
bench_docs = [
nlp.make_doc(docs[i % len(docs)].text)
for i in range(n_batches * batch_size)
]
return annotate(nlp, bench_docs, batch_size)
def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
"""Apply a statistic to repeated random samples of an array."""
return numpy.fromiter(
(
statistic(numpy.random.choice(x, len(x), replace=True))
for _ in range(iterations)
),
numpy.float64,
)
def count_tokens(docs: Iterable[Doc]) -> int:
return sum(len(doc) for doc in docs)
def print_mean_with_ci(sample: numpy.ndarray):
mean = numpy.mean(sample)
bootstrap_means = bootstrap(sample)
bootstrap_means.sort()
# 95% confidence interval
low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
high = bootstrap_means[int(len(bootstrap_means) * 0.975)]
print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
def print_outliers(sample: numpy.ndarray):
quartiles = Quartiles(sample)
n_outliers = numpy.sum(
(sample < (quartiles.q1 - 1.5 * quartiles.iqr))
| (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
)
n_extreme_outliers = numpy.sum(
(sample < (quartiles.q1 - 3.0 * quartiles.iqr))
| (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
)
print(
f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample)}%"
)
def warmup(
nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
) -> numpy.ndarray:
docs = warmup_epochs * docs
return annotate(nlp, docs, batch_size)
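As a rough illustration of the bootstrap confidence interval used by print_mean_with_ci, with made-up words-per-second samples:

import numpy

sample = numpy.array([950.0, 1020.0, 990.0, 1005.0, 870.0])  # made-up measurements
means = bootstrap(sample, statistic=numpy.mean, iterations=1000)
means.sort()
low, high = means[int(len(means) * 0.025)], means[int(len(means) * 0.975)]
print(f"Mean: {sample.mean():.1f} words/s (95% CI: {low - sample.mean():.1f} +{high - sample.mean():.1f})")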

View File

@ -1,4 +1,4 @@
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
from typing import Callable, Iterable, Mapping, Optional, Any, Union
from enum import Enum
from pathlib import Path
from wasabi import Printer
@ -7,7 +7,7 @@ import re
import sys
import itertools
from ._util import app, Arg, Opt
from ._util import app, Arg, Opt, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
"json": json_to_docs,
}
AUTO = "auto"
# File types that can be written to stdout
FILE_TYPES_STDOUT = ("json",)
@ -49,7 +51,7 @@ def convert_cli(
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
@ -70,8 +72,8 @@ def convert_cli(
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
silent = output_dir == "-"
msg = Printer(no_print=silent)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
converter = _get_converter(msg, converter, input_path)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
convert(
input_path,
output_dir,
@ -100,7 +102,7 @@ def convert(
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: str = "auto",
converter: str,
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
concatenate: bool = False,
@ -189,33 +191,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
return None
def walk_directory(path: Path, converter: str) -> List[Path]:
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif converter == "json" and not path.parts[-1].endswith("json"):
continue
elif converter == "conll" and not path.parts[-1].endswith("conll"):
continue
elif converter == "iob" and not path.parts[-1].endswith("iob"):
continue
else:
locs.append(path)
# It's good to sort these, in case the ordering messes up cache.
locs.sort()
return locs
def verify_cli_args(
msg: Printer,
input_path: Path,
@ -239,18 +214,22 @@ def verify_cli_args(
input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if converter == "auto" and len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
if converter != "auto" and converter not in CONVERTERS:
if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
def _get_converter(msg, converter, input_path: Path):
if input_path.is_dir():
input_path = walk_directory(input_path, converter)[0]
if converter == "auto":
if converter == AUTO:
input_locs = walk_directory(input_path, suffix=None)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
input_path = input_locs[0]
else:
input_path = walk_directory(input_path, suffix=converter)[0]
if converter == AUTO:
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
with input_path.open(encoding="utf8") as file_:

View File

@ -6,12 +6,14 @@ import sys
import srsly
from wasabi import Printer, MESSAGES, msg
import typer
import math
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from ..training import Example
from ._util import import_code, debug_cli, _format_number
from ..training import Example, remove_bilu_prefix
from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline import TrainablePipe
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer, SpanCategorizer
@ -19,6 +21,7 @@ from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
from ..compat import Literal
from ..vectors import Mode as VectorsMode
from .. import util
@ -29,6 +32,12 @@ DEP_LABEL_THRESHOLD = 20
# Minimum number of expected examples to train a new pipeline
BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000
# Arbitrary threshold where SpanCat performs well
SPAN_DISTINCT_THRESHOLD = 1
# Arbitrary threshold where SpanCat performs well
BOUNDARY_DISTINCT_THRESHOLD = 1
# Arbitrary threshold for filtering span lengths during reporting (percentage)
SPAN_LENGTH_THRESHOLD_PERCENTAGE = 90
@debug_cli.command(
@ -170,26 +179,34 @@ def debug_data(
show=verbose,
)
if len(nlp.vocab.vectors):
msg.info(
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
)
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
msg.warn(
"{} words in training data without vectors ({:.0f}%)".format(
n_missing_vectors,
100 * (n_missing_vectors / gold_train_data["n_words"]),
),
)
msg.text(
"10 most common words without vectors: {}".format(
_format_labels(
gold_train_data["words_missing_vectors"].most_common(10),
counts=True,
)
),
show=verbose,
)
if nlp.vocab.vectors.mode == VectorsMode.floret:
msg.info(
f"floret vectors with {len(nlp.vocab.vectors)} vectors, "
f"{nlp.vocab.vectors_length} dimensions, "
f"{nlp.vocab.vectors.minn}-{nlp.vocab.vectors.maxn} char "
f"n-gram subwords"
)
else:
msg.info(
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
)
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
msg.warn(
"{} words in training data without vectors ({:.0f}%)".format(
n_missing_vectors,
100 * (n_missing_vectors / gold_train_data["n_words"]),
),
)
msg.text(
"10 most common words without vectors: {}".format(
_format_labels(
gold_train_data["words_missing_vectors"].most_common(10),
counts=True,
)
),
show=verbose,
)
else:
msg.info("No word vectors present in the package")
@ -238,6 +255,69 @@ def debug_data(
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
with msg.loading("Obtaining span characteristics..."):
span_characteristics = _get_span_characteristics(
train_dataset, gold_train_data, spans_key
)
msg.info(f"Span characteristics for spans_key '{spans_key}'")
msg.info("SD = Span Distinctiveness, BD = Boundary Distinctiveness")
_print_span_characteristics(span_characteristics)
_span_freqs = _get_spans_length_freq_dist(
gold_train_data["spans_length"][spans_key]
)
_filtered_span_freqs = _filter_spans_length_freq_dist(
_span_freqs, threshold=SPAN_LENGTH_THRESHOLD_PERCENTAGE
)
msg.info(
f"Over {SPAN_LENGTH_THRESHOLD_PERCENTAGE}% of spans have lengths of 1 -- "
f"{max(_filtered_span_freqs.keys())} "
f"(min={span_characteristics['min_length']}, max={span_characteristics['max_length']}). "
f"The most common span lengths are: {_format_freqs(_filtered_span_freqs)}. "
"If you are using the n-gram suggester, note that omitting "
"infrequent n-gram lengths can greatly improve speed and "
"memory usage."
)
msg.text(
f"Full distribution of span lengths: {_format_freqs(_span_freqs)}",
show=verbose,
)
# Add report regarding span characteristics
if span_characteristics["avg_sd"] < SPAN_DISTINCT_THRESHOLD:
msg.warn("Spans may not be distinct from the rest of the corpus")
else:
msg.good("Spans are distinct from the rest of the corpus")
p_spans = span_characteristics["p_spans"].values()
all_span_tokens: Counter = sum(p_spans, Counter())
most_common_spans = [w for w, _ in all_span_tokens.most_common(10)]
msg.text(
"10 most common span tokens: {}".format(
_format_labels(most_common_spans)
),
show=verbose,
)
# Add report regarding span boundary characteristics
if span_characteristics["avg_bd"] < BOUNDARY_DISTINCT_THRESHOLD:
msg.warn("Boundary tokens are not distinct from the rest of the corpus")
else:
msg.good("Boundary tokens are distinct from the rest of the corpus")
p_bounds = span_characteristics["p_bounds"].values()
all_span_bound_tokens: Counter = sum(p_bounds, Counter())
most_common_bounds = [w for w, _ in all_span_bound_tokens.most_common(10)]
msg.text(
"10 most common span boundary tokens: {}".format(
_format_labels(most_common_bounds)
),
show=verbose,
)
if has_low_data_warning:
msg.text(
f"To train a new span type, your data should include at "
@ -282,7 +362,7 @@ def debug_data(
if label != "-"
]
labels_with_counts = _format_labels(labels_with_counts, counts=True)
msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
missing_labels = model_labels - labels
if missing_labels:
msg.warn(
@ -638,6 +718,9 @@ def _compile_gold(
"words": Counter(),
"roots": Counter(),
"spancat": dict(),
"spans_length": dict(),
"spans_per_type": dict(),
"sb_per_type": dict(),
"ws_ents": 0,
"boundary_cross_ents": 0,
"n_words": 0,
@ -676,21 +759,66 @@ def _compile_gold(
# "Illegal" whitespace entity
data["ws_ents"] += 1
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
combined_label = remove_bilu_prefix(label)
data["ner"][combined_label] += 1
if sent_starts[i] == True and label.startswith(("I-", "L-")):
if sent_starts[i] and label.startswith(("I-", "L-")):
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
if "spancat" in factory_names:
for span_key in list(eg.reference.spans.keys()):
if span_key not in data["spancat"]:
data["spancat"][span_key] = Counter()
for i, span in enumerate(eg.reference.spans[span_key]):
for spans_key in list(eg.reference.spans.keys()):
# Obtain the span frequency
if spans_key not in data["spancat"]:
data["spancat"][spans_key] = Counter()
for i, span in enumerate(eg.reference.spans[spans_key]):
if span.label_ is None:
continue
else:
data["spancat"][span_key][span.label_] += 1
data["spancat"][spans_key][span.label_] += 1
# Obtain the span length
if spans_key not in data["spans_length"]:
data["spans_length"][spans_key] = dict()
for span in gold.spans[spans_key]:
if span.label_ is None:
continue
if span.label_ not in data["spans_length"][spans_key]:
data["spans_length"][spans_key][span.label_] = []
data["spans_length"][spans_key][span.label_].append(len(span))
# Obtain spans per span type
if spans_key not in data["spans_per_type"]:
data["spans_per_type"][spans_key] = dict()
for span in gold.spans[spans_key]:
if span.label_ not in data["spans_per_type"][spans_key]:
data["spans_per_type"][spans_key][span.label_] = []
data["spans_per_type"][spans_key][span.label_].append(span)
# Obtain boundary tokens per span type
window_size = 1
if spans_key not in data["sb_per_type"]:
data["sb_per_type"][spans_key] = dict()
for span in gold.spans[spans_key]:
if span.label_ not in data["sb_per_type"][spans_key]:
# Creating a data structure that holds the start and
# end tokens for each span type
data["sb_per_type"][spans_key][span.label_] = {
"start": [],
"end": [],
}
for offset in range(window_size):
sb_start_idx = span.start - (offset + 1)
if sb_start_idx >= 0:
data["sb_per_type"][spans_key][span.label_]["start"].append(
gold[sb_start_idx : sb_start_idx + 1]
)
sb_end_idx = span.end + (offset + 1)
if sb_end_idx <= len(gold):
data["sb_per_type"][spans_key][span.label_]["end"].append(
gold[sb_end_idx - 1 : sb_end_idx]
)
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
data["cats"].update(gold.cats)
if any(val not in (0, 1) for val in gold.cats.values()):
@ -761,6 +889,16 @@ def _format_labels(
return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
def _format_freqs(freqs: Dict[int, float], sort: bool = True) -> str:
if sort:
freqs = dict(sorted(freqs.items()))
_freqs = [(str(k), v) for k, v in freqs.items()]
return ", ".join(
[f"{l} ({c}%)" for l, c in cast(Iterable[Tuple[str, float]], _freqs)]
)
def _get_examples_without_label(
data: Sequence[Example],
label: str,
@ -771,7 +909,7 @@ def _get_examples_without_label(
for eg in data:
if component == "ner":
labels = [
label.split("-")[1]
remove_bilu_prefix(label)
for label in eg.get_aligned_ner()
if label not in ("O", "-", None)
]
@ -797,6 +935,7 @@ def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
labels: Set[str] = set()
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
assert isinstance(pipe, TrainablePipe)
labels.update(pipe.labels)
return labels
@ -815,3 +954,177 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
labels[pipe.key] = set()
labels[pipe.key].update(pipe.labels)
return labels
def _gmean(l: List) -> float:
"""Compute geometric mean of a list"""
return math.exp(math.fsum(math.log(i) for i in l) / len(l))
def _wgt_average(metric: Dict[str, float], frequencies: Counter) -> float:
total = sum(value * frequencies[span_type] for span_type, value in metric.items())
return total / sum(frequencies.values())
def _get_distribution(docs, normalize: bool = True) -> Counter:
"""Get the frequency distribution given a set of Docs"""
word_counts: Counter = Counter()
for doc in docs:
for token in doc:
# Normalize the text
t = token.text.lower().replace("``", '"').replace("''", '"')
word_counts[t] += 1
if normalize:
total = sum(word_counts.values(), 0.0)
word_counts = Counter({k: v / total for k, v in word_counts.items()})
return word_counts
def _get_kl_divergence(p: Counter, q: Counter) -> float:
"""Compute the Kullback-Leibler divergence from two frequency distributions"""
total = 0.0
for word, p_word in p.items():
total += p_word * math.log(p_word / q[word])
return total
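A toy sketch of the distinctiveness computation, with made-up token distributions:

from collections import Counter

p_span = Counter({"york": 0.5, "new": 0.5})                # token distribution inside spans (made up)
p_corpus = Counter({"the": 0.4, "new": 0.3, "york": 0.3})  # corpus-wide distribution (made up)
print(_get_kl_divergence(p_span, p_corpus))  # larger value = span tokens are more distinct from the corpus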
def _format_span_row(span_data: List[Dict], labels: List[str]) -> List[Any]:
"""Compile into one list for easier reporting"""
d = {
label: [label] + list(_format_number(d[label]) for d in span_data)
for label in labels
}
return list(d.values())
def _get_span_characteristics(
examples: List[Example], compiled_gold: Dict[str, Any], spans_key: str
) -> Dict[str, Any]:
"""Obtain all span characteristics"""
data_labels = compiled_gold["spancat"][spans_key]
# Get lengths
span_length = {
label: _gmean(l)
for label, l in compiled_gold["spans_length"][spans_key].items()
}
spans_per_type = {
label: len(spans)
for label, spans in compiled_gold["spans_per_type"][spans_key].items()
}
min_lengths = [min(l) for l in compiled_gold["spans_length"][spans_key].values()]
max_lengths = [max(l) for l in compiled_gold["spans_length"][spans_key].values()]
# Get relevant distributions: corpus, spans, span boundaries
p_corpus = _get_distribution([eg.reference for eg in examples], normalize=True)
p_spans = {
label: _get_distribution(spans, normalize=True)
for label, spans in compiled_gold["spans_per_type"][spans_key].items()
}
p_bounds = {
label: _get_distribution(sb["start"] + sb["end"], normalize=True)
for label, sb in compiled_gold["sb_per_type"][spans_key].items()
}
# Compute for actual span characteristics
span_distinctiveness = {
label: _get_kl_divergence(freq_dist, p_corpus)
for label, freq_dist in p_spans.items()
}
sb_distinctiveness = {
label: _get_kl_divergence(freq_dist, p_corpus)
for label, freq_dist in p_bounds.items()
}
return {
"sd": span_distinctiveness,
"bd": sb_distinctiveness,
"spans_per_type": spans_per_type,
"lengths": span_length,
"min_length": min(min_lengths),
"max_length": max(max_lengths),
"avg_sd": _wgt_average(span_distinctiveness, data_labels),
"avg_bd": _wgt_average(sb_distinctiveness, data_labels),
"avg_length": _wgt_average(span_length, data_labels),
"labels": list(data_labels.keys()),
"p_spans": p_spans,
"p_bounds": p_bounds,
}
def _print_span_characteristics(span_characteristics: Dict[str, Any]):
"""Print all span characteristics into a table"""
headers = ("Span Type", "Length", "SD", "BD", "N")
# Wasabi has this at 30 by default, but we might have some long labels
max_col = max(30, max(len(label) for label in span_characteristics["labels"]))
# Prepare table data with all span characteristics
table_data = [
span_characteristics["lengths"],
span_characteristics["sd"],
span_characteristics["bd"],
span_characteristics["spans_per_type"],
]
table = _format_span_row(
span_data=table_data, labels=span_characteristics["labels"]
)
# Prepare table footer with weighted averages
footer_data = [
span_characteristics["avg_length"],
span_characteristics["avg_sd"],
span_characteristics["avg_bd"],
]
footer = (
["Wgt. Average"] + ["{:.2f}".format(round(f, 2)) for f in footer_data] + ["-"]
)
msg.table(
table,
footer=footer,
header=headers,
divider=True,
aligns=["l"] + ["r"] * (len(footer_data) + 1),
max_col=max_col,
)
def _get_spans_length_freq_dist(
length_dict: Dict, threshold=SPAN_LENGTH_THRESHOLD_PERCENTAGE
) -> Dict[int, float]:
"""Get frequency distribution of spans length under a certain threshold"""
all_span_lengths = []
for _, lengths in length_dict.items():
all_span_lengths.extend(lengths)
freq_dist: Counter = Counter()
for i in all_span_lengths:
if freq_dist.get(i):
freq_dist[i] += 1
else:
freq_dist[i] = 1
# We will be working with percentages instead of raw counts
freq_dist_percentage = {}
for span_length, count in freq_dist.most_common():
percentage = (count / len(all_span_lengths)) * 100.0
percentage = round(percentage, 2)
freq_dist_percentage[span_length] = percentage
return freq_dist_percentage
def _filter_spans_length_freq_dist(
freq_dist: Dict[int, float], threshold: int
) -> Dict[int, float]:
"""Filter frequency distribution with respect to a threshold
We're going to filter all the span lengths that fall
around a percentage threshold when summed.
"""
total = 0.0
filtered_freq_dist = {}
for span_length, dist in freq_dist.items():
if total >= threshold:
break
else:
filtered_freq_dist[span_length] = dist
total += dist
return filtered_freq_dist
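A small sketch of the two helpers above with a made-up length dictionary:

toy_lengths = {"PERSON": [1] * 8 + [2], "ORG": [5]}         # made-up span lengths per label
freqs = _get_spans_length_freq_dist(toy_lengths)            # {1: 80.0, 2: 10.0, 5: 10.0}
print(_filter_spans_length_freq_dist(freqs, threshold=90))  # {1: 80.0, 2: 10.0} - rare length 5 dropped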

spacy/cli/debug_diff.py Normal file
View File

@ -0,0 +1,89 @@
from typing import Optional
import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
from .init_config import init_config, Optimizations
@debug_cli.command(
"diff-config",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_diff_cli(
# fmt: off
ctx: typer.Context,
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
# fmt: on
):
"""Show a diff of a config file with respect to spaCy's defaults or another config file. If
additional settings were used in the creation of the config file, then you
must supply these as extra parameters to the command when comparing to the default settings. The generated diff
can also be used when posting to the discussion forum to provide more
information for the maintainers.
The `optimize`, `gpu`, and `pretraining` options are only relevant when
comparing against the default configuration (or specifically when `compare_to` is None).
DOCS: https://spacy.io/api/cli#debug-diff
"""
debug_diff(
config_path=config_path,
compare_to=compare_to,
gpu=gpu,
optimize=optimize,
pretraining=pretraining,
markdown=markdown,
)
def debug_diff(
config_path: Path,
compare_to: Optional[Path],
gpu: bool,
optimize: Optimizations,
pretraining: bool,
markdown: bool,
):
msg = Printer()
with show_validation_error(hint_fill=False):
user_config = load_config(config_path)
if compare_to:
other_config = load_config(compare_to)
else:
# Recreate a default config based on the user's config
lang = user_config["nlp"]["lang"]
pipeline = list(user_config["nlp"]["pipeline"])
msg.info(f"Found user-defined language: '{lang}'")
msg.info(f"Found user-defined pipelines: {pipeline}")
other_config = init_config(
lang=lang,
pipeline=pipeline,
optimize=optimize.value,
gpu=gpu,
pretraining=pretraining,
silent=True,
)
user = user_config.to_str()
other = other_config.to_str()
if user == other:
msg.warn("No diff to show: configs are identical")
else:
diff_text = diff_strings(other, user, add_symbols=markdown)
if markdown:
md = MarkdownRenderer()
md.add(md.code_block(diff_text, "diff"))
print(md.text)
else:
print(diff_text)
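A rough sketch of calling the function directly; the config path is hypothetical:

from pathlib import Path

debug_diff(
    config_path=Path("config.cfg"),     # hypothetical user config
    compare_to=None,                    # None = diff against a freshly generated default config
    gpu=False,
    optimize=Optimizations.efficiency,
    pretraining=False,
    markdown=False,
)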

View File

@ -7,6 +7,7 @@ import typer
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS
@ -19,7 +20,7 @@ def download_cli(
ctx: typer.Context,
model: str = Arg(..., help="Name of pipeline package to download"),
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
# fmt: on
):
"""
@ -35,7 +36,12 @@ def download_cli(
download(model, direct, sdist, *ctx.args)
def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
def download(
model: str,
direct: bool = False,
sdist: bool = False,
*pip_args,
) -> None:
if (
not (is_package("spacy") or is_package("spacy-nightly"))
and "--no-deps" not in pip_args
@ -49,13 +55,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
"dependencies, you'll have to install them manually."
)
pip_args = pip_args + ("--no-deps",)
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
if direct:
components = model.split("-")
model_name = "".join(components[:-1])
version = components[-1]
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
else:
model_name = model
if model in OLD_MODEL_SHORTCUTS:
@ -66,15 +69,31 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
model_name = OLD_MODEL_SHORTCUTS[model]
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
filename = get_model_filename(model_name, version, sdist)
download_model(filename, pip_args)
msg.good(
"Download and installation successful",
f"You can now load the package via spacy.load('{model_name}')",
)
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}"
egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
if sdist:
filename += egg_tpl.format(m=model_name, v=version)
return filename
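For illustration, assuming WHEEL_SUFFIX is the standard "-py3-none-any.whl" and using a hypothetical pipeline name and version:

print(get_model_filename("en_core_web_sm", "3.5.0"))
# en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
print(get_model_filename("en_core_web_sm", "3.5.0", sdist=True))
# en_core_web_sm-3.5.0/en_core_web_sm-3.5.0.tar.gz#egg=en_core_web_sm==3.5.0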
def get_compatibility() -> dict:
version = get_minor_version(about.__version__)
if is_prerelease_version(about.__version__):
version: Optional[str] = about.__version__
else:
version = get_minor_version(about.__version__)
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
@ -101,6 +120,11 @@ def get_version(model: str, comp: dict) -> str:
return comp[model][0]
def get_latest_version(model: str) -> str:
comp = get_compatibility()
return get_version(model, comp)
def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None:

View File

@ -7,12 +7,15 @@ from thinc.api import fix_random_seed
from ..training import Corpus
from ..tokens import Doc
from ._util import app, Arg, Opt, setup_gpu, import_code
from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
from ..scorer import Scorer
from .. import util
from .. import displacy
@benchmark_cli.command(
"accuracy",
)
@app.command("evaluate")
def evaluate_cli(
# fmt: off
@ -36,7 +39,7 @@ def evaluate_cli(
dependency parses in a HTML file, set as output directory as the
displacy_path argument.
DOCS: https://spacy.io/api/cli#evaluate
DOCS: https://spacy.io/api/cli#benchmark-accuracy
"""
import_code(code_path)
evaluate(

spacy/cli/find_threshold.py Normal file
View File

@ -0,0 +1,233 @@
import functools
import operator
from pathlib import Path
import logging
from typing import Optional, Tuple, Any, Dict, List
import numpy
import wasabi.tables
from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
from ..errors import Errors
from ..training import Corpus
from ._util import app, Arg, Opt, import_code, setup_gpu
from .. import util
_DEFAULTS = {
"n_trials": 11,
"use_gpu": -1,
"gold_preproc": False,
}
@app.command(
"find-threshold",
context_settings={"allow_extra_args": False, "ignore_unknown_options": True},
)
def find_threshold_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
scores_key: str = Arg(..., help="Metric to optimize"),
n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
# fmt: on
):
"""
Runs prediction trials for a trained model with varying thresholds to maximize
the specified metric. The search space for the threshold is traversed linearly
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
returns all results).
This is applicable only for components whose predictions are influenced by
thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note
that the full path to the corresponding threshold attribute in the config has to
be provided.
DOCS: https://spacy.io/api/cli#find-threshold
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
import_code(code_path)
find_threshold(
model=model,
data_path=data_path,
pipe_name=pipe_name,
threshold_key=threshold_key,
scores_key=scores_key,
n_trials=n_trials,
use_gpu=use_gpu,
gold_preproc=gold_preproc,
silent=False,
)
def find_threshold(
model: str,
data_path: Path,
pipe_name: str,
threshold_key: str,
scores_key: str,
*,
n_trials: int = _DEFAULTS["n_trials"], # type: ignore
use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore
gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore
silent: bool = True,
) -> Tuple[float, float, Dict[float, float]]:
"""
Runs prediction trials for models with varying thresholds to maximize the specified metric.
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
data_path (Path): Path to file with DocBin with docs to use for threshold search.
pipe_name (str): Name of pipe to examine thresholds for.
threshold_key (str): Key of threshold attribute in component's configuration.
scores_key (str): Name of the score metric to optimize.
n_trials (int): Number of trials to determine optimal thresholds.
use_gpu (int): GPU ID or -1 for CPU.
gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
to train/test skew.
silent (bool): Whether to print non-error-related output to stdout.
RETURNS (Tuple[float, float, Dict[float, float]]): Best found threshold, the corresponding score, scores for all
evaluated thresholds.
"""
setup_gpu(use_gpu, silent=silent)
data_path = util.ensure_path(data_path)
if not data_path.exists():
wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
nlp = util.load_model(model)
if pipe_name not in nlp.component_names:
raise AttributeError(
Errors.E001.format(name=pipe_name, opts=nlp.component_names)
)
pipe = nlp.get_pipe(pipe_name)
if not hasattr(pipe, "scorer"):
raise AttributeError(Errors.E1045)
if type(pipe) == TextCategorizer:
wasabi.msg.warn(
"The `textcat` component doesn't use a threshold as it's not applicable to the concept of "
"exclusive classes. All thresholds will yield the same results."
)
if not silent:
wasabi.msg.info(
title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
f"trials."
)
# Load evaluation corpus.
corpus = Corpus(data_path, gold_preproc=gold_preproc)
dev_dataset = list(corpus(nlp))
config_keys = threshold_key.split(".")
def set_nested_item(
config: Dict[str, Any], keys: List[str], value: float
) -> Dict[str, Any]:
"""Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200.
config (Dict[str, Any]): Configuration dictionary.
keys (List[Any]): Path to value to set.
value (float): Value to set.
RETURNS (Dict[str, Any]): Updated dictionary.
"""
functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
return config
def filter_config(
config: Dict[str, Any], keys: List[str], full_key: str
) -> Dict[str, Any]:
"""Filters provided config dictionary so that only the specified keys path remains.
config (Dict[str, Any]): Configuration dictionary.
keys (List[Any]): Path to value to set.
full_key (str): Full user-specified key.
RETURNS (Dict[str, Any]): Filtered dictionary.
"""
if keys[0] not in config:
wasabi.msg.fail(
title=f"Failed to look up `{full_key}` in config: sub-key {[keys[0]]} not found.",
text=f"Make sure you specified {[keys[0]]} correctly. The following sub-keys are available instead: "
f"{list(config.keys())}",
exits=1,
)
return {
keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
if len(keys) > 1
else config[keys[0]]
}
# Evaluate with varying threshold values.
scores: Dict[float, float] = {}
config_keys_full = ["components", pipe_name, *config_keys]
table_col_widths = (10, 10)
thresholds = numpy.linspace(0, 1, n_trials)
print(wasabi.tables.row(["Threshold", f"{scores_key}"], widths=table_col_widths))
for threshold in thresholds:
# Reload pipeline with overrides specifying the new threshold.
nlp = util.load_model(
model,
config=set_nested_item(
filter_config(
nlp.config, config_keys_full, ".".join(config_keys_full)
).copy(),
config_keys_full,
threshold,
),
)
if hasattr(pipe, "cfg"):
setattr(
nlp.get_pipe(pipe_name),
"cfg",
set_nested_item(getattr(pipe, "cfg"), config_keys, threshold),
)
eval_scores = nlp.evaluate(dev_dataset)
if scores_key not in eval_scores:
wasabi.msg.fail(
title=f"Failed to look up score `{scores_key}` in evaluation results.",
text=f"Make sure you specified the correct value for `scores_key`. The following scores are "
f"available: {list(eval_scores.keys())}",
exits=1,
)
scores[threshold] = eval_scores[scores_key]
if not isinstance(scores[threshold], (float, int)):
wasabi.msg.fail(
f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric "
f"scores.",
exits=1,
)
print(
wasabi.row(
[round(threshold, 3), round(scores[threshold], 3)],
widths=table_col_widths,
)
)
best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
# If all scores are identical, emit warning.
if len(set(scores.values())) == 1:
wasabi.msg.warn(
title="All scores are identical. Verify that all settings are correct.",
text=""
if (
not isinstance(pipe, MultiLabel_TextCategorizer)
or scores_key in ("cats_macro_f", "cats_micro_f")
)
else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
)
else:
if not silent:
print(
f"\nBest threshold: {round(best_threshold, ndigits=4)} with {scores_key} value of {scores[best_threshold]}."
)
return best_threshold, scores[best_threshold], scores
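A rough sketch of calling the function directly; the pipeline, data path and keys are hypothetical:

from pathlib import Path

best, best_score, all_scores = find_threshold(
    model="my_textcat_model",           # hypothetical pipeline package or path
    data_path=Path("dev.spacy"),
    pipe_name="textcat_multilabel",
    threshold_key="threshold",
    scores_key="cats_macro_f",
    n_trials=11,
    silent=False,
)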

View File

@ -1,10 +1,13 @@
from typing import Optional, Dict, Any, Union, List
import platform
import pkg_resources
import json
from pathlib import Path
from wasabi import Printer, MarkdownRenderer
import srsly
from ._util import app, Arg, Opt, string_to_list
from .download import get_model_filename, get_latest_version
from .. import util
from .. import about
@ -16,6 +19,7 @@ def info_cli(
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
# fmt: on
):
"""
@ -23,10 +27,19 @@ def info_cli(
print its meta information. Flag --markdown prints details in Markdown for easy
copy-pasting to GitHub issues.
Flag --url prints only the download URL of the most recent compatible
version of the pipeline.
DOCS: https://spacy.io/api/cli#info
"""
exclude = string_to_list(exclude)
info(model, markdown=markdown, silent=silent, exclude=exclude)
info(
model,
markdown=markdown,
silent=silent,
exclude=exclude,
url=url,
)
def info(
@ -35,11 +48,20 @@ def info(
markdown: bool = False,
silent: bool = True,
exclude: Optional[List[str]] = None,
url: bool = False,
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if not exclude:
exclude = []
if model:
if url:
if model is not None:
title = f"Download info for pipeline '{model}'"
data = info_model_url(model)
print(data["download_url"])
return data
else:
msg.fail("--url option requires a pipeline name", exits=1)
elif model:
title = f"Info about pipeline '{model}'"
data = info_model(model, silent=silent)
else:
@ -99,11 +121,44 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)
download_url = info_installed_model_url(model)
if download_url:
meta["download_url"] = download_url
return {
k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
}
def info_installed_model_url(model: str) -> Optional[str]:
"""Given a pipeline name, get the download URL if available, otherwise
return None.
This is only available for pipelines installed as modules that have
dist-info available.
"""
try:
dist = pkg_resources.get_distribution(model)
data = json.loads(dist.get_metadata("direct_url.json"))
return data["url"]
except pkg_resources.DistributionNotFound:
# no such package
return None
except Exception:
# something else, like no file or invalid JSON
return None
def info_model_url(model: str) -> Dict[str, Any]:
"""Return the download URL for the latest version of a pipeline."""
version = get_latest_version(model)
filename = get_model_filename(model, version)
download_url = about.__download_url__ + "/" + filename
release_tpl = "https://github.com/explosion/spacy-models/releases/tag/{m}-{v}"
release_url = release_tpl.format(m=model, v=version)
return {"download_url": download_url, "release_url": release_url}
def get_markdown(
data: Dict[str, Any],
title: Optional[str] = None,

View File

@ -10,6 +10,7 @@ from jinja2 import Template
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
from ._util import string_to_list, import_code
@ -24,16 +25,30 @@ class Optimizations(str, Enum):
accuracy = "accuracy"
class InitValues:
"""
Default values for initialization. Dedicated class to allow synchronized default values for init_config_cli() and
init_config(), i.e. initialization via the CLI and via Python, respectively.
"""
lang = "en"
pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
optimize = Optimizations.efficiency
gpu = False
pretraining = False
force_overwrite = False
@init_cli.command("config")
def init_config_cli(
# fmt: off
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"),
# fmt: on
):
"""
@ -133,11 +148,11 @@ def fill_config(
def init_config(
*,
lang: str,
pipeline: List[str],
optimize: str,
gpu: bool,
pretraining: bool = False,
lang: str = InitValues.lang,
pipeline: List[str] = InitValues.pipeline,
optimize: str = InitValues.optimize,
gpu: bool = InitValues.gpu,
pretraining: bool = InitValues.pretraining,
silent: bool = True,
) -> Config:
msg = Printer(no_print=silent)

View File

@ -299,8 +299,8 @@ def get_meta(
}
nlp = util.load_model_from_path(Path(model_path))
meta.update(nlp.meta)
meta.update(existing_meta)
meta["spacy_version"] = util.get_minor_version_range(about.__version__)
meta.update(existing_meta)
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),

View File

@ -61,7 +61,7 @@ def pretrain_cli(
# TODO: What's the solution here? How do we handle optional blocks?
msg.fail("The [pretraining] block in your config is empty", exits=1)
if not output_dir.exists():
output_dir.mkdir()
output_dir.mkdir(parents=True)
msg.good(f"Created output directory: {output_dir}")
# Save non-interpolated config
raw_config.to_disk(output_dir / "config.cfg")

View File

@ -12,6 +12,9 @@ from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
from .._util import SimpleFrozenDict, parse_config_overrides
# Default for whether an asset counts as 'extra' when its `extra` field is not set.
EXTRA_DEFAULT = False
@project_cli.command(
"assets",
@ -21,7 +24,8 @@ def project_assets_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+.")
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+."),
extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
# fmt: on
):
"""Fetch project assets like datasets and pretrained weights. Assets are
@ -32,7 +36,12 @@ def project_assets_cli(
DOCS: https://spacy.io/api/cli#project-assets
"""
overrides = parse_config_overrides(ctx.args)
project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
project_assets(
project_dir,
overrides=overrides,
sparse_checkout=sparse_checkout,
extra=extra,
)
def project_assets(
@ -40,17 +49,29 @@ def project_assets(
*,
overrides: Dict[str, Any] = SimpleFrozenDict(),
sparse_checkout: bool = False,
extra: bool = False,
) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
needed.
extra (bool): Whether to download all assets, including those marked as 'extra'.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path, overrides=overrides)
assets = config.get("assets", {})
assets = [
asset
for asset in config.get("assets", [])
if extra or not asset.get("extra", EXTRA_DEFAULT)
]
if not assets:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.warn(
f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
exits=0,
)
msg.info(f"Fetching {len(assets)} asset(s)")
for asset in assets:
dest = (project_dir / asset["dest"]).resolve()
checksum = asset.get("checksum")
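For reference, a minimal sketch of exercising the new flag from Python (the project path is a placeholder; the CLI equivalent is python -m spacy project assets ./my_project --extra):

from pathlib import Path
from spacy.cli.project.assets import project_assets

# Fetch everything, including assets marked with `extra: true` in project.yml
project_assets(Path("./my_project"), extra=True)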
@ -168,7 +189,11 @@ def convert_asset_url(url: str) -> str:
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
if (
re.match(r"(http(s?)):\/\/github.com", url)
and "releases/download" not in url
and "/raw/" not in url
):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(

View File

@ -7,11 +7,11 @@ import re
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_checkout, get_git_version
from .._util import git_checkout, get_git_version, git_repo_branch_exists
DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
DEFAULT_BRANCH = "master"
DEFAULT_BRANCHES = ["main", "master"]
@project_cli.command("clone")
@ -20,7 +20,7 @@ def project_clone_cli(
name: str = Arg(..., help="The name of the template to clone"),
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"),
branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v2.22+.")
# fmt: on
):
@ -33,9 +33,25 @@ def project_clone_cli(
"""
if dest is None:
dest = Path.cwd() / Path(name).parts[-1]
if repo == DEFAULT_REPO and branch is None:
branch = DEFAULT_PROJECTS_BRANCH
if branch is None:
# If it's a user repo, we want to default to other branch
branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH
for default_branch in DEFAULT_BRANCHES:
if git_repo_branch_exists(repo, default_branch):
branch = default_branch
break
if branch is None:
default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
msg.fail(
"No branch provided and attempted default "
f"branches {default_branches_msg} do not exist.",
exits=1,
)
else:
if not git_repo_branch_exists(repo, branch):
msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
assert isinstance(branch, str)
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
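A rough standalone sketch of the kind of check `git_repo_branch_exists` performs (the helper added in `_util` may differ in detail; `git ls-remote <repo> <branch>` only prints a ref line when the branch exists on the remote):

import subprocess

def branch_exists(repo: str, branch: str) -> bool:
    # A non-empty ls-remote result means the branch exists on the remote
    out = subprocess.run(
        ["git", "ls-remote", repo, branch],
        capture_output=True, text=True, check=True,
    )
    return bool(out.stdout.strip())

for candidate in ["main", "master"]:
    if branch_exists("https://github.com/explosion/projects", candidate):
        print(f"Would clone from branch '{candidate}'")
        break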
@ -61,9 +77,9 @@ def project_clone(
try:
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
msg.fail(err, exits=1)
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
if not (project_dir / PROJECT_FILE).exists():
msg.warn(f"No {PROJECT_FILE} found in directory")
else:

View File

@ -25,6 +25,7 @@ def project_update_dvc_cli(
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
@ -36,7 +37,7 @@ def project_update_dvc_cli(
DOCS: https://spacy.io/api/cli#project-dvc
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
def project_update_dvc(
@ -44,6 +45,7 @@ def project_update_dvc(
workflow: Optional[str] = None,
*,
verbose: bool = False,
quiet: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
@ -54,11 +56,12 @@ def project_update_dvc(
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
quiet (bool): Print less info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, force=force
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
@ -72,7 +75,7 @@ def update_dvc_config(
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
silent: bool = False,
quiet: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
@ -83,7 +86,7 @@ def update_dvc_config(
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
quiet (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
@ -105,6 +108,14 @@ def update_dvc_config(
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
# some flags that apply to every command
flags = []
if verbose:
flags.append("--verbose")
if quiet:
flags.append("--quiet")
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
@ -118,14 +129,26 @@ def update_dvc_config(
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
if not dvc_commands:
# If we don't check for this, then there will be an error when reading the
# config, since DVC wouldn't create it.
msg.fail(
"No usable commands for DVC found. This can happen if none of your "
"commands have dependencies or outputs.",
exits=1,
)
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, flags=dvc_flags)
for c in dvc_commands:
dvc_command = "dvc " + c
run_command(dvc_command)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
@ -133,26 +156,6 @@ def update_dvc_config(
return True
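A small usage sketch of the new flag (the project path is a placeholder); `--quiet` is simply forwarded to each generated `dvc run` call, as shown above:

from pathlib import Path
from spacy.cli.project.dvc import project_update_dvc

# CLI equivalent: python -m spacy project dvc ./my_project --quiet
project_update_dvc(Path("./my_project"), workflow=None, quiet=True)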
def run_dvc_commands(
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for c in commands:
command = split_command(c)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
if is_active:
dvc_command.append(flag)
run_command(dvc_command)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.

View File

@ -5,14 +5,17 @@ import hashlib
import urllib.parse
import tarfile
from pathlib import Path
from wasabi import msg
from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from .._util import get_hash, get_checksum, upload_file, download_file
from .._util import ensure_pathy, make_tempdir
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
from ...errors import Errors
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401
class RemoteStorage:
@ -27,7 +30,7 @@ class RemoteStorage:
self.url = ensure_pathy(url)
self.compression = compression
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Compress a file or directory within a project and upload it to a remote
storage. If an object exists at the full URL, nothing is done.
@ -48,9 +51,7 @@ class RemoteStorage:
mode_string = f"w:{self.compression}" if self.compression else "w"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
tar_file.add(str(loc), arcname=str(path))
with tar_loc.open(mode="rb") as input_file:
with url.open(mode="wb") as output_file:
output_file.write(input_file.read())
upload_file(tar_loc, url)
return url
def pull(
@ -59,7 +60,7 @@ class RemoteStorage:
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Retrieve a file from the remote cache. If the file already exists,
nothing is done.
@ -84,7 +85,23 @@ class RemoteStorage:
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
# This requires that the path is added correctly, relative
# to root. This is how we set things up in push()
tar_file.extractall(self.root)
# Disallow paths outside the current directory for the tar
# file (CVE-2007-4559, directory traversal vulnerability)
def is_within_directory(directory, target):
abs_directory = os.path.abspath(directory)
abs_target = os.path.abspath(target)
prefix = os.path.commonprefix([abs_directory, abs_target])
return prefix == abs_directory
def safe_extract(tar, path):
for member in tar.getmembers():
member_path = os.path.join(path, member.name)
if not is_within_directory(path, member_path):
raise ValueError(Errors.E852)
tar.extractall(path)
safe_extract(tar_file, self.root)
return url
def find(
@ -93,25 +110,37 @@ class RemoteStorage:
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most
recent matching file is preferred.
"""
name = self.encode_name(str(path))
urls = []
if command_hash is not None and content_hash is not None:
url = self.make_url(path, command_hash, content_hash)
url = self.url / name / command_hash / content_hash
urls = [url] if url.exists() else []
elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir())
if (self.url / name / command_hash).exists():
urls = list((self.url / name / command_hash).iterdir())
else:
urls = list((self.url / name).iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if (self.url / name).exists():
for sub_dir in (self.url / name).iterdir():
urls.extend(sub_dir.iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if len(urls) >= 2:
try:
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
except Exception:
msg.warn(
"Unable to sort remote files by last modified. The file(s) "
"pulled from the cache may not be the most recent."
)
return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash

View File

@ -1,5 +1,8 @@
from typing import Optional, List, Dict, Sequence, Any, Iterable
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path
from pathlib import Path
import pkg_resources
from wasabi import msg
from wasabi.util import locale_escape
import sys
@ -50,6 +53,7 @@ def project_run(
force: bool = False,
dry: bool = False,
capture: bool = False,
skip_requirements_check: bool = False,
) -> None:
"""Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
@ -66,11 +70,19 @@ def project_run(
sys.exit will be called with the return code. You should use capture=False
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
skip_requirements_check (bool): Whether to skip the requirements check.
"""
config = load_project_config(project_dir, overrides=overrides)
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
req_path = project_dir / "requirements.txt"
if not skip_requirements_check:
if config.get("check_requirements", True) and os.path.exists(req_path):
with req_path.open() as requirements_file:
_check_requirements([req.strip() for req in requirements_file])
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
for cmd in workflows[subcommand]:
@ -81,6 +93,7 @@ def project_run(
force=force,
dry=dry,
capture=capture,
skip_requirements_check=True,
)
else:
cmd = commands[subcommand]
@ -88,8 +101,8 @@ def project_run(
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, err_help, **err_kwargs)
err_exits = 1 if not dry else None
msg.fail(err, err_help, exits=err_exits)
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir:
msg.divider(subcommand)
@ -195,6 +208,8 @@ def validate_subcommand(
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
if subcommand not in commands and subcommand not in workflows:
help_msg = []
if subcommand in ["assets", "asset"]:
help_msg.append("Did you mean to run: python -m spacy project assets?")
if commands:
help_msg.append(f"Available commands: {', '.join(commands)}")
if workflows:
@ -308,3 +323,38 @@ def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional
md5 = get_checksum(file_path) if file_path.exists() else None
data.append({"path": path, "md5": md5})
return data
def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
"""Checks whether requirements are installed and free of version conflicts.
requirements (List[str]): List of requirements.
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
exist.
"""
failed_pkgs_msgs: List[str] = []
conflicting_pkgs_msgs: List[str] = []
for req in requirements:
try:
pkg_resources.require(req)
except pkg_resources.DistributionNotFound as dnf:
failed_pkgs_msgs.append(dnf.report())
except pkg_resources.VersionConflict as vc:
conflicting_pkgs_msgs.append(vc.report())
except Exception:
msg.warn(
f"Unable to check requirement: {req} "
"Checks are currently limited to requirement specifiers "
"(PEP 508)"
)
if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
msg.warn(
title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
"correctly and you installed all requirements specified in your project's requirements.txt: "
)
for pkg_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
msg.text(pkg_msg)
return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
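The check boils down to one `pkg_resources.require` call per requirement line; a self-contained illustration (the requirement strings are examples):

import pkg_resources

for req in ["spacy>=3.0.0", "not-a-real-package==1.0"]:
    try:
        pkg_resources.require(req)
        print(f"OK: {req}")
    except pkg_resources.DistributionNotFound as dnf:
        print(f"Missing: {dnf.report()}")
    except pkg_resources.VersionConflict as vc:
        print(f"Conflict: {vc.report()}")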

View File

@ -1,8 +1,9 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
@ -24,10 +25,10 @@ lang = "{{ lang }}"
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
{%- set with_accuracy = optimize == "accuracy" -%}
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components %}
{%- set full_pipeline = components -%}
{%- endif %}
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
@ -54,7 +55,7 @@ stride = 96
factory = "morphologizer"
[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null
[components.morphologizer.model.tok2vec]
@ -70,7 +71,7 @@ grad_factor = 1.0
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@ -123,6 +124,60 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}
{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
max_positive = null
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null
[components.spancat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.spancat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.spancat.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif -%}
{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"
backoff = "orth"
min_tree_freq = 3
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
top_k = 1
[components.trainable_lemmatizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.trainable_lemmatizer.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.trainable_lemmatizer.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
@ -131,7 +186,7 @@ incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
@architectures = "spacy.EntityLinker.v2"
nO = null
[components.entity_linker.model.tok2vec]
@ -216,13 +271,8 @@ factory = "tok2vec"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
{% if has_letters -%}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 2500, 2500, 2500]
{% else -%}
attrs = ["ORTH", "SHAPE"]
rows = [5000, 2500]
{% endif -%}
rows = [5000, 1000, 2500, 2500]
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
[components.tok2vec.model.encode]
@ -238,7 +288,7 @@ maxout_pieces = 3
factory = "morphologizer"
[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null
[components.morphologizer.model.tok2vec]
@ -251,7 +301,7 @@ width = ${components.tok2vec.model.encode.width}
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@ -295,6 +345,54 @@ nO = null
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
max_positive = null
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null
[components.spancat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.spancat.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}
{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"
backoff = "orth"
min_tree_freq = 3
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
top_k = 1
[components.trainable_lemmatizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.trainable_lemmatizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif -%}
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
@ -303,7 +401,7 @@ incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
@architectures = "spacy.EntityLinker.v2"
nO = null
[components.entity_linker.model.tok2vec]
@ -369,7 +467,7 @@ no_output_layer = false
{% endif %}
{% for pipe in components %}
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %}
{% if pipe not in listener_components %}
{# Other components defined by the user: we just assume they're factories #}
[components.{{ pipe }}]
factory = "{{ pipe }}"

View File

@ -37,6 +37,15 @@ bn:
accuracy:
name: sagorsarker/bangla-bert-base
size_factor: 3
ca:
word_vectors: null
transformer:
efficiency:
name: projecte-aina/roberta-base-ca-v2
size_factor: 3
accuracy:
name: projecte-aina/roberta-base-ca-v2
size_factor: 3
da:
word_vectors: da_core_news_lg
transformer:
@ -271,4 +280,3 @@ zh:
accuracy:
name: bert-base-chinese
size_factor: 3
has_letters: false

View File

@ -90,6 +90,8 @@ dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
# Optional callback before nlp object is saved to disk after training
before_to_disk = null
# Optional callback that is invoked at the start of each training step
before_update = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
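For reference, a sketch of how such a callback can be registered (the registry name is made up, and the payload passed to the callback is assumed to be a dict carrying the current step and epoch):

import spacy

@spacy.registry.callbacks("my_before_update.v1")
def create_before_update():
    def before_update(nlp, info):
        # e.g. unfreeze a component once training reaches a certain step
        print("starting step", info.get("step"))
    return before_update

# In the config:
# [training.before_update]
# @callbacks = "my_before_update.v1"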

View File

@ -7,10 +7,11 @@ USAGE: https://spacy.io/usage/visualizers
from typing import Union, Iterable, Optional, Dict, Any, Callable
import warnings
from .render import DependencyRenderer, EntityRenderer
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings
from ..util import is_in_jupyter
from ..util import find_available_port
_html = {}
@ -36,7 +37,7 @@ def render(
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
@ -44,6 +45,7 @@ def render(
factories = {
"dep": (DependencyRenderer, parse_deps),
"ent": (EntityRenderer, parse_ents),
"span": (SpanRenderer, parse_spans),
}
if style not in factories:
raise ValueError(Errors.E087.format(style=style))
@ -55,6 +57,10 @@ def render(
renderer_func, converter = factories[style]
renderer = renderer_func(options=options)
parsed = [converter(doc, options) for doc in docs] if not manual else docs # type: ignore
if manual:
for doc in docs:
if isinstance(doc, dict) and "ents" in doc:
doc["ents"] = sorted(doc["ents"], key=lambda x: (x["start"], x["end"]))
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip() # type: ignore
html = _html["parsed"]
if RENDER_WRAPPER is not None:
@ -77,6 +83,7 @@ def serve(
manual: bool = False,
port: int = 5000,
host: str = "0.0.0.0",
auto_select_port: bool = False,
) -> None:
"""Serve displaCy visualisation.
@ -88,12 +95,15 @@ def serve(
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation.
host (str): Host to serve visualisation.
auto_select_port (bool): Automatically select a port if the specified port is in use.
DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers
"""
from wsgiref import simple_server
port = find_available_port(port, host, auto_select_port)
if is_in_jupyter():
warnings.warn(Warnings.W011)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
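A usage sketch of the new flag (the pipeline name is a placeholder for any installed model; run this outside Jupyter):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("If port 5000 is busy, displaCy picks the next free one.")
displacy.serve(doc, style="ent", port=5000, auto_select_port=True)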
@ -118,7 +128,8 @@ def app(environ, start_response):
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"""Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document do parse.
orig_doc (Doc): Document to parse.
options (Dict[str, Any]): Dependency parse specific visualisation options.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(
@ -203,6 +214,43 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"""Generate spans in [{start_token: i, end_token: i, label: 'label'}] format.
doc (Doc): Document to parse.
options (Dict[str, any]): Span-specific visualisation options.
RETURNS (dict): Generated span types keyed by text (original text) and spans.
"""
kb_url_template = options.get("kb_url_template", None)
spans_key = options.get("spans_key", "sc")
spans = [
{
"start": span.start_char,
"end": span.end_char,
"start_token": span.start,
"end_token": span.end,
"label": span.label_,
"kb_id": span.kb_id_ if span.kb_id_ else "",
"kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
}
for span in doc.spans.get(spans_key, [])
]
tokens = [token.text for token in doc]
if not spans:
keys = list(doc.spans.keys())
warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys))
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
settings = get_doc_settings(doc)
return {
"text": doc.text,
"spans": spans,
"title": title,
"settings": settings,
"tokens": tokens,
}
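A short sketch of both entry points for the new span style (the example text and spans are made up; the manual dict mirrors the structure `parse_spans` produces):

import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China.")
doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
html = displacy.render(doc, style="span")

# Manual input skips parsing and uses the same keys as parse_spans:
manual = {
    "text": "Welcome to the Bank of China.",
    "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
    "spans": [{"start_token": 3, "end_token": 6, "start": 15, "end": 28, "label": "ORG"}],
}
html_manual = displacy.render(manual, style="span", manual=True)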
def set_render_wrapper(func: Callable[[str], str]) -> None:
"""Set an optional wrapper function that is called around the generated
HTML markup on displacy.render. This can be used to allow integration into

View File

@ -1,12 +1,15 @@
from typing import Dict, Any, List, Optional, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import uuid
import itertools
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from .templates import TPL_ENTS, TPL_KB_LINK
from ..util import minify_html, escape_html, registry
from ..errors import Errors
from ..util import escape_html, minify_html, registry
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
from .templates import TPL_TITLE
DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"
@ -33,6 +36,224 @@ DEFAULT_LABEL_COLORS = {
}
class SpanRenderer:
"""Render Spans as SVGs."""
style = "span"
def __init__(self, options: Dict[str, Any] = {}) -> None:
"""Initialise span renderer
options (dict): Visualiser-specific options (colors, spans)
"""
# Set up the colors and overall look
colors = dict(DEFAULT_LABEL_COLORS)
user_colors = registry.displacy_colors.get_all()
for user_color in user_colors.values():
if callable(user_color):
# Since this comes from the function registry, we want to make
# sure we support functions that *return* a dict of colors
user_color = user_color()
if not isinstance(user_color, dict):
raise ValueError(Errors.E925.format(obj=type(user_color)))
colors.update(user_color)
colors.update(options.get("colors", {}))
self.default_color = DEFAULT_ENTITY_COLOR
self.colors = {label.upper(): color for label, color in colors.items()}
# Set up how the text and labels will be rendered
self.direction = DEFAULT_DIR
self.lang = DEFAULT_LANG
# These values are in px
self.top_offset = options.get("top_offset", 40)
# This is how far under the top offset the span labels appear
self.span_label_offset = options.get("span_label_offset", 20)
self.offset_step = options.get("top_offset_step", 17)
# Set up which templates will be used
template = options.get("template")
if template:
self.span_template = template["span"]
self.span_slice_template = template["slice"]
self.span_start_template = template["start"]
else:
if self.direction == "rtl":
self.span_template = TPL_SPAN_RTL
self.span_slice_template = TPL_SPAN_SLICE_RTL
self.span_start_template = TPL_SPAN_START_RTL
else:
self.span_template = TPL_SPAN
self.span_slice_template = TPL_SPAN_SLICE
self.span_start_template = TPL_SPAN_START
def render(
self, parsed: List[Dict[str, Any]], page: bool = False, minify: bool = False
) -> str:
"""Render complete markup.
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
if i == 0:
settings = p.get("settings", {})
self.direction = settings.get("direction", DEFAULT_DIR)
self.lang = settings.get("lang", DEFAULT_LANG)
rendered.append(self.render_spans(p["tokens"], p["spans"], p.get("title")))
if page:
docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction)
else:
markup = "".join(rendered)
if minify:
return minify_html(markup)
return markup
def render_spans(
self,
tokens: List[str],
spans: List[Dict[str, Any]],
title: Optional[str],
) -> str:
"""Render span types in text.
Spans are rendered per token: for each token, we check whether it is part
of a span slice (a member of a span type) or a span start (the starting token of a
given span type).
tokens (list): Individual tokens in the text
spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
title (str / None): Document title set in Doc.user_data['title'].
"""
per_token_info = []
# we must sort so that we can correctly describe when spans need to "stack"
# which is determined by their start token, then span length (longer spans on top),
# then break any remaining ties with the span label
spans = sorted(
spans,
key=lambda s: (
s["start_token"],
-(s["end_token"] - s["start_token"]),
s["label"],
),
)
for s in spans:
# this is the vertical 'slot' that the span will be rendered in
# vertical_position = span_label_offset + (offset_step * (slot - 1))
s["render_slot"] = 0
for idx, token in enumerate(tokens):
# Identify if a token belongs to a Span (and which) and if it's a
# start token of said Span. We'll use this for the final HTML render
token_markup: Dict[str, Any] = {}
token_markup["text"] = token
concurrent_spans = 0
entities = []
for span in spans:
ent = {}
if span["start_token"] <= idx < span["end_token"]:
concurrent_spans += 1
span_start = idx == span["start_token"]
ent["label"] = span["label"]
ent["is_start"] = span_start
if span_start:
# When the span starts, we need to know how many other
# spans are on the 'span stack' and will be rendered.
# This value becomes the vertical render slot for this entire span
span["render_slot"] = concurrent_spans
ent["render_slot"] = span["render_slot"]
kb_id = span.get("kb_id", "")
kb_url = span.get("kb_url", "#")
ent["kb_link"] = (
TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
)
entities.append(ent)
else:
# We don't specifically need to do this since we loop
# over tokens and spans sorted by their start_token,
# so we'll never use a span again after the last token it appears in,
# but if we were to use these spans again we'd want to make sure
# this value was reset correctly.
span["render_slot"] = 0
token_markup["entities"] = entities
per_token_info.append(token_markup)
markup = self._render_markup(per_token_info)
markup = TPL_SPANS.format(content=markup, dir=self.direction)
if title:
markup = TPL_TITLE.format(title=title) + markup
return markup
def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
"""Render the markup from per-token information"""
markup = ""
for token in per_token_info:
entities = sorted(token["entities"], key=lambda d: d["render_slot"])
# Whitespace tokens disrupt the vertical space (no line height) so that the
# span indicators get misaligned. We don't render them as individual
# tokens anyway, so we'll just not display a span indicator either.
is_whitespace = token["text"].strip() == ""
if entities and not is_whitespace:
slices = self._get_span_slices(token["entities"])
starts = self._get_span_starts(token["entities"])
total_height = (
self.top_offset
+ self.span_label_offset
+ (self.offset_step * (len(entities) - 1))
)
markup += self.span_template.format(
text=token["text"],
span_slices=slices,
span_starts=starts,
total_height=total_height,
)
else:
markup += escape_html(token["text"] + " ")
return markup
def _get_span_slices(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span slices"""
span_slices = []
for entity in entities:
# rather than iterate over multiples of offset_step, we use entity['render_slot']
# to determine the vertical position, since that tells where
# the span starts vertically so we can extend it horizontally,
# past other spans that might have already ended
color = self.colors.get(entity["label"].upper(), self.default_color)
top_offset = self.top_offset + (
self.offset_step * (entity["render_slot"] - 1)
)
span_slice = self.span_slice_template.format(
bg=color,
top_offset=top_offset,
)
span_slices.append(span_slice)
return "".join(span_slices)
def _get_span_starts(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span start tokens"""
span_starts = []
for entity in entities:
color = self.colors.get(entity["label"].upper(), self.default_color)
top_offset = self.top_offset + (
self.offset_step * (entity["render_slot"] - 1)
)
span_start = (
self.span_start_template.format(
bg=color,
top_offset=top_offset,
label=entity["label"],
kb_link=entity["kb_link"],
)
if entity["is_start"]
else ""
)
span_starts.append(span_start)
return "".join(span_starts)
class DependencyRenderer:
"""Render dependency parses as SVGs."""
@ -105,7 +326,7 @@ class DependencyRenderer:
RETURNS (str): Rendered SVG markup.
"""
self.levels = self.get_levels(arcs)
self.highest_level = len(self.levels)
self.highest_level = max(self.levels.values(), default=0)
self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
self.width = self.offset_x + len(words) * self.distance
self.height = self.offset_y + 3 * self.word_spacing
@ -165,7 +386,7 @@ class DependencyRenderer:
if start < 0 or end < 0:
error_args = dict(start=start, end=end, label=label, dir=direction)
raise ValueError(Errors.E157.format(**error_args))
level = self.levels.index(end - start) + 1
level = self.levels[(start, end, label)]
x_start = self.offset_x + start * self.distance + self.arrow_spacing
if self.direction == "rtl":
x_start = self.width - x_start
@ -181,7 +402,7 @@ class DependencyRenderer:
y_curve = self.offset_y - level * self.distance / 2
if self.compact:
y_curve = self.offset_y - level * self.distance / 6
if y_curve == 0 and len(self.levels) > 5:
if y_curve == 0 and max(self.levels.values(), default=0) > 5:
y_curve = -self.distance
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
arc = self.get_arc(x_start, y, y_curve, x_end)
@ -225,15 +446,23 @@ class DependencyRenderer:
p1, p2, p3 = (end, end + self.arrow_width - 2, end - self.arrow_width + 2)
return f"M{p1},{y + 2} L{p2},{y - self.arrow_width} {p3},{y - self.arrow_width}"
def get_levels(self, arcs: List[Dict[str, Any]]) -> List[int]:
def get_levels(self, arcs: List[Dict[str, Any]]) -> Dict[Tuple[int, int, str], int]:
"""Calculate available arc height "levels".
Used to calculate arrow heights dynamically and without wasting space.
arcs (list): Individual arcs and their start, end, direction and label.
RETURNS (list): Arc levels sorted from lowest to highest.
RETURNS (dict): Arc levels keyed by (start, end, label).
"""
levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
return sorted(list(levels))
arcs = [dict(t) for t in {tuple(sorted(arc.items())) for arc in arcs}]
length = max([arc["end"] for arc in arcs], default=0)
max_level = [0] * length
levels = {}
for arc in sorted(arcs, key=lambda arc: arc["end"] - arc["start"]):
level = max(max_level[arc["start"] : arc["end"]]) + 1
for i in range(arc["start"], arc["end"]):
max_level[i] = level
levels[(arc["start"], arc["end"], arc["label"])] = level
return levels
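A worked example of the assignment above (standalone, reusing the same logic): shorter arcs claim the lowest free level over their token range, and an enclosing arc is pushed one level higher.

arcs = [
    {"start": 0, "end": 1, "label": "det"},
    {"start": 0, "end": 2, "label": "nsubj"},
    {"start": 2, "end": 3, "label": "dobj"},
]
length = max(arc["end"] for arc in arcs)
max_level = [0] * length
levels = {}
for arc in sorted(arcs, key=lambda a: a["end"] - a["start"]):
    level = max(max_level[arc["start"] : arc["end"]]) + 1
    for i in range(arc["start"], arc["end"]):
        max_level[i] = level
    levels[(arc["start"], arc["end"], arc["label"])] = level
print(levels)
# {(0, 1, 'det'): 1, (2, 3, 'dobj'): 1, (0, 2, 'nsubj'): 2}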
class EntityRenderer:
@ -242,7 +471,7 @@ class EntityRenderer:
style = "ent"
def __init__(self, options: Dict[str, Any] = {}) -> None:
"""Initialise dependency renderer.
"""Initialise entity renderer.
options (dict): Visualiser-specific options (colors, ents)
"""
@ -281,7 +510,7 @@ class EntityRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):

View File

@ -62,6 +62,55 @@ TPL_ENT_RTL = """
</mark>
"""
TPL_SPANS = """
<div class="spans" style="line-height: 2.5; direction: {dir}">{content}</div>
"""
TPL_SPAN = """
<span style="font-weight: bold; display: inline-block; position: relative; height: {total_height}px;">
{text}
{span_slices}
{span_starts}
</span>
"""
TPL_SPAN_SLICE = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""
TPL_SPAN_START = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
{label}{kb_link}
</span>
</span>
"""
TPL_SPAN_RTL = """
<span style="font-weight: bold; display: inline-block; position: relative;">
{text}
{span_slices}
{span_starts}
</span>
"""
TPL_SPAN_SLICE_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""
TPL_SPAN_START_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
{label}{kb_link}
</span>
</span>
"""
# Important: this needs to start with a space!
TPL_KB_LINK = """
<a style="text-decoration: none; color: inherit; font-weight: normal" href="{kb_url}">{kb_id}</a>

View File

@ -1,4 +1,5 @@
import warnings
from .compat import Literal
class ErrorsWithCodes(type):
@ -15,8 +16,8 @@ def setup_default_warnings():
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
# warn about entity_ruler & matcher having no patterns only once
for pipe in ["matcher", "entity_ruler"]:
# warn about entity_ruler, span_ruler & matcher having no patterns only once
for pipe in ["matcher", "entity_ruler", "span_ruler"]:
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS
@ -26,7 +27,10 @@ def setup_default_warnings():
filter_warning("once", error_msg="[W114]")
def filter_warning(action: str, error_msg: str):
def filter_warning(
action: Literal["default", "error", "ignore", "always", "module", "once"],
error_msg: str,
):
"""Customize how spaCy should handle a certain warning.
error_msg (str): e.g. "W006", or a full error message
@ -192,6 +196,25 @@ class Warnings(metaclass=ErrorsWithCodes):
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
"Vectors are calculated from character ngrams.")
W116 = ("Unable to clean attribute '{attr}'.")
W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
"surprising to you, make sure the Doc was processed using a model "
"that supports span categorization, and check the `doc.spans[spans_key]` "
"property manually if necessary.\n\nAvailable keys: {keys}")
W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
"for the corpora used to train the language. Please check "
"`nlp.meta[\"sources\"]` for any relevant links.")
W119 = ("Overriding pipe name in `config` is not supported. Ignoring override '{name_in_config}'.")
W120 = ("Unable to load all spans in Doc.spans: more than one span group "
"with the name '{group_name}' was found in the saved spans data. "
"Only the last span group will be loaded under "
"Doc.spans['{group_name}']. Skipping span group with values: "
"{group_values}")
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
"is a Cython extension type.")
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
class Errors(metaclass=ErrorsWithCodes):
@ -210,8 +233,9 @@ class Errors(metaclass=ErrorsWithCodes):
"initialized component.")
E004 = ("Can't set up pipeline component: a factory for '{name}' already "
"exists. Existing factory: {func}. New factory: {new_func}")
E005 = ("Pipeline component '{name}' returned None. If you're using a "
"custom component, maybe you forgot to return the processed Doc?")
E005 = ("Pipeline component '{name}' returned {returned_type} instead of a "
"Doc. If you're using a custom component, maybe you forgot to "
"return the processed Doc?")
E006 = ("Invalid constraints for adding pipeline component. You can only "
"set one of the following: before (component name or index), "
"after (component name or index), first (True) or last (True). "
@ -322,6 +346,11 @@ class Errors(metaclass=ErrorsWithCodes):
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
E079 = ("Error computing states in beam: number of predicted beams "
"({pbeams}) does not equal number of gold beams ({gbeams}).")
E080 = ("Duplicate state found in beam: {key}.")
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
"does not equal number of losses ({losses}).")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")
@ -369,7 +398,7 @@ class Errors(metaclass=ErrorsWithCodes):
"consider using doc.spans instead.")
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
"settings: {opts}")
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
E107 = ("Value of custom attribute `{attr}` is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call `initialize()`?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
@ -437,10 +466,10 @@ class Errors(metaclass=ErrorsWithCodes):
"same, but found '{nlp}' and '{vocab}' respectively.")
E152 = ("The attribute {attr} is not supported for token patterns. "
"Please use the option `validate=True` with the Matcher, PhraseMatcher, "
"or EntityRuler for more details.")
"EntityRuler or AttributeRuler for more details.")
E153 = ("The value type {vtype} is not supported for token patterns. "
"Please use the option validate=True with Matcher, PhraseMatcher, "
"or EntityRuler for more details.")
"EntityRuler or AttributeRuler for more details.")
E154 = ("One of the attributes or values is not supported for token "
"patterns. Please use the option `validate=True` with the Matcher, "
"PhraseMatcher, or EntityRuler for more details.")
@ -515,15 +544,28 @@ class Errors(metaclass=ErrorsWithCodes):
E198 = ("Unable to return {n} most similar vectors for the current vectors "
"table, which contains {n_rows} vectors.")
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
E200 = ("Can't set {attr} from Span.")
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
E203 = ("If the {name} embedding layer is not updated "
"during training, make sure to include it in 'annotating components'")
# New errors added in v3.x
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
"but found value of '{val}'.")
E852 = ("The tar file pulled from the remote attempted an unsafe path "
"traversal.")
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
"not permitted in factory names.")
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
"permit overlapping spans.")
E855 = ("Invalid {obj}: {obj} is not from the same doc.")
E856 = ("Error accessing span at position {i}: out of bounds in span group "
"of length {length}.")
E857 = ("Entry '{name}' not found in edit tree lemmatizer labels.")
E858 = ("The {mode} vector table does not support this operation. "
"{alternative}")
E859 = ("The floret vector table cannot be modified.")
E860 = ("Can't truncate fasttext-bloom vectors.")
E860 = ("Can't truncate floret vectors.")
E861 = ("No 'keys' should be provided when initializing floret vectors "
"with 'minn' and 'maxn'.")
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
@ -679,11 +721,11 @@ class Errors(metaclass=ErrorsWithCodes):
"need to modify the pipeline, use the built-in methods like "
"`nlp.add_pipe`, `nlp.remove_pipe`, `nlp.disable_pipe` or "
"`nlp.enable_pipe` instead.")
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
"property or default function argument?")
E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
E928 = ("An InMemoryLookupKB can only be serialized to/from from a directory, "
"but the provided argument {loc} points to a file.")
E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
E930 = ("Received invalid get_examples callback in `{method}`. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
@ -887,10 +929,45 @@ class Errors(metaclass=ErrorsWithCodes):
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
"exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
"patterns.")
E1024 = ("A pattern with {attr_type} '{label}' is not present in "
"'{component}' patterns.")
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
"supported values are: 'I', 'O', 'B' and ''")
E1026 = ("Edit tree has an invalid format:\n{errors}")
E1027 = ("AlignmentArray only supports slicing with a step of 1.")
E1028 = ("AlignmentArray only supports indexing using an int or a slice.")
E1029 = ("Edit tree cannot be applied to form.")
E1030 = ("Edit tree identifier out of range.")
E1031 = ("Could not find gold transition - see logs above.")
E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
E1034 = ("Node index {i} out of bounds ({length})")
E1035 = ("Token index {i} out of bounds ({length})")
E1036 = ("Cannot index into NoneNode")
E1037 = ("Invalid attribute value '{attr}'.")
E1038 = ("Invalid JSON input: {message}")
E1039 = ("The {obj} start or end annotations (start: {start}, end: {end}) "
"could not be aligned to token boundaries.")
E1040 = ("Doc.from_json requires all tokens to have the same attributes. "
"Some tokens do not contain annotation for: {partial_attrs}")
E1041 = ("Expected a string, Doc, or bytes as input, but got: {type}")
E1042 = ("`enable={enable}` and `disable={disable}` are inconsistent with each other.\nIf you only passed "
"one of `enable` or `disable`, the other argument is specified in your pipeline's configuration.\nIn that "
"case pass an empty list for the previously not specified argument to avoid this error.")
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
"{value}.")
E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
"method in '{name}'. If you want to use this method, make "
"sure it's overwritten on the subclass.")
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
"knowledge base, use `InMemoryLookupKB`.")
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
"with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_switch_port=True` to pick an available port automatically.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -1,3 +1,7 @@
import warnings
from .errors import Warnings
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
@ -11,6 +15,8 @@ def explain(term):
"""
if term in GLOSSARY:
return GLOSSARY[term]
else:
warnings.warn(Warnings.W118.format(term=term))
GLOSSARY = {
@ -267,6 +273,7 @@ GLOSSARY = {
"relcl": "relative clause modifier",
"reparandum": "overridden disfluency",
"root": "root",
"ROOT": "root",
"vocative": "vocative",
"xcomp": "open clausal complement",
# Dependency labels (German)
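Usage is unchanged; unknown terms now emit W118 instead of silently returning None (the terms below are examples):

import spacy

print(spacy.explain("nsubj"))      # nominal subject
print(spacy.explain("ROOT"))       # root, covered by the added entry
spacy.explain("not-a-real-label")  # warns with W118 and returns None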

3
spacy/kb/__init__.py Normal file
View File

@ -0,0 +1,3 @@
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch

12
spacy/kb/candidate.pxd Normal file
View File

@ -0,0 +1,12 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
from ..typedefs cimport hash_t
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
cdef readonly KnowledgeBase kb
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector
cdef hash_t alias_hash
cdef float prior_prob

74
spacy/kb/candidate.pyx Normal file
View File

@ -0,0 +1,74 @@
# cython: infer_types=True, profile=True
from typing import Iterable
from .kb cimport KnowledgeBase
from ..tokens import Span
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init
"""
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
self.entity_vector = entity_vector
self.alias_hash = alias_hash
self.prior_prob = prior_prob
@property
def entity(self) -> int:
"""RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_hash
@property
def entity_(self) -> str:
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self) -> int:
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self) -> str:
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
def entity_freq(self) -> float:
return self.entity_freq
@property
def entity_vector(self) -> Iterable[float]:
return self.entity_vector
@property
def prior_prob(self) -> float:
return self.prior_prob
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for a given mention, fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
return kb.get_candidates(mention)
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
"""
Return candidate entities for the given mentions by fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return kb.get_candidates_batch(mentions)
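
For illustration (not part of the diff), a minimal sketch of how the new helpers and the Candidate properties fit together with the refactored in-memory KB (spacy/kb/kb_in_memory.pyx, further down in this diff); the entity ID, frequency and prior probability are invented:

    import spacy
    from spacy.kb import InMemoryLookupKB, get_candidates

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Adams", entities=["Q42"], probabilities=[0.9])

    mention = nlp("Douglas Adams wrote books.")[1:2]        # the Span "Adams"
    for cand in get_candidates(kb, mention):
        print(cand.entity_, cand.alias_, cand.prior_prob)   # Q42 Adams (prior ~0.9)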

spacy/kb/kb.pxd

@ -0,0 +1,10 @@
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from libc.stdint cimport int64_t
from ..vocab cimport Vocab
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
cdef readonly int64_t entity_vector_length

spacy/kb/kb.pyx

@ -0,0 +1,108 @@
# cython: infer_types=True, profile=True
from pathlib import Path
from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool
from .candidate import Candidate
from ..tokens import Span
from ..util import SimpleFrozenList
from ..errors import Errors
cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
This is an abstract class and requires its operations to be implemented.
DOCS: https://spacy.io/api/kb
"""
def __init__(self, vocab: Vocab, entity_vector_length: int):
"""Create a KnowledgeBase."""
# Make sure abstract KB is not instantiated.
if self.__class__ == KnowledgeBase:
raise TypeError(
Errors.E1046.format(cls_name=self.__class__.__name__)
)
self.vocab = vocab
self.entity_vector_length = entity_vector_length
self.mem = Pool()
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
"""
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned.
mentions (Iterable[Span]): Mentions for which to get candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return [self.get_candidates(span) for span in mentions]
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned.
mention (Span): Mention for which to get candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
)
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
"""
Return vectors for entities.
entities (Iterable[str]): Entity names/IDs.
RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
"""
return [self.get_vector(entity) for entity in entities]
def get_vector(self, str entity) -> Iterable[float]:
"""
Return vector for entity.
entity (str): Entity name/ID.
RETURNS (Iterable[float]): Vector for specified entity.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
)
def to_bytes(self, **kwargs) -> bytes:
"""Serialize the current state to a binary string.
RETURNS (bytes): Current state as binary string.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
)
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
"""Load state from a binary string.
bytes_data (bytes): KB state.
exclude (Tuple[str]): Properties to exclude when restoring KB.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
)
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
"""
Write KnowledgeBase content to disk.
path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
)
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
"""
Load KnowledgeBase content from disk.
path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
)
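
For illustration (not part of the diff), a toy subclass of the new abstract base, assuming this branch; a real implementation would also override the (de)serialization hooks, which otherwise keep raising E1045:

    from typing import Iterable
    import spacy
    from spacy.kb import Candidate, KnowledgeBase
    from spacy.tokens import Span

    class EmptyKB(KnowledgeBase):
        """Hypothetical KB that knows no entities at all."""

        def get_candidates(self, mention: Span) -> Iterable[Candidate]:
            return []

        def get_vector(self, entity: str) -> Iterable[float]:
            return [0.0] * self.entity_vector_length

    nlp = spacy.blank("en")
    kb = EmptyKB(nlp.vocab, entity_vector_length=4)
    print(list(kb.get_candidates(nlp("Berlin")[0:1])))   # []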


@ -1,14 +1,12 @@
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC
from ..typedefs cimport hash_t
from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec
@ -16,21 +14,7 @@ ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
cdef readonly KnowledgeBase kb
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector
cdef hash_t alias_hash
cdef float prior_prob
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
cdef int64_t entity_vector_length
cdef class InMemoryLookupKB(KnowledgeBase):
# This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
# The PreshMap is pretty space efficient, as it uses open addressing. So


@ -1,8 +1,7 @@
# cython: infer_types=True, profile=True
from typing import Iterator, Iterable, Callable, Dict, Any
from typing import Iterable, Callable, Dict, Any, Union
import srsly
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
@ -12,103 +11,41 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings
from .typedefs cimport hash_t
from .errors import Errors, Warnings
from . import util
from .util import SimpleFrozenList, ensure_path
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned to a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate_init
"""
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
self.entity_vector = entity_vector
self.alias_hash = alias_hash
self.prior_prob = prior_prob
@property
def entity(self):
"""RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_hash
@property
def entity_(self):
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self):
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self):
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
def entity_freq(self):
return self.entity_freq
@property
def entity_vector(self):
return self.entity_vector
@property
def prior_prob(self):
return self.prior_prob
from ..tokens import Span
from ..typedefs cimport hash_t
from ..errors import Errors, Warnings
from .. import util
from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate
def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
"""
Return candidate entities for a given span by using the text of the span as the alias
and fetching appropriate entries from the index.
This particular function is optimized to work with the built-in KB functionality,
but any other custom candidate generation method can be used in combination with the KB as well.
"""
return kb.get_alias_candidates(span.text)
cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
DOCS: https://spacy.io/api/kb
DOCS: https://spacy.io/api/inmemorylookupkb
"""
def __init__(self, Vocab vocab, entity_vector_length):
"""Create a KnowledgeBase."""
self.mem = Pool()
self.entity_vector_length = entity_vector_length
"""Create an InMemoryLookupKB."""
super().__init__(vocab, entity_vector_length)
self._entry_index = PreshMap()
self._alias_index = PreshMap()
self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
def initialize_entities(self, int64_t nr_entities):
def _initialize_entities(self, int64_t nr_entities):
self._entry_index = PreshMap(nr_entities + 1)
self._entries = entry_vec(nr_entities + 1)
def initialize_vectors(self, int64_t nr_entities):
def _initialize_vectors(self, int64_t nr_entities):
self._vectors_table = float_matrix(nr_entities + 1)
def initialize_aliases(self, int64_t nr_aliases):
def _initialize_aliases(self, int64_t nr_aliases):
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
@property
def entity_vector_length(self):
"""RETURNS (uint64): length of the entity vectors"""
return self.entity_vector_length
def __len__(self):
return self.get_size_entities()
@ -155,8 +92,8 @@ cdef class KnowledgeBase:
raise ValueError(Errors.E140)
nr_entities = len(set(entity_list))
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self._initialize_entities(nr_entities)
self._initialize_vectors(nr_entities)
i = 0
cdef KBEntryC entry
@ -286,7 +223,10 @@ cdef class KnowledgeBase:
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
return self.get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
@ -388,9 +328,9 @@ cdef class KnowledgeBase:
nr_entities = header[0]
nr_aliases = header[1]
entity_vector_length = header[2]
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self.initialize_aliases(nr_aliases)
self._initialize_entities(nr_entities)
self._initialize_vectors(nr_entities)
self._initialize_aliases(nr_aliases)
self.entity_vector_length = entity_vector_length
def deserialize_vectors(b):
@ -512,8 +452,8 @@ cdef class KnowledgeBase:
cdef int64_t entity_vector_length
reader.read_header(&nr_entities, &entity_vector_length)
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self._initialize_entities(nr_entities)
self._initialize_vectors(nr_entities)
self.entity_vector_length = entity_vector_length
# STEP 1: load entity vectors
@ -552,7 +492,7 @@ cdef class KnowledgeBase:
# STEP 3: load aliases
cdef int64_t nr_aliases
reader.read_alias_length(&nr_aliases)
self.initialize_aliases(nr_aliases)
self._initialize_aliases(nr_aliases)
cdef int64_t nr_candidates
cdef vector[int64_t] entry_indices
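
To illustrate the renamed class and its new Span-based entry point (not part of the diff; data invented), get_candidates() on InMemoryLookupKB simply delegates to the older string-based get_alias_candidates():

    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=1)
    kb.add_entity(entity="Q64", freq=5, entity_vector=[0.0])
    kb.add_alias(alias="Berlin", entities=["Q64"], probabilities=[1.0])

    span = nlp("Berlin is big.")[0:1]
    assert [c.entity_ for c in kb.get_candidates(span)] == \
           [c.entity_ for c in kb.get_alias_candidates("Berlin")] == ["Q64"]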


@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
@ -16,6 +17,8 @@ class BulgarianDefaults(BaseDefaults):
stop_words = STOP_WORDS
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Bulgarian(Language):


@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
# use lookups, and fall back to the token itself
if not forms:
forms.append(string)
forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms


@ -258,6 +258,10 @@ ALPHA = group_chars(
ALPHA_LOWER = group_chars(_lower + _uncased)
ALPHA_UPPER = group_chars(_upper + _uncased)
_combining_diacritics = r"\u0300-\u036f"
COMBINING_DIACRITICS = _combining_diacritics
_units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
"kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "
@ -276,7 +280,7 @@ _currency = (
_punct = (
r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 · । ، ۔ ؛ ٪"
)
_quotes = r'\' " ” “ ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉'
_quotes = r'\' " ” “ ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
_hyphens = "- — -- --- —— ~"
# Various symbols like dingbats, but also emoji


@ -0,0 +1,16 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
class LowerSorbianDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class LowerSorbian(Language):
lang = "dsb"
Defaults = LowerSorbianDefaults
__all__ = ["LowerSorbian"]


@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.dsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
"Mi so tu jara derje spodoba.",
"Kotre nowniny chceće měć?",
"Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
"Zwóstanjo pótakem hyšći wjele źěła.",
]

spacy/lang/dsb/lex_attrs.py

@ -0,0 +1,113 @@
from ...attrs import LIKE_NUM
_num_words = [
"nul",
"jaden",
"jadna",
"jadno",
"dwa",
"dwě",
"tśi",
"tśo",
"styri",
"styrjo",
"pěś",
"pěśo",
"šesć",
"šesćo",
"sedym",
"sedymjo",
"wósym",
"wósymjo",
"źewjeś",
"źewjeśo",
"źaseś",
"źaseśo",
"jadnassćo",
"dwanassćo",
"tśinasćo",
"styrnasćo",
"pěśnasćo",
"šesnasćo",
"sedymnasćo",
"wósymnasćo",
"źewjeśnasćo",
"dwanasćo",
"dwaźasća",
"tśiźasća",
"styrźasća",
"pěśźaset",
"šesćźaset",
"sedymźaset",
"wósymźaset",
"źewjeśźaset",
"sto",
"tysac",
"milion",
"miliarda",
"bilion",
"biliarda",
"trilion",
"triliarda",
]
_ordinal_words = [
"prědny",
"prědna",
"prědne",
"drugi",
"druga",
"druge",
"tśeśi",
"tśeśa",
"tśeśe",
"stwórty",
"stwórta",
"stwórte",
"pêty",
"pěta",
"pête",
"šesty",
"šesta",
"šeste",
"sedymy",
"sedyma",
"sedyme",
"wósymy",
"wósyma",
"wósyme",
"źewjety",
"źewjeta",
"źewjete",
"źasety",
"źaseta",
"źasete",
"jadnasty",
"jadnasta",
"jadnaste",
"dwanasty",
"dwanasta",
"dwanaste",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}


@ -0,0 +1,15 @@
STOP_WORDS = set(
"""
a abo aby ako ale
daniž dokulaž
gaž
jolic
pak pótom
teke togodla
""".split()
)


@ -35,7 +35,7 @@ for pron in ["i"]:
_exc[orth + "m"] = [
{ORTH: orth, NORM: pron},
{ORTH: "m", "tenspect": 1, "number": 1},
{ORTH: "m"},
]
_exc[orth + "'ma"] = [
@ -139,26 +139,27 @@ for pron in ["he", "she", "it"]:
# W-words, relative pronouns, prepositions etc.
for word in [
"who",
"what",
"when",
"where",
"why",
"how",
"there",
"that",
"this",
"these",
"those",
for word, morph in [
("who", None),
("what", None),
("when", None),
("where", None),
("why", None),
("how", None),
("there", None),
("that", "Number=Sing|Person=3"),
("this", "Number=Sing|Person=3"),
("these", "Number=Plur|Person=3"),
("those", "Number=Plur|Person=3"),
]:
for orth in [word, word.title()]:
_exc[orth + "'s"] = [
{ORTH: orth, NORM: word},
{ORTH: "'s", NORM: "'s"},
]
if morph != "Number=Plur|Person=3":
_exc[orth + "'s"] = [
{ORTH: orth, NORM: word},
{ORTH: "'s", NORM: "'s"},
]
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
_exc[orth + "'ll"] = [
{ORTH: orth, NORM: word},
@ -182,25 +183,26 @@ for word in [
{ORTH: "ve", NORM: "have"},
]
_exc[orth + "'re"] = [
{ORTH: orth, NORM: word},
{ORTH: "'re", NORM: "are"},
]
if morph != "Number=Sing|Person=3":
_exc[orth + "'re"] = [
{ORTH: orth, NORM: word},
{ORTH: "'re", NORM: "are"},
]
_exc[orth + "re"] = [
{ORTH: orth, NORM: word},
{ORTH: "re", NORM: "are"},
]
_exc[orth + "re"] = [
{ORTH: orth, NORM: word},
{ORTH: "re", NORM: "are"},
]
_exc[orth + "'ve"] = [
{ORTH: orth, NORM: word},
{ORTH: "'ve"},
]
_exc[orth + "'ve"] = [
{ORTH: orth, NORM: word},
{ORTH: "'ve"},
]
_exc[orth + "ve"] = [
{ORTH: orth},
{ORTH: "ve", NORM: "have"},
]
_exc[orth + "ve"] = [
{ORTH: orth},
{ORTH: "ve", NORM: "have"},
]
_exc[orth + "'d"] = [
{ORTH: orth, NORM: word},
@ -447,7 +449,6 @@ for exc_data in [
{ORTH: "La.", NORM: "Louisiana"},
{ORTH: "Mar.", NORM: "March"},
{ORTH: "Mass.", NORM: "Massachusetts"},
{ORTH: "May.", NORM: "May"},
{ORTH: "Mich.", NORM: "Michigan"},
{ORTH: "Minn.", NORM: "Minnesota"},
{ORTH: "Miss.", NORM: "Mississippi"},


@ -9,14 +9,14 @@ Example sentences to test spaCy and its language models.
sentences = [
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",
"San Francisco analiza prohibir los robots delivery.",
"San Francisco analiza prohibir los robots de reparto.",
"Londres es una gran ciudad del Reino Unido.",
"El gato come pescado.",
"Veo al hombre con el telescopio.",
"La araña come moscas.",
"El pingüino incuba en su nido sobre el hielo.",
"¿Dónde estais?",
"¿Quién es el presidente Francés?",
"¿Dónde está encuentra la capital de Argentina?",
"¿Dónde estáis?",
"¿Quién es el presidente francés?",
"¿Dónde se encuentra la capital de Argentina?",
"¿Cuándo nació José de San Martín?",
]


@ -1,82 +1,80 @@
STOP_WORDS = set(
"""
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos
antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas
aquello aquellos aqui aquél aquélla aquéllas aquéllos aquí arriba arribaabajo
aseguró asi así atras aun aunque ayer añadió aún
a acuerdo adelante ademas además afirmó agregó ahi ahora ahí al algo alguna
algunas alguno algunos algún alli allí alrededor ambos ante anterior antes
apenas aproximadamente aquel aquella aquellas aquello aquellos aqui aquél
aquélla aquéllas aquéllos aquí arriba aseguró asi así atras aun aunque añadió
aún
bajo bastante bien breve buen buena buenas bueno buenos
cada casi cerca cierta ciertas cierto ciertos cinco claro comentó como con
conmigo conocer conseguimos conseguir considera consideró consigo consigue
consiguen consigues contigo contra cosas creo cual cuales cualquier cuando
cuanta cuantas cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas
cuánto cuántos cómo
cada casi cierta ciertas cierto ciertos cinco claro comentó como con conmigo
conocer conseguimos conseguir considera consideró consigo consigue consiguen
consigues contigo contra creo cual cuales cualquier cuando cuanta cuantas
cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas cuánto cuántos
cómo
da dado dan dar de debajo debe deben debido decir dejó del delante demasiado
demás dentro deprisa desde despacio despues después detras detrás dia dias dice
dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante día
días dónde
dicen dicho dieron diez diferente diferentes dijeron dijo dio doce donde dos
durante día días dónde
ejemplo el ella ellas ello ellos embargo empleais emplean emplear empleas
empleo en encima encuentra enfrente enseguida entonces entre era eramos eran
eras eres es esa esas ese eso esos esta estaba estaban estado estados estais
estamos estan estar estará estas este esto estos estoy estuvo está están ex
excepto existe existen explicó expresó él ésa ésas ése ésos ésta éstas éste
éstos
e el ella ellas ello ellos embargo en encima encuentra enfrente enseguida
entonces entre era eramos eran eras eres es esa esas ese eso esos esta estaba
estaban estado estados estais estamos estan estar estará estas este esto estos
estoy estuvo está están excepto existe existen explicó expresó él ésa ésas ése
ésos ésta éstas éste éstos
fin final fue fuera fueron fui fuimos
general gran grandes gueno
gran grande grandes
ha haber habia habla hablan habrá había habían hace haceis hacemos hacen hacer
hacerlo haces hacia haciendo hago han hasta hay haya he hecho hemos hicieron
hizo horas hoy hubo
hizo hoy hubo
igual incluso indicó informo informó intenta intentais intentamos intentan
intentar intentas intento ir
igual incluso indicó informo informó ir
junto
la lado largo las le lejos les llegó lleva llevar lo los luego lugar
la lado largo las le les llegó lleva llevar lo los luego
mal manera manifestó mas mayor me mediante medio mejor mencionó menos menudo mi
mia mias mientras mio mios mis misma mismas mismo mismos modo momento mucha
muchas mucho muchos muy más mía mías mío míos
mia mias mientras mio mios mis misma mismas mismo mismos modo mucha muchas
mucho muchos muy más mía mías mío míos
nada nadie ni ninguna ningunas ninguno ningunos ningún no nos nosotras nosotros
nuestra nuestras nuestro nuestros nueva nuevas nuevo nuevos nunca
nuestra nuestras nuestro nuestros nueva nuevas nueve nuevo nuevos nunca
ocho os otra otras otro otros
o ocho once os otra otras otro otros
pais para parece parte partir pasada pasado paìs peor pero pesar poca pocas
poco pocos podeis podemos poder podria podriais podriamos podrian podrias podrá
para parece parte partir pasada pasado paìs peor pero pesar poca pocas poco
pocos podeis podemos poder podria podriais podriamos podrian podrias podrá
podrán podría podrían poner por porque posible primer primera primero primeros
principalmente pronto propia propias propio propios proximo próximo próximos
pudo pueda puede pueden puedo pues
pronto propia propias propio propios proximo próximo próximos pudo pueda puede
pueden puedo pues
qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién quiénes qué
qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién
quiénes qué
raras realizado realizar realizó repente respecto
realizado realizar realizó repente respecto
sabe sabeis sabemos saben saber sabes salvo se sea sean segun segunda segundo
según seis ser sera será serán sería señaló si sido siempre siendo siete sigue
siguiente sin sino sobre sois sola solamente solas solo solos somos son soy
soyos su supuesto sus suya suyas suyo sólo
siguiente sin sino sobre sois sola solamente solas solo solos somos son soy su
supuesto sus suya suyas suyo suyos sólo
tal tambien también tampoco tan tanto tarde te temprano tendrá tendrán teneis
tenemos tener tenga tengo tenido tenía tercera ti tiempo tiene tienen toda
todas todavia todavía todo todos total trabaja trabajais trabajamos trabajan
trabajar trabajas trabajo tras trata través tres tu tus tuvo tuya tuyas tuyo
tuyos
tenemos tener tenga tengo tenido tenía tercera tercero ti tiene tienen toda
todas todavia todavía todo todos total tras trata través tres tu tus tuvo tuya
tuyas tuyo tuyos
ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes
u ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes
última últimas último últimos
va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero
vez vosotras vosotros voy vuestra vuestras vuestro vuestros
va vais vamos van varias varios vaya veces ver verdad verdadera verdadero vez
vosotras vosotros voy vuestra vuestras vuestro vuestros
ya yo
y ya yo
""".split()
)


@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
rules = rules_table.get(univ_pos, [])
string = string.lower()
forms = []
# first try lookup in table based on upos
if string in index:
forms.append(string)
self.cache[cache_key] = forms
return forms
# then add anything in the exceptions table
forms.extend(exceptions.get(string, []))
# if nothing found yet, use the rules
oov_forms = []
if not forms:
for old, new in rules:
@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
forms.append(form)
else:
oov_forms.append(form)
# if still nothing, add the oov forms from rules
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
# use lookups, which fall back to the token itself
if not forms:
forms.append(string)
forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
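
The new fallback step boils down to the following plain-Python logic (illustrative, with an invented one-entry lookup table): the first lemma from the lookup table is used, and the surface form itself is the final default.

    lookup_table = {"yeux": ["œil"]}
    for string in ("yeux", "zzz"):
        lemma = lookup_table.get(string, [string])[0]
        print(string, "->", lemma)   # yeux -> œil, zzz -> zzz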


@ -3,7 +3,7 @@ from ...attrs import LIKE_NUM
_num_words = set(
"""
zero un deux trois quatre cinq six sept huit neuf dix
zero un une deux trois quatre cinq six sept huit neuf dix
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante
cent mille mil million milliard billion quadrillion quintillion
@ -13,7 +13,7 @@ sextillion septillion octillion nonillion decillion
_ordinal_words = set(
"""
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième


@ -1,11 +1,15 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
class AncientGreekDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS


@ -0,0 +1,46 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from ..char_classes import CONCAT_QUOTES
_prefixes = (
[
"",
"",
]
+ LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_CURRENCY
+ LIST_ICONS
)
_suffixes = (
LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ [
"",
"",
r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
]
)
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",
]
)
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes


@ -0,0 +1,18 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults
class UpperSorbianDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class UpperSorbian(Language):
lang = "hsb"
Defaults = UpperSorbianDefaults
__all__ = ["UpperSorbian"]


@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.hsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
"Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
"A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!",
"Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.",
"Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.",
]

spacy/lang/hsb/lex_attrs.py

@ -0,0 +1,106 @@
from ...attrs import LIKE_NUM
_num_words = [
"nul",
"jedyn",
"jedna",
"jedne",
"dwaj",
"dwě",
"tři",
"třo",
"štyri",
"štyrjo",
"pjeć",
"šěsć",
"sydom",
"wosom",
"dźewjeć",
"dźesać",
"jědnaće",
"dwanaće",
"třinaće",
"štyrnaće",
"pjatnaće",
"šěsnaće",
"sydomnaće",
"wosomnaće",
"dźewjatnaće",
"dwaceći",
"třiceći",
"štyrceći",
"pjećdźesat",
"šěsćdźesat",
"sydomdźesat",
"wosomdźesat",
"dźewjećdźesat",
"sto",
"tysac",
"milion",
"miliarda",
"bilion",
"biliarda",
"trilion",
"triliarda",
]
_ordinal_words = [
"prěni",
"prěnja",
"prěnje",
"druhi",
"druha",
"druhe",
"třeći",
"třeća",
"třeće",
"štwórty",
"štwórta",
"štwórte",
"pjaty",
"pjata",
"pjate",
"šěsty",
"šěsta",
"šěste",
"sydmy",
"sydma",
"sydme",
"wosmy",
"wosma",
"wosme",
"dźewjaty",
"dźewjata",
"dźewjate",
"dźesaty",
"dźesata",
"dźesate",
"jědnaty",
"jědnata",
"jědnate",
"dwanaty",
"dwanata",
"dwanate",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}


@ -0,0 +1,19 @@
STOP_WORDS = set(
"""
a abo ale ani
dokelž
hdyž
jeli jelizo
kaž
pak potom
tež tohodla
zo zoby
""".split()
)


@ -0,0 +1,18 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = dict()
for exc_data in [
{ORTH: "mil.", NORM: "milion"},
{ORTH: "wob.", NORM: "wobydler"},
]:
_exc[exc_data[ORTH]] = [exc_data]
for orth in [
"resp.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -7,7 +7,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults
from ...tokens import Doc
from ...scorer import Scorer
from ...symbols import POS
from ...symbols import POS, X
from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
@ -57,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
for token, dtoken in zip(doc, dtokens):
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
token.pos = TAG_MAP[token.tag_][POS]
if token.tag_ in TAG_MAP:
token.pos = TAG_MAP[token.tag_][POS]
else:
token.pos = X
token.lemma_ = dtoken["lemma"]
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
return doc


@ -3,7 +3,7 @@ from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
_infixes = (
["·", "ㆍ", "\(", "\)"]
["·", "ㆍ", r"\(", r"\)"]
+ [r"(?<=[0-9])~(?=[0-9-])"]
+ LIST_QUOTES
+ BASE_TOKENIZER_INFIXES

spacy/lang/la/__init__.py

@ -0,0 +1,18 @@
from ...language import Language, BaseDefaults
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
class LatinDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Latin(Language):
lang = "la"
Defaults = LatinDefaults
__all__ = ["Latin"]


@ -0,0 +1,34 @@
from ...attrs import LIKE_NUM
import re
# cf. Goyvaerts/Levithan 2009; case-insensitive, allow 4
roman_numerals_compile = re.compile(
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
)
_num_words = set(
"""
unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
""".split()
)
_ordinal_words = set(
"""
primus prima primum secundus secunda secundum tertius tertia tertium
""".split()
)
def like_num(text):
if text.isdigit():
return True
if roman_numerals_compile.match(text):
return True
if text.lower() in _num_words:
return True
if text.lower() in _ordinal_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}
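
For illustration (not part of the diff), the Latin LIKE_NUM getter accepts digits, Roman numerals and the listed cardinal/ordinal words, assuming this branch is installed:

    from spacy.lang.la.lex_attrs import like_num

    print(like_num("XVII"))    # True  - Roman numeral matched by the regex
    print(like_num("septem"))  # True  - listed number word
    print(like_num("tertia"))  # True  - listed ordinal
    print(like_num("lupus"))   # False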


@ -0,0 +1,37 @@
# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin
STOP_WORDS = set(
"""
ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem
cum cur
de deinde dum
ego enim ergo es est et etiam etsi ex
fio
haud hic
iam idem igitur ille in infra inter interim ipse is ita
magis modo mox
nam ne nec necque neque nisi non nos
o ob
per possum post pro
quae quam quare qui quia quicumque quidem quilibet quis quisnam quisquam quisque quisquis quo quoniam
sed si sic sive sub sui sum super suus
tam tamen trans tu tum
ubi uel uero
vel vero
""".split()
)


@ -0,0 +1,76 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
## TODO: Look into systematically handling u/v
_exc = {
"mecum": [{ORTH: "me"}, {ORTH: "cum"}],
"tecum": [{ORTH: "te"}, {ORTH: "cum"}],
"nobiscum": [{ORTH: "nobis"}, {ORTH: "cum"}],
"vobiscum": [{ORTH: "vobis"}, {ORTH: "cum"}],
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
}
for orth in [
"A.",
"Agr.",
"Ap.",
"C.",
"Cn.",
"D.",
"F.",
"K.",
"L.",
"M'.",
"M.",
"Mam.",
"N.",
"Oct.",
"Opet.",
"P.",
"Paul.",
"Post.",
"Pro.",
"Q.",
"S.",
"Ser.",
"Sert.",
"Sex.",
"St.",
"Sta.",
"T.",
"Ti.",
"V.",
"Vol.",
"Vop.",
"U.",
"Uol.",
"Uop.",
"Ian.",
"Febr.",
"Mart.",
"Apr.",
"Mai.",
"Iun.",
"Iul.",
"Aug.",
"Sept.",
"Oct.",
"Nov.",
"Nou.",
"Dec.",
"Non.",
"Id.",
"A.D.",
"Coll.",
"Cos.",
"Ord.",
"Pl.",
"S.C.",
"Suff.",
"Trib.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

spacy/lang/lg/__init__.py

@ -0,0 +1,18 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
class LugandaDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Luganda(Language):
lang = "lg"
Defaults = LugandaDefaults
__all__ = ["Luganda"]

spacy/lang/lg/examples.py

@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.lg.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Mpa ebyafaayo ku byalo Nakatu ne Nkajja",
"Okuyita Ttembo kitegeeza kugwa ddalu",
"Ekifumu kino kyali kya mulimu ki?",
"Ekkovu we liyise wayitibwa mukululo",
"Akola mulimu ki oguvaamu ssente?",
"Emisumaali egikomerera embaawo giyitibwa nninga",
"Abooluganda abemmamba ababiri",
"Ekisaawe ky'ebyenjigiriza kya mugaso nnyo",
]


@ -0,0 +1,95 @@
from ...attrs import LIKE_NUM
_num_words = [
"nnooti", # Zero
"zeero", # zero
"emu", # one
"bbiri", # two
"ssatu", # three
"nnya", # four
"ttaano", # five
"mukaaga", # six
"musanvu", # seven
"munaana", # eight
"mwenda", # nine
"kkumi", # ten
"kkumi n'emu", # eleven
"kkumi na bbiri", # twelve
"kkumi na ssatu", # thirteen
"kkumi na nnya", # forteen
"kkumi na ttaano", # fifteen
"kkumi na mukaaga", # sixteen
"kkumi na musanvu", # seventeen
"kkumi na munaana", # eighteen
"kkumi na mwenda", # nineteen
"amakumi abiri", # twenty
"amakumi asatu", # thirty
"amakumi ana", # forty
"amakumi ataano", # fifty
"nkaaga", # sixty
"nsanvu", # seventy
"kinaana", # eighty
"kyenda", # ninety
"kikumi", # hundred
"lukumi", # thousand
"kakadde", # million
"kawumbi", # billion
"kase", # trillion
"katabalika", # quadrillion
"keesedde", # gajillion
"kafukunya", # bazillion
"ekisooka", # first
"ekyokubiri", # second
"ekyokusatu", # third
"ekyokuna", # fourth
"ekyokutaano", # fifith
"ekyomukaaga", # sixth
"ekyomusanvu", # seventh
"eky'omunaana", # eighth
"ekyomwenda", # nineth
"ekyekkumi", # tenth
"ekyekkumi n'ekimu", # eleventh
"ekyekkumi n'ebibiri", # twelveth
"ekyekkumi n'ebisatu", # thirteenth
"ekyekkumi n'ebina", # fourteenth
"ekyekkumi n'ebitaano", # fifteenth
"ekyekkumi n'omukaaga", # sixteenth
"ekyekkumi n'omusanvu", # seventeenth
"ekyekkumi n'omunaana", # eigteenth
"ekyekkumi n'omwenda", # nineteenth
"ekyamakumi abiri", # twentieth
"ekyamakumi asatu", # thirtieth
"ekyamakumi ana", # fortieth
"ekyamakumi ataano", # fiftieth
"ekyenkaaga", # sixtieth
"ekyensanvu", # seventieth
"ekyekinaana", # eightieth
"ekyekyenda", # ninetieth
"ekyekikumi", # hundredth
"ekyolukumi", # thousandth
"ekyakakadde", # millionth
"ekyakawumbi", # billionth
"ekyakase", # trillionth
"ekyakatabalika", # quadrillionth
"ekyakeesedde", # gajillionth
"ekyakafukunya", # bazillionth
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}


@ -0,0 +1,19 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
TOKENIZER_INFIXES = _infixes


@ -0,0 +1,19 @@
STOP_WORDS = set(
"""
abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu
atya awamu aweebwa ayinza ba baali babadde babalina bajja
bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye
bimu bingi bino bo bokka bonna buli bulijjo bulungi bwabwe bwaffe bwayo bwe bwonna bya byabwe
byaffe byebimu byonna ddaa ddala ddi e ebimu ebiri ebweruobulungi ebyo edda ejja ekirala ekyo
endala engeri ennyo era erimu erina ffe ffenna ga gujja gumu gunno guno gwa gwe kaseera kati
kennyini ki kiki kikino kikye kikyo kino kirungi kki ku kubangabyombi kubangaolwokuba kudda
kuva kuwa kwegamba kyaffe kye kyekimuoyo kyekyo kyonna leero liryo lwa lwaki lyabwezaabwe
lyaffe lyange mbadde mingi mpozzi mu mulinaoyina munda mwegyabwe nolwekyo nabadde nabo nandiyagadde
nandiye nanti naye ne nedda neera nga nnyingi nnyini nnyinza nnyo nti nyinza nze oba ojja okudda
okugenda okuggyako okutuusa okuva okuwa oli olina oluvannyuma olwekyobuva omuli ono osobola otya
oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina terina tetulina
tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula
wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe
ye yenna yennyini yina yonna ziba zijja zonna
""".split()
)


@ -15,7 +15,7 @@
STOP_WORDS = set(
"""
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
afgelopen aldus alhoewel anderzijds
ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven


@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
span_label = doc.vocab.strings.add("NP")
# Only NOUNS and PRONOUNS matter
end_span = -1
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
# For NOUNS
# Pick children from syntactic parse (only those with certain dependencies)
@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
children_i = [c.i for c in children] + [word.i]
start_span = min(children_i)
end_span = max(children_i) + 1
yield start_span, end_span, span_label
if start_span >= end_span:
end_span = max(children_i) + 1
yield start_span, end_span, span_label
# PRONOUNS only if it is the subject of a verb
elif word.pos == PRON:
if word.dep in pronoun_deps:
start_span = word.i
end_span = word.i + 1
yield start_span, end_span, span_label
if start_span >= end_span:
end_span = word.i + 1
yield start_span, end_span, span_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}


@ -1,5 +1,5 @@
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
@ -44,3 +44,23 @@ TOKENIZER_INFIXES = (
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
# Some languages e.g. written with the Cyrillic alphabet permit the use of diacritics
# to mark stressed syllables in words where stress is distinctive. Such languages
# should use the COMBINING_DIACRITICS... suffix and infix regex lists in
# place of the standard ones.
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [
r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS),
]
COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS
),
r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format(
a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS
),
r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
]
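
A small self-contained check of the extra suffix pattern (illustrative; ALPHA is narrowed to Cyrillic letters here only to keep the snippet short, whereas spaCy's real constant covers all alphabetic characters):

    import re

    COMBINING_DIACRITICS = r"\u0300-\u036f"
    ALPHA = r"а-яА-ЯёЁ"   # simplified stand-in for spaCy's ALPHA
    suffix = re.compile(r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS))

    text = "дома" + "\u0301" + "."       # stress mark (U+0301) on the final vowel
    print(bool(suffix.search(text)))     # True: the trailing "." can be split off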


@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Russian(Language):
@ -24,7 +28,7 @@ class Russian(Language):
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "pymorphy2",
"mode": "pymorphy3",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
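
On this branch, a blank Russian pipeline should therefore keep a trailing period separate even after a combining stress mark (illustrative; no trained model required):

    import spacy

    nlp = spacy.blank("ru")            # picks up the diacritics-aware suffix list
    text = "дома" + "\u0301" + "."     # "дома́." with an explicit combining accent
    print([t.text for t in nlp(text)]) # expected: ["дома́", "."]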


@ -19,33 +19,48 @@ class RussianLemmatizer(Lemmatizer):
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
mode: str = "pymorphy3",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Russian lemmatizer mode 'pymorphy2' requires the "
"pymorphy2 library. Install it with: pip install pymorphy2"
"The lemmatizer mode 'pymorphy2' requires the "
"pymorphy2 library and dictionaries. Install them with: "
"pip install pymorphy2"
"# for Ukrainian dictionaries:"
"pip install pymorphy2-dicts-uk"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
self._morph = MorphAnalyzer(lang="ru")
elif mode in {"pymorphy3", "pymorphy3_lookup"}:
try:
from pymorphy3 import MorphAnalyzer
except ImportError:
raise ImportError(
"The lemmatizer mode 'pymorphy3' requires the "
"pymorphy3 library and dictionaries. Install them with: "
"pip install pymorphy3"
"# for Ukrainian dictionaries:"
"pip install pymorphy3-dicts-uk"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="ru")
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
def _pymorphy_lemmatize(self, token: Token) -> List[str]:
string = token.text
univ_pos = token.pos_
morphology = token.morph.to_dict()
if univ_pos == "PUNCT":
return [PUNCT_RULES.get(string, string)]
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
# Skip unchangeable pos
return [string.lower()]
return self._pymorphy_lookup_lemmatize(token)
analyses = self._morph.parse(string)
filtered_analyses = []
for analysis in analyses:
@ -53,8 +68,10 @@ class RussianLemmatizer(Lemmatizer):
# Skip suggested parse variant for unknown word for pymorphy
continue
analysis_pos, _ = oc2ud(str(analysis.tag))
if analysis_pos == univ_pos or (
analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
if (
analysis_pos == univ_pos
or (analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN"))
or ((analysis_pos == "PRON") and (univ_pos == "DET"))
):
filtered_analyses.append(analysis)
if not len(filtered_analyses):
@ -97,13 +114,28 @@ class RussianLemmatizer(Lemmatizer):
dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
)
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
def _pymorphy_lookup_lemmatize(self, token: Token) -> List[str]:
string = token.text
analyses = self._morph.parse(string)
if len(analyses) == 1:
return [analyses[0].normal_form]
# often multiple forms would derive from the same normal form
# thus check _unique_ normal forms
normal_forms = set([an.normal_form for an in analyses])
if len(normal_forms) == 1:
return [next(iter(normal_forms))]
return [string]
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
return self._pymorphy_lemmatize(token)
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
return self._pymorphy_lookup_lemmatize(token)
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
return self._pymorphy_lemmatize(token)
def pymorphy3_lookup_lemmatize(self, token: Token) -> List[str]:
return self._pymorphy_lookup_lemmatize(token)
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
gram_map = {

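A minimal sketch of opting into the new lemmatizer mode (assumes this branch plus `pip install pymorphy3`); without a tagger in the pipeline, the lemmas come straight from pymorphy3's lookup:

    import spacy

    nlp = spacy.blank("ru")
    nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})
    doc = nlp("он видел кошек")
    print([t.lemma_ for t in doc])   # lemmas provided by pymorphy3
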
Some files were not shown because too many files have changed in this diff.