Merge branch 'master' into feature/coref

2025-11-19 01:05:56 +03:00 · 2021-05-15 20:05:17 +09:00 · 2021-05-15 20:05:17 +09:00 · 3608b7b3f9
commit 3608b7b3f9
parent e0c45c669a 2dc6db53fd
173 changed files with 5671 additions and 1643 deletions
--- a/.github/ISSUE_TEMPLATE/02_docs.md
+++ b/.github/ISSUE_TEMPLATE/02_docs.md
--- a/.github/ISSUE_TEMPLATE/02_install.md
+++ b/.github/ISSUE_TEMPLATE/02_install.md
@ -1,21 +0,0 @@
---
-name: "\U000023F3 Installation Problem"
-about: Do you have problems installing spaCy, and none of the suggestions in the docs
-  and other issues helped?
-
---
-<!-- Before submitting an issue, make sure to check the docs and closed issues to see if any of the solutions work for you. Installation problems can often be related to Python environment issues and problems with compilation. -->
-
-## How to reproduce the problem
-<!-- Include the details of how the problem occurred. Which command did you run to install spaCy? Did you come across an error? What else did you try? -->
-
-```bash
-# copy-paste the error message here
-```
-
-## Your Environment
-<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
-* Operating System:
-* Python Version Used:
-* spaCy Version Used:
-* Environment Information:
--- a/.github/ISSUE_TEMPLATE/03_other.md
+++ b/.github/ISSUE_TEMPLATE/03_other.md
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@ -0,0 +1,57 @@
+parameters:
+  python_version: ''
+  architecture: ''
+  prefix: ''
+  gpu: false
+  num_build_jobs: 1
+
+steps:
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: ${{ parameters.python_version }}
+      architecture: ${{ parameters.architecture }}
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U pip setuptools
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install dependencies"
+
+  - script: |
+      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
+      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
+    displayName: "Compile and build sdist"
+
+  - task: DeleteFiles@1
+    inputs:
+      contents: "spacy"
+    displayName: "Delete source directory"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
+      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+    displayName: "Uninstall all packages"
+
+  - bash: |
+      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      ${{ parameters.prefix }} python -m pip install dist/$SDIST
+    displayName: "Install from sdist"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
+      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
+    displayName: "Install GPU requirements"
+    condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+    displayName: "Run CPU tests"
+    condition: eq(${{ parameters.gpu }}, false)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
+    displayName: "Run GPU tests"
+    condition: eq(${{ parameters.gpu }}, true)
--- a/.github/contributors/AyushExel.md
+++ b/.github/contributors/AyushExel.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [X] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Ayush Chaurasia      |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2021-03-12           |
+| GitHub username                | AyushExel            |
+| Website (optional)             |                      |
--- a/.github/contributors/SamEdwardes.md
+++ b/.github/contributors/SamEdwardes.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Sam Edwardes         |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2021-04-02           |
+| GitHub username                | SamEdwardes          |
+| Website (optional)             | samedwardes.com      |
--- a/.github/contributors/armsp.md
+++ b/.github/contributors/armsp.md
@ -98,9 +98,9 @@ mark both statements:

 | Field                          | Entry                |
 |------------------------------- | -------------------- |
-| Name                           |  Shantam             |
+| Name                           |  Shantam Raj         |
 | Company name (if applicable)   |                      |
 | Title or role (if applicable)  |                      |
-| Date                           |   21/5/2018          |
+| Date                           |   10/4/2021          |
 | GitHub username                |     armsp            |
-| Website (optional)             |                      |
+| Website (optional)             |https://shantamraj.com|
--- a/.github/contributors/broaddeep.md
+++ b/.github/contributors/broaddeep.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Dongjun Park         |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2021-03-06           |
+| GitHub username                | broaddeep            |
+| Website (optional)             |                      |
--- a/.github/contributors/bsweileh.md
+++ b/.github/contributors/bsweileh.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           |  Belal               |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           |  March 13, 2021      |
+| GitHub username                |  bsweileh            |
+| Website (optional)             |                      |
--- a/.github/contributors/jankrepl.md
+++ b/.github/contributors/jankrepl.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Jan Krepl            |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2021-03-09           |
+| GitHub username                | jankrepl             |
+| Website (optional)             |                      |
--- a/.github/contributors/jklaise.md
+++ b/.github/contributors/jklaise.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do
+NOT mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           |Janis Klaise          |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           |26/04/2021            |
+| GitHub username                |jklaise               |
+| Website (optional)             |janisklaise.com       |
--- a/.github/contributors/meghanabhange.md
+++ b/.github/contributors/meghanabhange.md
@ -0,0 +1,107 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your 
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do
+NOT mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                    |
+|------------------------------- | ------------------------ |
+| Name                           | Meghana Bhange            |
+| Company name (if applicable)   | Verloop.io                 |
+| Title or role (if applicable)  | ML Engineer        |
+| Date                           | 2020-04-21               |
+| GitHub username                | meghanabhange            |
+| Website (optional)             | https://meghana.blog |
+
--- a/.github/contributors/plison.md
+++ b/.github/contributors/plison.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do
+NOT mark both statements:
+
+    * [ ] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Pierre Lison         |
+| Company name (if applicable)   | Norsk Regnesentral   |
+| Title or role (if applicable)  | Senior Researcher    |
+| Date                           | 22.04.2021           |
+| GitHub username                | plison               |
+| Website (optional)             | www.nr.no/~plison    |
--- a/.github/contributors/sevdimali.md
+++ b/.github/contributors/sevdimali.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do
+NOT mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Sevdimali            |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 10/4/2021            |
+| GitHub username                | sevdimali            |
+| Website (optional)             | https://sevdimali.me |
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -3,6 +3,7 @@ recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
 include LICENSE
 include README.md
 include pyproject.toml
+include spacy/py.typed
 recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -76,39 +76,24 @@ jobs:
      maxParallel: 4
    pool:
      vmImage: $(imageName)
-
    steps:
-      - task: UsePythonVersion@0
-        inputs:
-          versionSpec: "$(python.version)"
-          architecture: "x64"
+      - template: .github/azure-steps.yml
+        parameters:
+          python_version: '$(python.version)'
+          architecture: 'x64'

-      - script: |
-          python -m pip install -U setuptools
-          pip install -r requirements.txt
-        displayName: "Install dependencies"
-
-      - script: |
-          python setup.py build_ext --inplace
-          python setup.py sdist --formats=gztar
-        displayName: "Compile and build sdist"
-
-      - task: DeleteFiles@1
-        inputs:
-          contents: "spacy"
-        displayName: "Delete source directory"
-
-      - script: |
-          pip freeze > installed.txt
-          pip uninstall -y -r installed.txt
-        displayName: "Uninstall all packages"
-
-      - bash: |
-          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-          pip install dist/$SDIST
-        displayName: "Install from sdist"
-
-      - script: |
-          pip install -r requirements.txt
-          python -m pytest --pyargs spacy
-        displayName: "Run tests"
+  - job: "TestGPU"
+    dependsOn: "Validate"
+    strategy:
+      matrix:
+        Python38LinuxX64_GPU:
+          python.version: '3.8'
+    pool:
+      name: "LinuxX64_GPU"
+    steps:
+      - template: .github/azure-steps.yml
+        parameters:
+          python_version: '$(python.version)'
+          architecture: 'x64'
+          gpu: true
+          num_build_jobs: 24
--- a/examples/README.md
+++ b/examples/README.md
@ -0,0 +1,130 @@
+<a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
+
+# spaCy examples
+
+For spaCy v3 we've converted many of the [v2 example
+scripts](https://github.com/explosion/spaCy/tree/v2.3.x/examples/) into
+end-to-end [spaCy projects](https://spacy.io/usage/projects) workflows. The
+workflows include all the steps to go from data to packaged spaCy models.
+
+## 🪐 Pipeline component demos
+
+The simplest demos for training a single pipeline component are in the
+[`pipelines`](https://github.com/explosion/projects/blob/v3/pipelines) category
+including:
+
+- [`pipelines/ner_demo`](https://github.com/explosion/projects/blob/v3/pipelines/ner_demo):
+  Train a named entity recognizer
+- [`pipelines/textcat_demo`](https://github.com/explosion/projects/blob/v3/pipelines/textcat_demo):
+  Train a text classifier
+- [`pipelines/parser_intent_demo`](https://github.com/explosion/projects/blob/v3/pipelines/parser_intent_demo):
+  Train a dependency parser for custom semantics
+
+## 🪐 Tutorials
+
+The [`tutorials`](https://github.com/explosion/projects/blob/v3/tutorials)
+category includes examples that work through specific NLP use cases end-to-end:
+
+- [`tutorials/textcat_goemotions`](https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions):
+  Train a text classifier to categorize emotions in Reddit posts
+- [`tutorials/nel_emerson`](https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson):
+  Use an entity linker to disambiguate mentions of the same name
+
+Check out the [projects documentation](https://spacy.io/usage/projects) and
+browse through the [available
+projects](https://github.com/explosion/projects/)!
+
+## 🚀 Get started with a demo project
+
+The
+[`pipelines/ner_demo`](https://github.com/explosion/projects/blob/v3/pipelines/ner_demo)
+project converts the spaCy v2
+[`train_ner.py`](https://github.com/explosion/spaCy/blob/v2.3.x/examples/training/train_ner.py)
+demo script into a spaCy v3 project.
+
+1. Clone the project:
+
+   ```bash
+   python -m spacy project clone pipelines/ner_demo
+   ```
+
+2. Install requirements and download any data assets:
+
+   ```bash
+   cd ner_demo
+   python -m pip install -r requirements.txt
+   python -m spacy project assets
+   ```
+
+3. Run the default workflow to convert, train and evaluate:
+
+   ```bash
+   python -m spacy project run all
+   ```
+
+   Sample output:
+
+   ```none
+   ℹ Running workflow 'all'
+   
+   ================================== convert ==================================
+   Running command: /home/user/venv/bin/python scripts/convert.py en assets/train.json corpus/train.spacy
+   Running command: /home/user/venv/bin/python scripts/convert.py en assets/dev.json corpus/dev.spacy
+   
+   =============================== create-config ===============================
+   Running command: /home/user/venv/bin/python -m spacy init config --lang en --pipeline ner configs/config.cfg --force
+   ℹ Generated config template specific for your use case
+   - Language: en
+   - Pipeline: ner
+   - Optimize for: efficiency
+   - Hardware: CPU
+   - Transformer: None
+   ✔ Auto-filled config with all values
+   ✔ Saved config
+   configs/config.cfg
+   You can now add your data and train your pipeline:
+   python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
+   
+   =================================== train ===================================
+   Running command: /home/user/venv/bin/python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --training.eval_frequency 10 --training.max_steps 100 --gpu-id -1
+   ℹ Using CPU
+   
+   =========================== Initializing pipeline ===========================
+   [2021-03-11 19:34:59,101] [INFO] Set up nlp object from config
+   [2021-03-11 19:34:59,109] [INFO] Pipeline: ['tok2vec', 'ner']
+   [2021-03-11 19:34:59,113] [INFO] Created vocabulary
+   [2021-03-11 19:34:59,113] [INFO] Finished initializing nlp object
+   [2021-03-11 19:34:59,265] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
+   ✔ Initialized pipeline
+   
+   ============================= Training pipeline =============================
+   ℹ Pipeline: ['tok2vec', 'ner']
+   ℹ Initial learn rate: 0.001
+   E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
+   ---  ------  ------------  --------  ------  ------  ------  ------
+     0       0          0.00      7.90    0.00    0.00    0.00    0.00
+    10      10          0.11     71.07    0.00    0.00    0.00    0.00
+    20      20          0.65     22.44   50.00   50.00   50.00    0.50
+    30      30          0.22      6.38   80.00   66.67  100.00    0.80
+    40      40          0.00      0.00   80.00   66.67  100.00    0.80
+    50      50          0.00      0.00   80.00   66.67  100.00    0.80
+    60      60          0.00      0.00  100.00  100.00  100.00    1.00
+    70      70          0.00      0.00  100.00  100.00  100.00    1.00
+    80      80          0.00      0.00  100.00  100.00  100.00    1.00
+    90      90          0.00      0.00  100.00  100.00  100.00    1.00
+   100     100          0.00      0.00  100.00  100.00  100.00    1.00
+   ✔ Saved pipeline to output directory
+   training/model-last
+   ```
+
+4. Package the model:
+
+   ```bash
+   python -m spacy project run package
+   ```
+
+5. Visualize the model's output with [Streamlit](https://streamlit.io):
+
+   ```bash
+   python -m spacy project run visualize-model
+   ```
--- a/examples/training/README.md
+++ b/examples/training/README.md
@ -0,0 +1,5 @@
+<a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
+
+# spaCy examples
+
+See [examples/README.md](../README.md)
--- a/extra/example_data/ner_example_data/README.md
+++ b/extra/example_data/ner_example_data/README.md
@ -1,7 +1,25 @@
 ## Examples of NER/IOB data that can be converted with `spacy convert`

-spacy JSON training files were generated with:
+To convert an IOB file to `.spacy` ([`DocBin`](https://spacy.io/api/docbin))
+for spaCy v3:

+```bash
+python -m spacy convert -c iob -s -n 10 -b en_core_web_sm file.iob .
 ```
+
+See all the `spacy convert` options: https://spacy.io/api/cli#convert
+
+---
+
+The spaCy v2 JSON training files were generated using **spaCy v2** with:
+
+```bash
 python -m spacy convert -c iob -s -n 10 -b en file.iob
 ```
+
+To convert an existing JSON training file to `.spacy` for spaCy v3, convert
+with **spaCy v3**:
+
+```bash
+python -m spacy convert file.json .
+```
--- a/pyproject.toml
+++ b/pyproject.toml
@ -5,7 +5,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0,<8.1.0",
+    "thinc>=8.0.3,<8.1.0",
    "blis>=0.4.0,<0.8.0",
    "pathy",
    "numpy>=1.15.0",
--- a/requirements.txt
+++ b/requirements.txt
@ -1,16 +1,16 @@
 # Our libraries
-spacy-legacy>=3.0.0,<3.1.0
+spacy-legacy>=3.0.5,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0,<8.1.0
+thinc>=8.0.3,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.1,<1.1.0
-srsly>=2.4.0,<3.0.0
-catalogue>=2.0.1,<2.1.0
+srsly>=2.4.1,<3.0.0
+catalogue>=2.0.4,<2.1.0
 typer>=0.3.0,<0.4.0
-pathy
+pathy>=0.3.5
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
@ -20,12 +20,11 @@ jinja2
 # Official Python utilities
 setuptools
 packaging>=20.0
-importlib_metadata>=0.20; python_version < "3.8"
-typing_extensions>=3.7.4; python_version < "3.8"
+typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
 # Development dependencies
 cython>=0.25
 pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
-hypothesis
+hypothesis>=3.27.0,<7.0.0
--- a/setup.cfg
+++ b/setup.cfg
@ -34,20 +34,20 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0,<8.1.0
+    thinc>=8.0.3,<8.1.0
 install_requires =
    # Our libraries
-    spacy-legacy>=3.0.0,<3.1.0
+    spacy-legacy>=3.0.5,<3.1.0
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0,<8.1.0
+    thinc>=8.0.3,<8.1.0
    blis>=0.4.0,<0.8.0
    wasabi>=0.8.1,<1.1.0
-    srsly>=2.4.0,<3.0.0
-    catalogue>=2.0.1,<2.1.0
+    srsly>=2.4.1,<3.0.0
+    catalogue>=2.0.4,<2.1.0
    typer>=0.3.0,<0.4.0
-    pathy
+    pathy>=0.3.5
    # Third-party dependencies
    tqdm>=4.38.0,<5.0.0
    numpy>=1.15.0
@ -57,8 +57,7 @@ install_requires =
    # Official Python utilities
    setuptools
    packaging>=20.0
-    importlib_metadata>=0.20; python_version < "3.8"
-    typing_extensions>=3.7.4; python_version < "3.8"
+    typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"

 [options.entry_points]
 console_scripts =
@ -72,25 +71,27 @@ transformers =
 ray =
    spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<9.0.0
+    cupy>=5.0.0b4,<10.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<9.0.0
+    cupy-cuda80>=5.0.0b4,<10.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<9.0.0
+    cupy-cuda90>=5.0.0b4,<10.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<9.0.0
+    cupy-cuda91>=5.0.0b4,<10.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<9.0.0
+    cupy-cuda92>=5.0.0b4,<10.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<9.0.0
+    cupy-cuda100>=5.0.0b4,<10.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<9.0.0
+    cupy-cuda101>=5.0.0b4,<10.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<9.0.0
+    cupy-cuda102>=5.0.0b4,<10.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<9.0.0
+    cupy-cuda110>=5.0.0b4,<10.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<9.0.0
+    cupy-cuda111>=5.0.0b4,<10.0.0
+cuda112 =
+    cupy-cuda112>=5.0.0b4,<10.0.0
 # Language tokenizers with external dependencies
 ja =
    sudachipy>=0.4.9
--- a/spacy/init.py
+++ b/spacy/init.py
@ -28,6 +28,8 @@ if sys.maxunicode == 65535:

 def load(
    name: Union[str, Path],
+    *,
+    vocab: Union[Vocab, bool] = True,
    disable: Iterable[str] = util.SimpleFrozenList(),
    exclude: Iterable[str] = util.SimpleFrozenList(),
    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
@ -35,6 +37,7 @@ def load(
    """Load a spaCy model from an installed package or a local path.

    name (str): Package name or model path.
+    vocab (Union[Vocab, bool]): A Vocab object to use. If True (default), a new vocab is created.
    disable (Iterable[str]): Names of pipeline components to disable. Disabled
        pipes will be loaded but they won't be run unless you explicitly
        enable them by calling nlp.enable_pipe.
@ -44,7 +47,9 @@ def load(
        keyed by section values in dot notation.
    RETURNS (Language): The loaded nlp object.
    """
-    return util.load_model(name, disable=disable, exclude=exclude, config=config)
+    return util.load_model(
+        name, vocab=vocab, disable=disable, exclude=exclude, config=config
+    )


 def blank(
@ -52,7 +57,7 @@ def blank(
    *,
    vocab: Union[Vocab, bool] = True,
    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
-    meta: Dict[str, Any] = util.SimpleFrozenDict()
+    meta: Dict[str, Any] = util.SimpleFrozenDict(),
 ) -> Language:
    """Create a blank nlp object for a given language code.

--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.3"
+__version__ = "3.0.6"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -9,6 +9,7 @@ from .info import info  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
 from .train import train_cli  # noqa: F401
+from .assemble import assemble_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_config import debug_config  # noqa: F401
@ -29,9 +30,9 @@ from .project.document import project_document  # noqa: F401

@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
 def link(*args, **kwargs):
-    """As of spaCy v3.0, symlinks like "en" are deprecated. You can load trained
+    """As of spaCy v3.0, symlinks like "en" are not supported anymore. You can load trained
    pipeline packages using their full names or from a directory path."""
    msg.warn(
-        "As of spaCy v3.0, model symlinks are deprecated. You can load trained "
+        "As of spaCy v3.0, model symlinks are not supported anymore. You can load trained "
        "pipeline packages using their full names or from a directory path."
    )
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -11,6 +11,7 @@ from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
 from thinc.api import Config, ConfigValidationError, require_gpu
+from thinc.util import has_cupy, gpu_is_available
 from configparser import InterpolationError
 import os

@ -510,3 +511,5 @@ def setup_gpu(use_gpu: int) -> None:
        require_gpu(use_gpu)
    else:
        msg.info("Using CPU")
+        if has_cupy and gpu_is_available():
+            msg.info("To switch to GPU 0, use the option: --gpu-id 0")
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@ -0,0 +1,58 @@
+from typing import Optional
+from pathlib import Path
+from wasabi import msg
+import typer
+import logging
+
+from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code
+from ..training.initialize import init_nlp
+from .. import util
+from ..util import get_sourced_components, load_model_from_config
+
+
+@app.command(
+    "assemble",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def assemble_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
+    output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    # fmt: on
+):
+    """
+    Assemble a spaCy pipeline from a config file. The config file includes
+    all settings for initializing the pipeline. To override settings in the
+    config, e.g. settings that point to local paths or that you want to
+    experiment with, you can override them as command line options. The
+    --code argument lets you pass in a Python file that can be used to
+    register custom functions that are referenced in the config.
+
+    DOCS: https://spacy.io/api/cli#assemble
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    # Make sure all files and paths exist if they are needed
+    if not config_path or (str(config_path) != "-" and not config_path.exists()):
+        msg.fail("Config file not found", config_path, exits=1)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    msg.divider("Initializing pipeline")
+    nlp = load_model_from_config(config, auto_fill=True)
+    config = config.interpolate()
+    sourced = get_sourced_components(config)
+    # Make sure that listeners are defined before initializing further
+    nlp._link_components()
+    with nlp.select_pipes(disable=[*sourced]):
+        nlp.initialize()
+    msg.good("Initialized pipeline")
+    msg.divider("Serializing to disk")
+    if output_path is not None and not output_path.exists():
+        output_path.mkdir(parents=True)
+        msg.good(f"Created output directory: {output_path}")
+    nlp.to_disk(output_path)
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@ -20,7 +20,7 @@ def debug_config_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
    show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
    # fmt: on
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -1,4 +1,4 @@
-from typing import List, Sequence, Dict, Any, Tuple, Optional
+from typing import List, Sequence, Dict, Any, Tuple, Optional, Set
 from pathlib import Path
 from collections import Counter
 import sys
@ -13,6 +13,8 @@ from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
+from ..pipeline import Morphologizer
+from ..morphology import Morphology
 from ..language import Language
 from ..util import registry, resolve_dot_names
 from .. import util
@ -39,7 +41,7 @@ def debug_data_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
    no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
@ -171,8 +173,8 @@ def debug_data(
        )
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        msg.warn(
-            "{} words in training data without vectors ({:0.2f}%)".format(
-                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"]
+            "{} words in training data without vectors ({:.0f}%)".format(
+                n_missing_vectors, 100 * (n_missing_vectors / gold_train_data["n_words"])
            ),
        )
        msg.text(
@ -194,32 +196,32 @@ def debug_data(
        )
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
-        new_labels = [l for l in labels if l not in model_labels]
-        existing_labels = [l for l in labels if l in model_labels]
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_punct_ents_warning = False

        msg.divider("Named Entity Recognition")
-        msg.info(
-            f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
-        )
+        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
-        for label in new_labels:
+        for label in labels:
            if len(label) == 0:
-                msg.fail("Empty label found in new labels")
-        if new_labels:
-            labels_with_counts = [
-                (label, count)
-                for label, count in label_counts.most_common()
-                if label != "-"
-            ]
-            labels_with_counts = _format_labels(labels_with_counts, counts=True)
-            msg.text(f"New: {labels_with_counts}", show=verbose)
-        if existing_labels:
-            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
+                msg.fail("Empty label found in train data")
+        labels_with_counts = [
+            (label, count)
+            for label, count in label_counts.most_common()
+            if label != "-"
+        ]
+        labels_with_counts = _format_labels(labels_with_counts, counts=True)
+        msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
+        missing_labels = model_labels - labels
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
+            )
        if gold_train_data["ws_ents"]:
            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
            has_ws_ents_error = True
@ -228,10 +230,10 @@ def debug_data(
            msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
            has_punct_ents_warning = True

-        for label in new_labels:
+        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
-                    f"Low number of examples for new label '{label}' ({label_counts[label]})"
+                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

@ -276,22 +278,52 @@ def debug_data(
            )

    if "textcat" in factory_names:
-        msg.divider("Text Classification")
-        labels = [label for label in gold_train_data["cats"]]
-        model_labels = _get_labels_from_model(nlp, "textcat")
-        new_labels = [l for l in labels if l not in model_labels]
-        existing_labels = [l for l in labels if l in model_labels]
-        msg.info(
-            f"Text Classification: {len(new_labels)} new label(s), "
-            f"{len(existing_labels)} existing label(s)"
+        msg.divider("Text Classification (Exclusive Classes)")
+        labels = _get_labels_from_model(nlp, "textcat")
+        msg.info(f"Text Classification: {len(labels)} label(s)")
+        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
+        labels_with_counts = _format_labels(
+            gold_train_data["cats"].most_common(), counts=True
        )
-        if new_labels:
-            labels_with_counts = _format_labels(
-                gold_train_data["cats"].most_common(), counts=True
+        msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
+        missing_labels = labels - set(gold_train_data["cats"].keys())
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
+            )
+        if gold_train_data["n_cats_multilabel"] > 0:
+            # Note: you should never get here because you run into E895 on
+            # initialization first.
+            msg.warn(
+                "The train data contains instances without "
+                "mutually-exclusive classes. Use the component "
+                "'textcat_multilabel' instead of 'textcat'."
+            )
+        if gold_dev_data["n_cats_multilabel"] > 0:
+            msg.fail(
+                "Train/dev mismatch: the dev data contains instances "
+                "without mutually-exclusive classes while the train data "
+                "contains only instances with mutually-exclusive classes."
+            )
+
+    if "textcat_multilabel" in factory_names:
+        msg.divider("Text Classification (Multilabel)")
+        labels = _get_labels_from_model(nlp, "textcat_multilabel")
+        msg.info(f"Text Classification: {len(labels)} label(s)")
+        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
+        labels_with_counts = _format_labels(
+            gold_train_data["cats"].most_common(), counts=True
+        )
+        msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
+        missing_labels = labels - set(gold_train_data["cats"].keys())
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
            )
-            msg.text(f"New: {labels_with_counts}", show=verbose)
-        if existing_labels:
-            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.fail(
                f"The train and dev labels are not the same. "
@ -299,11 +331,6 @@ def debug_data(
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
-            msg.info(
-                "The train data contains instances without "
-                "mutually-exclusive classes. Use '--textcat-multilabel' "
-                "when training."
-            )
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
@ -311,9 +338,10 @@ def debug_data(
                    "dev data does not."
                )
        else:
-            msg.info(
+            msg.warn(
                "The train data contains only instances with "
-                "mutually-exclusive classes."
+                "mutually-exclusive classes. You can potentially use the "
+                "component 'textcat' instead of 'textcat_multilabel'."
            )
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
@ -325,13 +353,37 @@ def debug_data(
    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_train_data["tags"]]
-        # TODO: does this need to be updated?
-        msg.info(f"{len(labels)} label(s) in data")
+        model_labels = _get_labels_from_model(nlp, "tagger")
+        msg.info(f"{len(labels)} label(s) in train data")
+        missing_labels = model_labels - set(labels)
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
+            )
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

+    if "morphologizer" in factory_names:
+        msg.divider("Morphologizer (POS+Morph)")
+        labels = [label for label in gold_train_data["morphs"]]
+        model_labels = _get_labels_from_model(nlp, "morphologizer")
+        msg.info(f"{len(labels)} label(s) in train data")
+        missing_labels = model_labels - set(labels)
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
+            )
+        labels_with_counts = _format_labels(
+            gold_train_data["morphs"].most_common(), counts=True
+        )
+        msg.text(labels_with_counts, show=verbose)
+
    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")
@ -491,6 +543,7 @@ def _compile_gold(
        "ner": Counter(),
        "cats": Counter(),
        "tags": Counter(),
+        "morphs": Counter(),
        "deps": Counter(),
        "words": Counter(),
        "roots": Counter(),
@ -544,13 +597,36 @@ def _compile_gold(
                    data["ner"][combined_label] += 1
                elif label == "-":
                    data["ner"]["-"] += 1
-        if "textcat" in factory_names:
+        if "textcat" in factory_names or "textcat_multilabel" in factory_names:
            data["cats"].update(gold.cats)
            if list(gold.cats.values()).count(1.0) != 1:
                data["n_cats_multilabel"] += 1
        if "tagger" in factory_names:
            tags = eg.get_aligned("TAG", as_string=True)
            data["tags"].update([x for x in tags if x is not None])
+        if "morphologizer" in factory_names:
+            pos_tags = eg.get_aligned("POS", as_string=True)
+            morphs = eg.get_aligned("MORPH", as_string=True)
+            for pos, morph in zip(pos_tags, morphs):
+                # POS may align (same value for multiple tokens) when morph
+                # doesn't, so if either is misaligned (None), treat the
+                # annotation as missing so that truths doesn't end up with an
+                # unknown morph+POS combination
+                if pos is None or morph is None:
+                    pass
+                # If both are unset, the annotation is missing (empty morph
+                # converted from int is "_" rather than "")
+                elif pos == "" and morph == "":
+                    pass
+                # Otherwise, generate the combined label
+                else:
+                    label_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        label_dict[Morphologizer.POS_FEAT] = pos
+                    label = eg.reference.vocab.strings[
+                        eg.reference.vocab.morphology.add(label_dict)
+                    ]
+                    data["morphs"].update([label])
        if "parser" in factory_names:
            aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
            data["deps"].update([x for x in aligned_deps if x is not None])
@ -584,8 +660,8 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
    return count


-def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
+def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
    if pipe_name not in nlp.pipe_names:
        return set()
    pipe = nlp.get_pipe(pipe_name)
-    return pipe.labels
+    return set(pipe.labels)
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@ -1,5 +1,6 @@
 from typing import Dict, Any, Optional, Iterable
 from pathlib import Path
+import itertools

 from spacy.training import Example
 from spacy.util import resolve_dot_names
@ -73,23 +74,24 @@ def debug_model_cli(
        msg.info(f"Fixing random seed: {seed}")
        fix_random_seed(seed)
    pipe = nlp.get_pipe(component)
-    if not hasattr(pipe, "model"):
-        msg.fail(
-            f"The component '{component}' does not specify an object that holds a Model.",
-            exits=1,
-        )
-    model = pipe.model
-    debug_model(config, T, nlp, model, print_settings=print_settings)
+
+    debug_model(config, T, nlp, pipe, print_settings=print_settings)


 def debug_model(
    config,
    resolved_train_config,
    nlp,
-    model: Model,
+    pipe,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
 ):
+    if not hasattr(pipe, "model"):
+        msg.fail(
+            f"The component '{pipe}' does not specify an object that holds a Model.",
+            exits=1,
+        )
+    model = pipe.model
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@ -105,8 +107,6 @@ def debug_model(
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
-    X = _get_docs()
-    # The output vector might differ from the official type of the output layer
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
@ -114,15 +114,17 @@ def debug_model(
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
+            examples = list(itertools.islice(train_corpus(nlp), 5))
        except ValueError:
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
-                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
+                    examples = [Example.from_dict(x, {}) for x in _get_docs()]
+                    nlp.initialize(lambda: examples)
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
-                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
+                    "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
                    exits=1,
                )

@ -133,26 +135,23 @@ def debug_model(
    # STEP 2: Updating the model and printing again
    optimizer = Adam(0.001)
    set_dropout_rate(model, 0.2)
-    # ugly hack to deal with Tok2Vec listeners
-    tok2vec = None
-    if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
-        tok2vec = nlp.get_pipe("tok2vec")
+    # ugly hack to deal with Tok2Vec/Transformer listeners
+    upstream_component = None
+    if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name:
+        upstream_component = nlp.get_pipe("tok2vec")
+    if model.has_ref("tok2vec") and "transformer-listener" in model.get_ref("tok2vec").name:
+        upstream_component = nlp.get_pipe("transformer")
    goldY = None
    for e in range(3):
-        if tok2vec:
-            tok2vec.update([Example.from_dict(x, {}) for x in X])
-        Y, get_dX = model.begin_update(X)
-        if goldY is None:
-            goldY = _simulate_gold(Y)
-        dY = get_gradient(goldY, Y, model.ops)
-        get_dX(dY)
-        model.finish_update(optimizer)
+        if upstream_component:
+            upstream_component.update(examples)
+        pipe.update(examples)
    if print_settings.get("print_after_training"):
        msg.divider(f"STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
-    prediction = model.predict(X)
+    prediction = model.predict([ex.predicted for ex in examples])
    if print_settings.get("print_prediction"):
        msg.divider(f"STEP 3 - prediction")
        msg.info(str(prediction))
@ -160,19 +159,6 @@ def debug_model(
    msg.good(f"Succesfully ended analysis - model looks good.")


-def get_gradient(goldY, Y, ops):
-    return ops.asarray(Y) - ops.asarray(goldY)
-
-
-def _simulate_gold(element, counter=1):
-    if isinstance(element, Iterable):
-        for i in range(len(element)):
-            element[i] = _simulate_gold(element[i], counter + i)
-        return element
-    else:
-        return 1 / counter
-
-
 def _sentences():
    return [
        "Apple is looking at buying U.K. startup for $1 billion",
@ -209,11 +195,7 @@ def _print_model(model, print_settings):

            if dimensions:
                for name in node.dim_names:
-                    if node.has_dim(name):
-                        msg.info(f" - dim {name}: {node.get_dim(name)}")
-                    else:
-                        msg.info(f" - dim {name}: {node.has_dim(name)}")
-
+                    msg.info(f" - dim {name}: {node.maybe_get_dim(name)}")
            if parameters:
                for name in node.param_names:
                    if node.has_param(name):
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -60,7 +60,7 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
        model_name = model
        if model in OLD_MODEL_SHORTCUTS:
            msg.warn(
-                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please"
+                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
            )
            model_name = OLD_MODEL_SHORTCUTS[model]
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@ -10,7 +10,8 @@ from jinja2 import Template
 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+from ._util import string_to_list, import_code


 ROOT = Path(__file__).parent / "templates"
@ -70,7 +71,8 @@ def init_fill_config_cli(
    base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
    output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
-    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
+    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
+    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    # fmt: on
 ):
    """
@ -82,6 +84,7 @@ def init_fill_config_cli(

    DOCS: https://spacy.io/api/cli#init-fill-config
    """
+    import_code(code_path)
    fill_config(output_file, base_path, pretraining=pretraining, diff=diff)


--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -95,6 +95,13 @@ def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
                "then the new directory will be created for you.",
            )
    if resume_path is not None:
+        if resume_path.is_dir():
+            # This is necessary because Windows gives a Permission Denied when we
+            # try to open the directory later, which is confusing. See #7878
+            msg.fail(
+                f"--resume-path should be a weights file, but {resume_path} is a directory.",
+                exits=True,
+            )
        model_name = re.search(r"model\d+\.bin", str(resume_path))
        if not model_name and not epoch_resume:
            msg.fail(
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -16,7 +16,11 @@ gpu_allocator = null

 [nlp]
 lang = "{{ lang }}"
+{%- if "tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or (("textcat" in components or "textcat_multilabel" in components) and optimize == "accuracy") -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
+{%- else -%}
+{%- set full_pipeline = components %}
+{%- endif %}
 pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
 batch_size = {{ 128 if hardware == "gpu" else 1000 }}

@ -202,7 +206,7 @@ factory = "tok2vec"
@architectures = "spacy.Tok2Vec.v2"

 [components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 width = ${components.tok2vec.model.encode.width}
 {% if has_letters -%}
 attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -68,8 +68,11 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Controls early-stopping. 0 or -1 mean unlimited.
+# Controls early-stopping. 0 disables early stopping.
 patience = 1600
+# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
+# memory and shuffled within the training loop. -1 means stream train corpus
+# rather than loading in memory with no shuffling within the training loop.
 max_epochs = 0
 max_steps = 20000
 eval_frequency = 200
--- a/spacy/displacy/init.py
+++ b/spacy/displacy/init.py
@ -120,7 +120,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
    doc (Doc): Document do parse.
    RETURNS (dict): Generated dependency parse keyed by words and arcs.
    """
-    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data", "user_hooks"]))
    if not doc.has_annotation("DEP"):
        warnings.warn(Warnings.W005)
    if options.get("collapse_phrases", False):
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -73,8 +73,13 @@ class Warnings:
            "degree. If this is intentional or the language you're using "
            "doesn't have a normalization table, please ignore this warning. "
            "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed. The languages with lexeme normalization tables "
-            "are currently: {langs}")
+            "package installed and load the table in your config. The "
+            "languages with lexeme normalization tables are currently: "
+            "{langs}\n\nLoad the table in your config with:\n\n"
+            "[initialize.lookups]\n"
+            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "lang = ${{nlp.lang}}\n"
+            "tables = [\"lexeme_norm\"]\n")
    W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
            "attribute or operator.")

@ -147,6 +152,15 @@ class Warnings:
            "will be included in the results. For better results, token "
            "patterns should return matches that are each exactly one token "
            "long.")
+    W111 = ("Jupyter notebook detected: if using `prefer_gpu()` or "
+            "`require_gpu()`, include it in the same cell right before "
+            "`spacy.load()` to ensure that the model is loaded on the correct "
+            "device. More information: "
+            "https://spacy.io/usage/v3#jupyter-notebook-gpu")
+    W112 = ("The model specified to use for initial vectors ({name}) has no "
+            "vectors. This is almost certainly a mistake.")
+    W113 = ("Sourced component '{name}' may not work as expected: source "
+            "vectors are not identical to current pipeline vectors.")


@add_codes
@ -321,7 +335,8 @@ class Errors:
            "https://spacy.io/api/top-level#util.filter_spans")
    E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
            "token can only be part of one entity, so make sure the entities "
-            "you're setting don't overlap.")
+            "you're setting don't overlap. To work with overlapping entities, "
+            "consider using doc.spans instead.")
    E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
            "settings: {opts}")
    E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
@ -486,10 +501,26 @@ class Errors:
    E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

    # New errors added in v3.x
-
+    E872 = ("Unable to copy tokenizer from base model due to different "
+            'tokenizer settings: current tokenizer config "{curr_config}" '
+            'vs. base model "{base_config}"')
+    E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
+            "'{text}'. This is likely a bug in spaCy, so feel free to open an "
+            "issue: https://github.com/explosion/spaCy/issues")
+    E874 = ("Could not initialize the tok2vec model from component "
+            "'{component}' and layer '{layer}'.")
+    E875 = ("To use the PretrainVectors objective, make sure that static vectors are loaded. "
+            "In the config, these are defined by the initialize.vectors setting.")
+    E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
+            "a list of spans, with each span represented by a tuple (start_char, end_char). "
+            "The tuple can be optionally extended with a label and a KB ID.")
    E880 = ("The 'wandb' library could not be found - did you install it? "
            "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' "
            "config section, instead of the 'WandbLogger'.")
+    E884 = ("The pipeline could not be initialized because the vectors "
+            "could not be found at '{vectors}'. If your pipeline was already "
+            "initialized/trained before, call 'resume_training' instead of 'initialize', "
+            "or initialize only the components that are new.")
    E885 = ("entity_linker.set_kb received an invalid 'kb_loader' argument: expected "
            "a callable function, but got: {arg_type}")
    E886 = ("Can't replace {name} -> {tok2vec} listeners: path '{path}' not "
@ -610,7 +641,7 @@ class Errors:
            "method, make sure it's overwritten on the subclass.")
    E940 = ("Found NaN values in scores.")
    E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
-            "model from a shortcut, which is deprecated as of spaCy v3.0. To "
+            "model from a shortcut, which is obsolete as of spaCy v3.0. To "
            "load the model, use its full name instead:\n\n"
            "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
            "models, see the models directory: https://spacy.io/models. If you "
@ -625,8 +656,8 @@ class Errors:
            "returned the initialized nlp object instead?")
    E944 = ("Can't copy pipeline component '{name}' from source '{model}': "
            "not found in pipeline. Available components: {opts}")
-    E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded "
-            "nlp object, but got: {source}")
+    E945 = ("Can't copy pipeline component '{name}' from source. Expected "
+            "loaded nlp object, but got: {source}")
    E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
            "a string value from {expected} but got: '{arg}'")
    E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -58,7 +58,7 @@ GLOSSARY = {
    "FW": "foreign word",
    "HYPH": "punctuation mark, hyphen",
    "IN": "conjunction, subordinating or preposition",
-    "JJ": "adjective",
+    "JJ": "adjective (English), other noun-modifier (Chinese)",
    "JJR": "adjective, comparative",
    "JJS": "adjective, superlative",
    "LS": "list item marker",
@ -88,7 +88,7 @@ GLOSSARY = {
    "WP": "wh-pronoun, personal",
    "WP$": "wh-pronoun, possessive",
    "WRB": "wh-adverb",
-    "SP": "space",
+    "SP": "space (English), sentence-final particle (Chinese)",
    "ADD": "email",
    "NFP": "superfluous punctuation",
    "GW": "additional word in multi-word expression",
@ -152,6 +152,40 @@ GLOSSARY = {
    "VVIZU": 'infinitive with "zu", full',
    "VVPP": "perfect participle, full",
    "XY": "non-word containing non-letter",
+    # POS Tags (Chinese)
+    # OntoNotes / Chinese Penn Treebank
+    # https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports
+    "AD": "adverb",
+    "AS": "aspect marker",
+    "BA": "把 in ba-construction",
+    # "CD": "cardinal number",
+    "CS": "subordinating conjunction",
+    "DEC": "的 in a relative clause",
+    "DEG": "associative 的",
+    "DER": "得 in V-de const. and V-de-R",
+    "DEV": "地 before VP",
+    "ETC": "for words 等, 等等",
+    # "FW": "foreign words"
+    "IJ": "interjection",
+    # "JJ": "other noun-modifier",
+    "LB": "被 in long bei-const",
+    "LC": "localizer",
+    "M": "measure word",
+    "MSP": "other particle",
+    # "NN": "common noun",
+    "NR": "proper noun",
+    "NT": "temporal noun",
+    "OD": "ordinal number",
+    "ON": "onomatopoeia",
+    "P": "preposition excluding 把 and 被",
+    "PN": "pronoun",
+    "PU": "punctuation",
+    "SB": "被 in short bei-const",
+    # "SP": "sentence-final particle",
+    "VA": "predicative adjective",
+    "VC": "是 (copula)",
+    "VE": "有 as the main verb",
+    "VV": "other verb",
    # Noun chunks
    "NP": "noun phrase",
    "PP": "prepositional phrase",
--- a/spacy/lang/az/init.py
+++ b/spacy/lang/az/init.py
@ -0,0 +1,21 @@
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
+from ...language import Language
+
+
+class AzerbaijaniDefaults(Language.Defaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    token_match = TOKEN_MATCH
+    syntax_iterators = SYNTAX_ITERATORS
+
+
+class Azerbaijani(Language):
+    lang = "az"
+    Defaults = AzerbaijaniDefaults
+
+
+__all__ = ["Azerbaijani"]
--- a/spacy/lang/az/examples.py
+++ b/spacy/lang/az/examples.py
@ -0,0 +1,18 @@
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.az.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Bu bir cümlədir.",
+    "Necəsən?",
+    "Qarabağ ordeni vətən müharibəsində qələbə münasibəti ilə təsis edilmişdir.",
+    "Məktəbimizə Bakıdan bir tarix müəllimi gəlmişdi.",
+    "Atılan növbəti mərmilər lap yaxınlıqda partladı.",
+    "Sinqapur koronavirus baxımından ən təhlükəsiz ölkələr sırasındadır.",
+    "Marsda ilk sınaq uçuşu həyata keçirilib.",
+    "SSRİ dağılandan bəri 5 sahil dövləti Xəzərin statusunu müəyyən edə bilməyiblər.",
+    "Videoda beyninə xüsusi çip yerləşdirilmiş meymun əks olunub.",
+]
--- a/spacy/lang/az/lex_attrs.py
+++ b/spacy/lang/az/lex_attrs.py
@ -0,0 +1,89 @@
+from ...attrs import LIKE_NUM
+
+
+# Eleven, twelve etc. are written separate: on bir, on iki
+
+_num_words = [
+    "bir",
+    "iki",
+    "üç",
+    "dörd",
+    "beş",
+    "altı",
+    "yeddi",
+    "səkkiz",
+    "doqquz",
+    "on",
+    "iyirmi",
+    "otuz",
+    "qırx",
+    "əlli",
+    "altmış",
+    "yetmiş",
+    "səksən",
+    "doxsan",
+    "yüz",
+    "min",
+    "milyon",
+    "milyard",
+    "trilyon",
+    "kvadrilyon",
+    "kentilyon",
+]
+
+
+_ordinal_words = [
+    "birinci",
+    "ikinci",
+    "üçüncü",
+    "dördüncü",
+    "beşinci",
+    "altıncı",
+    "yeddinci",
+    "səkkizinci",
+    "doqquzuncu",
+    "onuncu",
+    "iyirminci",
+    "otuzuncu",
+    "qırxıncı",
+    "əllinci",
+    "altmışıncı",
+    "yetmişinci",
+    "səksəninci",
+    "doxsanıncı",
+    "yüzüncü",
+    "mininci",
+    "milyonuncu",
+    "milyardıncı",
+    "trilyonuncu",
+    "kvadrilyonuncu",
+    "kentilyonuncu",
+]
+
+_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
+
+
+def like_num(text):
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    text_lower = text.lower()
+    # Check cardinal number
+    if text_lower in _num_words:
+        return True
+    # Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    if text_lower.endswith(_ordinal_endings):
+        if text_lower[:-3].isdigit() or text_lower[:-4].isdigit():
+            return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/az/stop_words.py
+++ b/spacy/lang/az/stop_words.py
@ -0,0 +1,145 @@
+# Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py
+STOP_WORDS = set(
+    """
+amma
+arasında
+artıq
+ay
+az
+bax
+belə
+beş
+bilər
+bir
+biraz
+biri
+birşey
+biz
+bizim
+bizlər
+bu
+buna
+bundan
+bunların
+bunu
+bunun
+buradan
+bütün
+bəli
+bəlkə
+bəy
+bəzi
+bəzən
+daha
+dedi
+deyil
+dir
+düz
+də
+dək
+dən
+dəqiqə
+edir
+edən
+elə
+et
+etdi
+etmə
+etmək
+faiz
+gilə
+görə
+ha
+haqqında
+harada
+heç
+hə
+həm
+həmin
+həmişə
+hər
+idi
+il
+ildə
+ilk
+ilə
+in
+indi
+istifadə
+isə
+ki
+kim
+kimi
+kimə
+lakin
+lap
+mirşey
+məhz
+mən
+mənə
+niyə
+nə
+nəhayət
+o
+obirisi
+of
+olan
+olar
+olaraq
+oldu
+olduğu
+olmadı
+olmaz
+olmuşdur
+olsun
+olur
+on
+ona
+ondan
+onlar
+onlardan
+onların
+onsuzda
+onu
+onun
+oradan
+qarşı
+qədər
+saat
+sadəcə
+saniyə
+siz
+sizin
+sizlər
+sonra
+səhv
+sən
+sənin
+sənə
+təəssüf
+var
+və
+xan
+xanım
+xeyr
+ya
+yalnız
+yaxşı
+yeddi
+yenə
+yox
+yoxdur
+yoxsa
+yəni
+zaman
+çox
+çünki
+öz
+özü
+üçün
+əgər
+əlbəttə
+ən
+əslində
+""".split()
+)
--- a/spacy/lang/en/lex_attrs.py
+++ b/spacy/lang/en/lex_attrs.py
@ -35,7 +35,7 @@ def like_num(text: str) -> bool:
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
-    if text_lower.endswith("th"):
+    if text_lower.endswith(("st", "nd", "rd", "th")):
        if text_lower[:-2].isdigit():
            return True
    return False
--- a/spacy/lang/it/tokenizer_exceptions.py
+++ b/spacy/lang/it/tokenizer_exceptions.py
@ -17,14 +17,19 @@ _exc = {
 for orth in [
    "..",
    "....",
+    "a.C.",
    "al.",
    "all-path",
    "art.",
    "Art.",
    "artt.",
    "att.",
+    "avv.",
+    "Avv.",
    "by-pass",
    "c.d.",
+    "c/c",
+    "C.so",
    "centro-sinistra",
    "check-up",
    "Civ.",
@ -48,6 +53,8 @@ for orth in [
    "prof.",
    "sett.",
    "s.p.a.",
+    "s.n.c",
+    "s.r.l",
    "ss.",
    "St.",
    "tel.",
--- a/spacy/lang/nb/punctuation.py
+++ b/spacy/lang/nb/punctuation.py
@ -27,7 +27,7 @@ _infixes = (
    + [
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@ -90,12 +90,12 @@ class RussianLemmatizer(Lemmatizer):
            return [string.lower()]
        return list(set([analysis.normal_form for analysis in filtered_analyses]))

-    def lookup_lemmatize(self, token: Token) -> List[str]:
+    def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
        string = token.text
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
-            return analyses[0].normal_form
-        return string
+            return [analyses[0].normal_form]
+        return [string]


 def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
--- a/spacy/language.py
+++ b/spacy/language.py
@ -22,6 +22,7 @@ from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList, _pipe, raise_error
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
+from .util import warn_if_jupyter_cupy
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
@ -432,9 +433,9 @@ class Language:
        default_config (Dict[str, Any]): Default configuration, describing the
            default values of the factory arguments.
        assigns (Iterable[str]): Doc/Token attributes assigned by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
        requires (Iterable[str]): Doc/Token attributes required by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
        default_score_weights (Dict[str, float]): The scores to report during
@ -517,9 +518,9 @@ class Language:

        name (str): The name of the component factory.
        assigns (Iterable[str]): Doc/Token attributes assigned by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
        requires (Iterable[str]): Doc/Token attributes required by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
        func (Optional[Callable]): Factory function if not used as a decorator.
@ -681,9 +682,14 @@ class Language:
        name (str): Optional alternative name to use in current pipeline.
        RETURNS (Tuple[Callable, str]): The component and its factory name.
        """
-        # TODO: handle errors and mismatches (vectors etc.)
-        if not isinstance(source, self.__class__):
+        # Check source type
+        if not isinstance(source, Language):
            raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
+        # Check vectors, with faster checks first
+        if self.vocab.vectors.shape != source.vocab.vectors.shape or \
+                self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
+                self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
+            util.logger.warning(Warnings.W113.format(name=source_name))
        if not source_name in source.component_names:
            raise KeyError(
                Errors.E944.format(
@ -1219,13 +1225,12 @@ class Language:
        before_init = I["before_init"]
        if before_init is not None:
            before_init(self)
-        init_vocab(
-            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
-        )
-        pretrain_cfg = config.get("pretraining")
-        if pretrain_cfg:
-            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
-            init_tok2vec(self, P, I)
+        try:
+            init_vocab(
+                self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
+            )
+        except IOError:
+            raise IOError(Errors.E884.format(vectors=I["vectors"]))
        if self.vocab.vectors.data.shape[1] >= 1:
            ops = get_current_ops()
            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
@ -1244,6 +1249,10 @@ class Language:
                    proc.initialize, p_settings, section="components", name=name
                )
                proc.initialize(get_examples, nlp=self, **p_settings)
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, I)
        self._link_components()
        self._optimizer = sgd
        if sgd is not None:
@ -1592,6 +1601,7 @@ class Language:
        # using the nlp.config with all defaults.
        config = util.copy_config(config)
        orig_pipeline = config.pop("components", {})
+        orig_pretraining = config.pop("pretraining", None)
        config["components"] = {}
        if auto_fill:
            filled = registry.fill(config, validate=validate, schema=ConfigSchema)
@ -1599,6 +1609,9 @@ class Language:
            filled = config
        filled["components"] = orig_pipeline
        config["components"] = orig_pipeline
+        if orig_pretraining is not None:
+            filled["pretraining"] = orig_pretraining
+            config["pretraining"] = orig_pretraining
        resolved_nlp = registry.resolve(
            filled["nlp"], validate=validate, schema=ConfigSchemaNlp
        )
@ -1615,6 +1628,10 @@ class Language:
                or lang_cls is not cls
            ):
                raise ValueError(Errors.E943.format(value=type(lang_cls)))
+
+        # Warn about require_gpu usage in jupyter notebook
+        warn_if_jupyter_cupy()
+
        # Note that we don't load vectors here, instead they get loaded explicitly
        # inside stuff like the spacy train function. If we loaded them here,
        # then we would load them twice at runtime: once when we make from config,
@ -1661,7 +1678,16 @@ class Language:
                        # model with the same vocab as the current nlp object
                        source_nlps[model] = util.load_model(model, vocab=nlp.vocab)
                    source_name = pipe_cfg.get("component", pipe_name)
+                    listeners_replaced = False
+                    if "replace_listeners" in pipe_cfg:
+                        for name, proc in source_nlps[model].pipeline:
+                            if source_name in getattr(proc, "listening_components", []):
+                                source_nlps[model].replace_listeners(name, source_name, pipe_cfg["replace_listeners"])
+                                listeners_replaced = True
                    nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
+                    # Delete from cache if listeners were replaced
+                    if listeners_replaced:
+                        del source_nlps[model]
        disabled_pipes = [*config["nlp"]["disabled"], *disable]
        nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
        nlp.batch_size = config["nlp"]["batch_size"]
@ -1674,15 +1700,21 @@ class Language:
                )
        # Detect components with listeners that are not frozen consistently
        for name, proc in nlp.pipeline:
-            if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
-                for listener in proc.listening_components:
-                    # If it's a component sourced from another pipeline, we check if
-                    # the tok2vec listeners should be replaced with standalone tok2vec
-                    # models (e.g. so component can be frozen without its performance
-                    # degrading when other components/tok2vec are updated)
-                    paths = sourced.get(listener, {}).get("replace_listeners", [])
-                    if paths:
-                        nlp.replace_listeners(name, listener, paths)
+            # Remove listeners not in the pipeline
+            listener_names = getattr(proc, "listening_components", [])
+            unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names]
+            for listener_name in unused_listener_names:
+                for listener in proc.listener_map.get(listener_name, []):
+                    proc.remove_listener(listener, listener_name)
+
+            for listener in getattr(proc, "listening_components", []):  # e.g. tok2vec/transformer
+                # If it's a component sourced from another pipeline, we check if
+                # the tok2vec listeners should be replaced with standalone tok2vec
+                # models (e.g. so component can be frozen without its performance
+                # degrading when other components/tok2vec are updated)
+                paths = sourced.get(listener, {}).get("replace_listeners", [])
+                if paths:
+                    nlp.replace_listeners(name, listener, paths)
        return nlp

    def replace_listeners(
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@ -1,4 +1,4 @@
-from typing import Dict, Any, List, Union, Optional
+from typing import Any, List, Union, Optional
 from pathlib import Path
 import srsly
 from preshed.bloom import BloomFilter
@ -14,16 +14,16 @@ UNSET = object()

 def load_lookups(
    lang: str, tables: List[str], strict: bool = True
-) -> Optional[Dict[str, Any]]:
+) -> 'Lookups':
    """Load the data from the spacy-lookups-data package for a given language,
-    if available. Returns an empty dict if there's no data or if the package
+    if available. Returns an empty `Lookups` container if there's no data or if the package
    is not installed.

    lang (str): The language code (corresponds to entry point exposed by
        the spacy-lookups-data package).
    tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"]
    strict (bool): Whether to raise an error if a table doesn't exist.
-    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
+    RETURNS (Lookups): The lookups container containing the loaded tables.
    """
    # TODO: import spacy_lookups_data instead of going via entry points here?
    lookups = Lookups()
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@ -299,7 +299,7 @@ cdef class DependencyMatcher:
        if isinstance(doclike, Doc):
            doc = doclike
        elif isinstance(doclike, Span):
-            doc = doclike.as_doc()
+            doc = doclike.as_doc(copy_user_data=True)
        else:
            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))

--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@ -46,6 +46,12 @@ cdef struct TokenPatternC:
    int32_t nr_py
    quantifier_t quantifier
    hash_t key
+    int32_t token_idx
+
+
+cdef struct MatchAlignmentC:
+    int32_t token_idx
+    int32_t length


 cdef struct PatternStateC:
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -196,16 +196,24 @@ cdef class Matcher:
                else:
                    yield doc

-    def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
+    def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
        """Find all token sequences matching the supplied pattern.

        doclike (Doc or Span): The document to match over.
        as_spans (bool): Return Span objects with labels instead of (match_id,
            start, end) tuples.
+        allow_missing (bool): Whether to skip checks for missing annotation for
+            attributes included in patterns. Defaults to False.
+        with_alignments (bool): Return match alignment information, which is
+            a `List[int]` with the same length as the matched span. Each entry
+            denotes the index of the corresponding token pattern. If as_spans
+            is set to True, this setting is ignored.
        RETURNS (list): A list of `(match_id, start, end)` tuples,
            describing the matches. A match tuple describes a span
            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
            to True, a list of Span objects is returned.
+            If with_alignments is set to True and as_spans is set to False,
+            a list of `(match_id, start, end, alignments)` tuples is returned.
        """
        if isinstance(doclike, Doc):
            doc = doclike
@ -215,6 +223,9 @@ cdef class Matcher:
            length = doclike.end - doclike.start
        else:
            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
+        # Skip alignments calculations if as_spans is set
+        if as_spans:
+            with_alignments = False
        cdef Pool tmp_pool = Pool()
        if not allow_missing:
            for attr in (TAG, POS, MORPH, LEMMA, DEP):
@ -222,7 +233,7 @@ cdef class Matcher:
                    if attr == TAG:
                        pipe = "tagger"
                    elif attr in (POS, MORPH):
-                        pipe = "morphologizer"
+                        pipe = "morphologizer or tagger+attribute_ruler"
                    elif attr == LEMMA:
                        pipe = "lemmatizer"
                    elif attr == DEP:
@ -230,18 +241,20 @@ cdef class Matcher:
                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
                    raise ValueError(error_msg)
        matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-                                extensions=self._extensions, predicates=self._extra_predicates)
+                                extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
        final_matches = []
        pairs_by_id = {}
-        # For each key, either add all matches, or only the filtered, non-overlapping ones
-        for (key, start, end) in matches:
+        # For each key, either add all matches, or only the filtered,
+        # non-overlapping ones. Each `match` can be either (start, end) or
+        # (start, end, alignments) depending on the `with_alignments=` option.
+        for key, *match in matches:
            span_filter = self._filter.get(key)
            if span_filter is not None:
                pairs = pairs_by_id.get(key, [])
-                pairs.append((start,end))
+                pairs.append(match)
                pairs_by_id[key] = pairs
            else:
-                final_matches.append((key, start, end))
+                final_matches.append((key, *match))
        matched = <char*>tmp_pool.alloc(length, sizeof(char))
        empty = <char*>tmp_pool.alloc(length, sizeof(char))
        for key, pairs in pairs_by_id.items():
@ -253,21 +266,47 @@ cdef class Matcher:
                sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
            else:
                raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
-            for (start, end) in sorted_pairs:
+            for match in sorted_pairs:
+                start, end = match[:2]
                assert 0 <= start < end  # Defend against segfaults
                span_len = end-start
                # If no tokens in the span have matched
                if memcmp(&matched[start], &empty[start], span_len * sizeof(matched[0])) == 0:
-                    final_matches.append((key, start, end))
+                    final_matches.append((key, *match))
                    # Mark tokens that have matched
                    memset(&matched[start], 1, span_len * sizeof(matched[0]))
+        if with_alignments:
+            final_matches_with_alignments = final_matches
+            final_matches = [(key, start, end) for key, start, end, alignments in final_matches]
        # perform the callbacks on the filtered set of results
        for i, (key, start, end) in enumerate(final_matches):
            on_match = self._callbacks.get(key, None)
            if on_match is not None:
                on_match(self, doc, i, final_matches)
        if as_spans:
-            return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+            spans = []
+            for key, start, end in final_matches:
+                if isinstance(doclike, Span):
+                    start += doclike.start
+                    end += doclike.start
+                spans.append(Span(doc, start, end, label=key))
+            return spans
+        elif with_alignments:
+            # convert alignments List[Dict[str, int]] --> List[int]
+            final_matches = []
+            # when multiple alignments with the same length are found,
+            # keep the alignment that has the largest token_idx
+            for key, start, end, alignments in final_matches_with_alignments:
+                sorted_alignments = sorted(alignments, key=lambda x: (x['length'], x['token_idx']), reverse=False)
+                alignments = [0] * (end-start)
+                for align in sorted_alignments:
+                    if align['length'] >= end-start:
+                        continue
+                    # Since alignments are sorted by (length, token_idx),
+                    # this overwrites the smaller token_idx when they have the same length.
+                    alignments[align['length']] = align['token_idx']
+                final_matches.append((key, start, end, alignments))
+            return final_matches
        else:
            return final_matches

@ -286,9 +325,9 @@ def unpickle_matcher(vocab, patterns, callbacks):
    return matcher


-cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple()):
+cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0):
    """Find matches in a doc, with a compiled array of patterns. Matches are
-    returned as a list of (id, start, end) tuples.
+    returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0)

    To augment the compiled patterns, we optionally also take two Python lists.

@ -300,6 +339,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
    """
    cdef vector[PatternStateC] states
    cdef vector[MatchC] matches
+    cdef vector[vector[MatchAlignmentC]] align_states
+    cdef vector[vector[MatchAlignmentC]] align_matches
    cdef PatternStateC state
    cdef int i, j, nr_extra_attr
    cdef Pool mem = Pool()
@ -326,12 +367,14 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
    for i in range(length):
        for j in range(n):
            states.push_back(PatternStateC(patterns[j], i, 0))
-        transition_states(states, matches, predicate_cache,
-            doclike[i], extra_attr_values, predicates)
+        if with_alignments != 0:
+            align_states.resize(states.size())
+        transition_states(states, matches, align_states, align_matches, predicate_cache,
+            doclike[i], extra_attr_values, predicates, with_alignments)
        extra_attr_values += nr_extra_attr
        predicate_cache += len(predicates)
    # Handle matches that end in 0-width patterns
-    finish_states(matches, states)
+    finish_states(matches, states, align_matches, align_states, with_alignments)
    seen = set()
    for i in range(matches.size()):
        match = (
@ -344,16 +387,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
        # first .?, or the second .? -- it doesn't matter, it's just one match.
        # Skip 0-length matches. (TODO: fix algorithm)
        if match not in seen and matches[i].length > 0:
-            output.append(match)
+            if with_alignments != 0:
+                # since align_matches has the same length as matches, we can share the same index 'i'
+                output.append(match + (align_matches[i],))
+            else:
+                output.append(match)
            seen.add(match)
    return output


 cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
+                            vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
                            int8_t* cached_py_predicates,
-        Token token, const attr_t* extra_attrs, py_predicates) except *:
+        Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
    cdef int q = 0
    cdef vector[PatternStateC] new_states
+    cdef vector[vector[MatchAlignmentC]] align_new_states
    cdef int nr_predicate = len(py_predicates)
    for i in range(states.size()):
        if states[i].pattern.nr_py >= 1:
@ -368,23 +417,39 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
        # it in the states list, because q doesn't advance.
        state = states[i]
        states[q] = state
+        # Alignments are kept separate from `states` so that performance is preserved for users who only need basic options (without alignments).
+        # `align_states` always corresponds to `states` 1:1.
+        if with_alignments != 0:
+            align_state = align_states[i]
+            align_states[q] = align_state
        while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND):
+            # Update alignment before the transition of current state
+            # 'MatchAlignmentC' maps 'original token index of current pattern' to 'current matching length'
+            if with_alignments != 0:
+                align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
            if action == RETRY_EXTEND:
                # This handles the 'extend'
                new_states.push_back(
                    PatternStateC(pattern=states[q].pattern, start=state.start,
                                  length=state.length+1))
+                if with_alignments != 0:
+                    align_new_states.push_back(align_states[q])
            if action == RETRY_ADVANCE:
                # This handles the 'advance'
                new_states.push_back(
                    PatternStateC(pattern=states[q].pattern+1, start=state.start,
                                  length=state.length+1))
+                if with_alignments != 0:
+                    align_new_states.push_back(align_states[q])
            states[q].pattern += 1
            if states[q].pattern.nr_py != 0:
                update_predicate_cache(cached_py_predicates,
                    states[q].pattern, token, py_predicates)
            action = get_action(states[q], token.c, extra_attrs,
                                cached_py_predicates)
+        # Update alignment before the transition of current state
+        if with_alignments != 0:
+            align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
        if action == REJECT:
            pass
        elif action == ADVANCE:
@ -397,29 +462,50 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                            length=state.length+1))
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_states[q])
            elif action == MATCH_DOUBLE:
                # push match without last token if length > 0
                if state.length > 0:
                    matches.push_back(
                        MatchC(pattern_id=ent_id, start=state.start,
                                length=state.length))
+                    # MATCH_DOUBLE emits matches twice,
+                    # add one more to align_matches in order to keep 1:1 relationship
+                    if with_alignments != 0:
+                        align_matches.push_back(align_states[q])
                # push match with last token
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                            length=state.length+1))
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_states[q])
            elif action == MATCH_REJECT:
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                            length=state.length))
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_states[q])
            elif action == MATCH_EXTEND:
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start,
                           length=state.length))
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_states[q])
                states[q].length += 1
                q += 1
    states.resize(q)
    for i in range(new_states.size()):
        states.push_back(new_states[i])
+    # `align_states` always corresponds to `states` 1:1
+    if with_alignments != 0:
+        align_states.resize(q)
+        for i in range(align_new_states.size()):
+            align_states.push_back(align_new_states[i])


 cdef int update_predicate_cache(int8_t* cache,
@ -442,15 +528,27 @@ cdef int update_predicate_cache(int8_t* cache,
                raise ValueError(Errors.E125.format(value=result))


-cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
+cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
+                        vector[vector[MatchAlignmentC]]& align_matches,
+                        vector[vector[MatchAlignmentC]]& align_states,
+                        bint with_alignments) except *:
    """Handle states that end in zero-width patterns."""
    cdef PatternStateC state
+    cdef vector[MatchAlignmentC] align_state
    for i in range(states.size()):
        state = states[i]
+        if with_alignments != 0:
+            align_state = align_states[i]
        while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
+            # Update alignment before the transition of current state
+            if with_alignments != 0:
+                align_state.push_back(MatchAlignmentC(state.pattern.token_idx, state.length))
            is_final = get_is_final(state)
            if is_final:
                ent_id = get_ent_id(state.pattern)
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_state)
                matches.push_back(
                    MatchC(pattern_id=ent_id, start=state.start, length=state.length))
                break
@ -605,7 +703,7 @@ cdef int8_t get_quantifier(PatternStateC state) nogil:
 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL:
    pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
    cdef int i, index
-    for i, (quantifier, spec, extensions, predicates) in enumerate(token_specs):
+    for i, (quantifier, spec, extensions, predicates, token_idx) in enumerate(token_specs):
        pattern[i].quantifier = quantifier
        # Ensure attrs refers to a null pointer if nr_attr == 0
        if len(spec) > 0:
@ -626,6 +724,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
            pattern[i].py_predicates[j] = index
        pattern[i].nr_py = len(predicates)
        pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
+        pattern[i].token_idx = token_idx
    i = len(token_specs)
    # Use quantifier to identify final ID pattern node (rather than previous
    # uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs)
@ -636,6 +735,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
    pattern[i].nr_attr = 1
    pattern[i].nr_extra_attr = 0
    pattern[i].nr_py = 0
+    pattern[i].token_idx = -1
    return pattern


@ -653,7 +753,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
    """This function interprets the pattern, converting the various bits of
    syntactic sugar before we compile it into a struct with init_pattern.

-    We need to split the pattern up into three parts:
+    We need to split the pattern up into four parts:
    * Normal attribute/value pairs, which are stored on either the token or lexeme,
        can be handled directly.
    * Extension attributes are handled specially, as we need to prefetch the
@ -662,13 +762,14 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
        functions and store them. So we store these specially as well.
    * Extension attributes that have extra predicates are stored within the
        extra_predicates.
+    * Token index that this pattern belongs to.
    """
    tokens = []
    string_store = vocab.strings
-    for spec in token_specs:
+    for token_idx, spec in enumerate(token_specs):
        if not spec:
            # Signifier for 'any token'
-            tokens.append((ONE, [(NULL_ATTR, 0)], [], []))
+            tokens.append((ONE, [(NULL_ATTR, 0)], [], [], token_idx))
            continue
        if not isinstance(spec, dict):
            raise ValueError(Errors.E154.format())
@ -677,7 +778,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
        extensions = _get_extensions(spec, string_store, extensions_table)
        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
        for op in ops:
-            tokens.append((op, list(attr_values), list(extensions), list(predicates)))
+            tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
    return tokens


--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@ -194,7 +194,7 @@ cdef class PhraseMatcher:
                        if attr == TAG:
                            pipe = "tagger"
                        elif attr in (POS, MORPH):
-                            pipe = "morphologizer"
+                            pipe = "morphologizer or tagger+attribute_ruler"
                        elif attr == LEMMA:
                            pipe = "lemmatizer"
                        elif attr == DEP:
--- a/spacy/ml/_character_embed.py
+++ b/spacy/ml/_character_embed.py
@ -3,8 +3,10 @@ from thinc.api import Model
 from thinc.types import Floats2d

 from ..tokens import Doc
+from ..util import registry


+@registry.layers("spacy.CharEmbed.v1")
 def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
    # nM: Number of dimensions per character. nC: Number of characters.
    return Model(
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@ -8,10 +8,10 @@ from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab


-@registry.architectures.register("spacy.EntityLinker.v1")
+@registry.architectures("spacy.EntityLinker.v1")
 def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
    with Model.define_operators({">>": chain, "**": clone}):
-        token_width = tok2vec.get_dim("nO")
+        token_width = tok2vec.maybe_get_dim("nO")
        output_layer = Linear(nO=nO, nI=token_width)
        model = (
            tok2vec
@ -25,7 +25,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
    return model


-@registry.misc.register("spacy.KBFromFile.v1")
+@registry.misc("spacy.KBFromFile.v1")
 def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
    def kb_from_file(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=1)
@ -35,7 +35,7 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
    return kb_from_file


-@registry.misc.register("spacy.EmptyKB.v1")
+@registry.misc("spacy.EmptyKB.v1")
 def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
    def empty_kb_factory(vocab):
        return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
@ -43,6 +43,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
    return empty_kb_factory


-@registry.misc.register("spacy.CandidateGenerator.v1")
+@registry.misc("spacy.CandidateGenerator.v1")
 def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
    return get_candidates
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@ -16,11 +16,13 @@ if TYPE_CHECKING:
    from ...tokens import Doc  # noqa: F401


-@registry.architectures.register("spacy.PretrainVectors.v1")
+@registry.architectures("spacy.PretrainVectors.v1")
 def create_pretrain_vectors(
    maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        if vocab.vectors.data.shape[1] == 0:
+            raise ValueError(Errors.E875)
        model = build_cloze_multi_task_model(
            vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
@ -40,7 +42,7 @@ def create_pretrain_vectors(
    return create_vectors_objective


-@registry.architectures.register("spacy.PretrainCharacters.v1")
+@registry.architectures("spacy.PretrainCharacters.v1")
 def create_pretrain_characters(
    maxout_pieces: int, hidden_size: int, n_characters: int
 ) -> Callable[["Vocab", Model], Model]:
@ -134,7 +136,7 @@ def build_cloze_characters_multi_task_model(
 ) -> Model:
    output_layer = chain(
        list2array(),
-        Maxout(hidden_size, nP=maxout_pieces),
+        Maxout(nO=hidden_size, nP=maxout_pieces),
        LayerNorm(nI=hidden_size),
        MultiSoftmax([256] * nr_char, nI=hidden_size),
    )
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@ -10,7 +10,7 @@ from ..tb_framework import TransitionModel
 from ...tokens import Doc


-@registry.architectures.register("spacy.TransitionBasedParser.v1")
+@registry.architectures("spacy.TransitionBasedParser.v1")
 def transition_parser_v1(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
@ -31,7 +31,7 @@ def transition_parser_v1(
    )


-@registry.architectures.register("spacy.TransitionBasedParser.v2")
+@registry.architectures("spacy.TransitionBasedParser.v2")
 def transition_parser_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    state_type: Literal["parser", "ner"],
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@ -6,7 +6,7 @@ from ...util import registry
 from ...tokens import Doc


-@registry.architectures.register("spacy.Tagger.v1")
+@registry.architectures("spacy.Tagger.v1")
 def build_tagger_model(
    tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
 ) -> Model[List[Doc], List[Floats2d]]:
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@ -15,7 +15,7 @@ from ...tokens import Doc
 from .tok2vec import get_tok2vec_width


-@registry.architectures.register("spacy.TextCatCNN.v1")
+@registry.architectures("spacy.TextCatCNN.v1")
 def build_simple_cnn_text_classifier(
    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
@ -41,7 +41,7 @@ def build_simple_cnn_text_classifier(
    return model


-@registry.architectures.register("spacy.TextCatBOW.v1")
+@registry.architectures("spacy.TextCatBOW.v1")
 def build_bow_text_classifier(
    exclusive_classes: bool,
    ngram_size: int,
@ -60,7 +60,7 @@ def build_bow_text_classifier(
    return model


-@registry.architectures.register("spacy.TextCatEnsemble.v2")
+@registry.architectures("spacy.TextCatEnsemble.v2")
 def build_text_classifier_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    linear_model: Model[List[Doc], Floats2d],
@ -112,7 +112,7 @@ def init_ensemble_textcat(model, X, Y) -> Model:
    return model


-@registry.architectures.register("spacy.TextCatLowData.v1")
+@registry.architectures("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
    width: int, dropout: Optional[float], nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@ -14,7 +14,7 @@ from ...pipeline.tok2vec import Tok2VecListener
 from ...attrs import intify_attr


-@registry.architectures.register("spacy.Tok2VecListener.v1")
+@registry.architectures("spacy.Tok2VecListener.v1")
 def tok2vec_listener_v1(width: int, upstream: str = "*"):
    tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
    return tok2vec
@ -31,7 +31,7 @@ def get_tok2vec_width(model: Model):
    return nO


-@registry.architectures.register("spacy.HashEmbedCNN.v1")
+@registry.architectures("spacy.HashEmbedCNN.v2")
 def build_hash_embed_cnn_tok2vec(
    *,
    width: int,
@ -87,7 +87,7 @@ def build_hash_embed_cnn_tok2vec(
    )


-@registry.architectures.register("spacy.Tok2Vec.v2")
+@registry.architectures("spacy.Tok2Vec.v2")
 def build_Tok2Vec_model(
    embed: Model[List[Doc], List[Floats2d]],
    encode: Model[List[Floats2d], List[Floats2d]],
@ -108,7 +108,7 @@ def build_Tok2Vec_model(
    return tok2vec


-@registry.architectures.register("spacy.MultiHashEmbed.v1")
+@registry.architectures("spacy.MultiHashEmbed.v2")
 def MultiHashEmbed(
    width: int,
    attrs: List[Union[str, int]],
@ -182,7 +182,7 @@ def MultiHashEmbed(
    return model


-@registry.architectures.register("spacy.CharacterEmbed.v1")
+@registry.architectures("spacy.CharacterEmbed.v2")
 def CharacterEmbed(
    width: int,
    rows: int,
@ -255,7 +255,7 @@ def CharacterEmbed(
    return model


-@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
+@registry.architectures("spacy.MaxoutWindowEncoder.v2")
 def MaxoutWindowEncoder(
    width: int, window_size: int, maxout_pieces: int, depth: int
 ) -> Model[List[Floats2d], List[Floats2d]]:
@ -287,7 +287,7 @@ def MaxoutWindowEncoder(
    return with_array(model, pad=receptive_field)


-@registry.architectures.register("spacy.MishWindowEncoder.v2")
+@registry.architectures("spacy.MishWindowEncoder.v2")
 def MishWindowEncoder(
    width: int, window_size: int, depth: int
 ) -> Model[List[Floats2d], List[Floats2d]]:
@ -310,7 +310,7 @@ def MishWindowEncoder(
    return with_array(model)


-@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
+@registry.architectures("spacy.TorchBiLSTMEncoder.v1")
 def BiLSTMEncoder(
    width: int, depth: int, dropout: float
 ) -> Model[List[Floats2d], List[Floats2d]]:
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@ -8,7 +8,7 @@ from ..tokens import Doc
 from ..errors import Errors


-@registry.layers("spacy.StaticVectors.v1")
+@registry.layers("spacy.StaticVectors.v2")
 def StaticVectors(
    nO: Optional[int] = None,
    nM: Optional[int] = None,
@ -38,7 +38,7 @@ def forward(
        return _handle_empty(model.ops, model.get_dim("nO"))
    key_attr = model.attrs["key_attr"]
    W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
-    V = cast(Floats2d, docs[0].vocab.vectors.data)
+    V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data))
    rows = model.ops.flatten(
        [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
    )
@ -46,6 +46,8 @@ def forward(
        vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
    except ValueError:
        raise RuntimeError(Errors.E896)
+    # Convert negative indices to 0-vectors (TODO: more options for UNK tokens)
+    vectors_data[rows < 0] = 0
    output = Ragged(
        vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i")
    )
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@ -247,7 +247,7 @@ cdef class BiluoPushDown(TransitionSystem):
        for i in range(state.c._ents.size()):
            ent = state.c._ents.at(i)
            if ent.start != -1 and ent.end != -1:
-                ents.append(Span(doc, ent.start, ent.end, label=ent.label))
+                ents.append(Span(doc, ent.start, ent.end, label=ent.label, kb_id=doc.c[ent.start].ent_kb_id))
        doc.set_ents(ents, default="unmodified")
        # Set non-blocked tokens to O
        for i in range(doc.length):
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@ -24,7 +24,7 @@ maxout_pieces = 2
 use_upper = true

 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@ -26,7 +26,7 @@ default_model_config = """
@architectures = "spacy.EntityLinker.v1"

 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 2
@ -300,77 +300,77 @@ class EntityLinker(TrainablePipe):
        for i, doc in enumerate(docs):
            sentences = [s for s in doc.sents]
            if len(doc) > 0:
-                # Looping through each sentence and each entity
-                # This may go wrong if there are entities across sentences - which shouldn't happen normally.
-                for sent_index, sent in enumerate(sentences):
-                    if sent.ents:
-                        # get n_neightbour sentences, clipped to the length of the document
-                        start_sentence = max(0, sent_index - self.n_sents)
-                        end_sentence = min(
-                            len(sentences) - 1, sent_index + self.n_sents
-                        )
-                        start_token = sentences[start_sentence].start
-                        end_token = sentences[end_sentence].end
-                        sent_doc = doc[start_token:end_token].as_doc()
-                        # currently, the context is the same for each entity in a sentence (should be refined)
-                        xp = self.model.ops.xp
-                        if self.incl_context:
-                            sentence_encoding = self.model.predict([sent_doc])[0]
-                            sentence_encoding_t = sentence_encoding.T
-                            sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                        for ent in sent.ents:
-                            entity_count += 1
-                            if ent.label_ in self.labels_discard:
-                                # ignoring this entity - setting to NIL
-                                final_kb_ids.append(self.NIL)
-                            else:
-                                candidates = self.get_candidates(self.kb, ent)
-                                if not candidates:
-                                    # no prediction possible for this entity - setting to NIL
-                                    final_kb_ids.append(self.NIL)
-                                elif len(candidates) == 1:
-                                    # shortcut for efficiency reasons: take the 1 candidate
-                                    # TODO: thresholding
-                                    final_kb_ids.append(candidates[0].entity_)
-                                else:
-                                    random.shuffle(candidates)
-                                    # set all prior probabilities to 0 if incl_prior=False
-                                    prior_probs = xp.asarray(
-                                        [c.prior_prob for c in candidates]
+                # Looping through each entity (TODO: rewrite)
+                for ent in doc.ents:
+                    sent = ent.sent
+                    sent_index = sentences.index(sent)
+                    assert sent_index >= 0
+                    # get n_neighbour sentences, clipped to the length of the document
+                    start_sentence = max(0, sent_index - self.n_sents)
+                    end_sentence = min(
+                        len(sentences) - 1, sent_index + self.n_sents
+                    )
+                    start_token = sentences[start_sentence].start
+                    end_token = sentences[end_sentence].end
+                    sent_doc = doc[start_token:end_token].as_doc()
+                    # currently, the context is the same for each entity in a sentence (should be refined)
+                    xp = self.model.ops.xp
+                    if self.incl_context:
+                        sentence_encoding = self.model.predict([sent_doc])[0]
+                        sentence_encoding_t = sentence_encoding.T
+                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
+                    entity_count += 1
+                    if ent.label_ in self.labels_discard:
+                        # ignoring this entity - setting to NIL
+                        final_kb_ids.append(self.NIL)
+                    else:
+                        candidates = self.get_candidates(self.kb, ent)
+                        if not candidates:
+                            # no prediction possible for this entity - setting to NIL
+                            final_kb_ids.append(self.NIL)
+                        elif len(candidates) == 1:
+                            # shortcut for efficiency reasons: take the 1 candidate
+                            # TODO: thresholding
+                            final_kb_ids.append(candidates[0].entity_)
+                        else:
+                            random.shuffle(candidates)
+                            # set all prior probabilities to 0 if incl_prior=False
+                            prior_probs = xp.asarray(
+                                [c.prior_prob for c in candidates]
+                            )
+                            if not self.incl_prior:
+                                prior_probs = xp.asarray(
+                                    [0.0 for _ in candidates]
+                                )
+                            scores = prior_probs
+                            # add in similarity from the context
+                            if self.incl_context:
+                                entity_encodings = xp.asarray(
+                                    [c.entity_vector for c in candidates]
+                                )
+                                entity_norm = xp.linalg.norm(
+                                    entity_encodings, axis=1
+                                )
+                                if len(entity_encodings) != len(prior_probs):
+                                    raise RuntimeError(
+                                        Errors.E147.format(
+                                            method="predict",
+                                            msg="vectors not of equal length",
+                                        )
                                    )
-                                    if not self.incl_prior:
-                                        prior_probs = xp.asarray(
-                                            [0.0 for _ in candidates]
-                                        )
-                                    scores = prior_probs
-                                    # add in similarity from the context
-                                    if self.incl_context:
-                                        entity_encodings = xp.asarray(
-                                            [c.entity_vector for c in candidates]
-                                        )
-                                        entity_norm = xp.linalg.norm(
-                                            entity_encodings, axis=1
-                                        )
-                                        if len(entity_encodings) != len(prior_probs):
-                                            raise RuntimeError(
-                                                Errors.E147.format(
-                                                    method="predict",
-                                                    msg="vectors not of equal length",
-                                                )
-                                            )
-                                        # cosine similarity
-                                        sims = xp.dot(
-                                            entity_encodings, sentence_encoding_t
-                                        ) / (sentence_norm * entity_norm)
-                                        if sims.shape != prior_probs.shape:
-                                            raise ValueError(Errors.E161)
-                                        scores = (
-                                            prior_probs + sims - (prior_probs * sims)
-                                        )
-                                    # TODO: thresholding
-                                    best_index = scores.argmax().item()
-                                    best_candidate = candidates[best_index]
-                                    final_kb_ids.append(best_candidate.entity_)
+                                # cosine similarity
+                                sims = xp.dot(
+                                    entity_encodings, sentence_encoding_t
+                                ) / (sentence_norm * entity_norm)
+                                if sims.shape != prior_probs.shape:
+                                    raise ValueError(Errors.E161)
+                                scores = (
+                                    prior_probs + sims - (prior_probs * sims)
+                                )
+                            # TODO: thresholding
+                            best_index = scores.argmax().item()
+                            best_candidate = candidates[best_index]
+                            final_kb_ids.append(best_candidate.entity_)
        if not (len(final_kb_ids) == entity_count):
            err = Errors.E147.format(
                method="predict", msg="result variables not of equal length"
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -194,7 +194,7 @@ class EntityRuler(Pipe):
                all_labels.add(label)
            else:
                all_labels.add(l)
-        return tuple(all_labels)
+        return tuple(sorted(all_labels))

    def initialize(
        self,
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@ -175,7 +175,7 @@ class Lemmatizer(Pipe):

        DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
        """
-        cache_key = (token.orth, token.pos, token.morph)
+        cache_key = (token.orth, token.pos, token.morph.key)
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -27,7 +27,7 @@ default_model_config = """
@architectures = "spacy.Tok2Vec.v2"

 [model.tok2vec.embed]
-@architectures = "spacy.CharacterEmbed.v1"
+@architectures = "spacy.CharacterEmbed.v2"
 width = 128
 rows = 7000
 nM = 64
@ -137,6 +137,7 @@ class Morphologizer(Tagger):
        DOCS: https://spacy.io/api/morphologizer#initialize
        """
        validate_get_examples(get_examples, "Morphologizer.initialize")
+        util.check_lexeme_norms(self.vocab, "morphologizer")
        if labels is not None:
            self.cfg["labels_morph"] = labels["morph"]
            self.cfg["labels_pos"] = labels["pos"]
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@ -22,7 +22,7 @@ maxout_pieces = 3
 token_vector_width = 96

 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@ -21,7 +21,7 @@ maxout_pieces = 2
 use_upper = true

 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@ -19,7 +19,7 @@ default_model_config = """
@architectures = "spacy.Tagger.v1"

 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 12
 depth = 1
@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
        DOCS: https://spacy.io/api/sentencerecognizer#initialize
        """
        validate_get_examples(get_examples, "SentenceRecognizer.initialize")
+        util.check_lexeme_norms(self.vocab, "senter")
        doc_sample = []
        label_sample = []
        assert self.labels, Errors.E924.format(name=self.name)
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -26,7 +26,7 @@ default_model_config = """
@architectures = "spacy.Tagger.v1"

 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
        DOCS: https://spacy.io/api/tagger#initialize
        """
        validate_get_examples(get_examples, "Tagger.initialize")
+        util.check_lexeme_norms(self.vocab, "tagger")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -21,7 +21,7 @@ single_label_default_config = """
@architectures = "spacy.Tok2Vec.v2"

 [model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
 rows = [2000, 2000, 1000, 1000, 1000, 1000]
 attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -56,7 +56,7 @@ single_label_cnn_config = """
 exclusive_classes = true

 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
@ -88,11 +88,9 @@ subword_features = true
 def make_textcat(
    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
 ) -> "TextCategorizer":
-    """Create a TextCategorizer compoment. The text categorizer predicts categories
-    over a whole document. It can learn one or more labels, and the labels can
-    be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
-    (i.e. zero or more labels may be true per doc). The multi-label setting is
-    controlled by the model instance that's provided.
+    """Create a TextCategorizer component. The text categorizer predicts categories
+    over a whole document. It can learn one or more labels, and the labels are considered
+    to be mutually exclusive (i.e. one true label per doc).

    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        scores for each category.
@ -317,9 +315,11 @@ class TextCategorizer(TrainablePipe):
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
-        labels: The labels to add to the component, typically generated by the
+        labels (Optional[Iterable[str]]): The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.
+        positive_label (Optional[str]): The positive label for a binary task with exclusive classes,
+            `None` otherwise and by default.

        DOCS: https://spacy.io/api/textcategorizer#initialize
        """
@ -358,13 +358,13 @@ class TextCategorizer(TrainablePipe):
        """
        validate_examples(examples, "TextCategorizer.score")
        self._validate_categories(examples)
+        kwargs.setdefault("threshold", self.cfg["threshold"])
+        kwargs.setdefault("positive_label", self.cfg["positive_label"])
        return Scorer.score_cats(
            examples,
            "cats",
            labels=self.labels,
            multi_label=False,
-            positive_label=self.cfg["positive_label"],
-            threshold=self.cfg["threshold"],
            **kwargs,
        )

--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@ -21,7 +21,7 @@ multi_label_default_config = """
@architectures = "spacy.Tok2Vec.v1"

 [model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
 rows = [2000, 2000, 1000, 1000, 1000, 1000]
 attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -56,7 +56,7 @@ multi_label_cnn_config = """
 exclusive_classes = false

 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
@ -88,11 +88,10 @@ subword_features = true
 def make_multilabel_textcat(
    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
 ) -> "TextCategorizer":
-    """Create a TextCategorizer compoment. The text categorizer predicts categories
-    over a whole document. It can learn one or more labels, and the labels can
-    be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
-    (i.e. zero or more labels may be true per doc). The multi-label setting is
-    controlled by the model instance that's provided.
+    """Create a TextCategorizer component. The text categorizer predicts categories
+    over a whole document. It can learn one or more labels, and the labels are considered
+    to be non-mutually exclusive, which means that there can be zero or more labels
+    per doc.

    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        scores for each category.
@ -104,7 +103,7 @@ def make_multilabel_textcat(
 class MultiLabel_TextCategorizer(TextCategorizer):
    """Pipeline component for multi-label text classification.

-    DOCS: https://spacy.io/api/multilabel_textcategorizer
+    DOCS: https://spacy.io/api/textcategorizer
    """

    def __init__(
@ -123,7 +122,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
            losses during training.
        threshold (float): Cutoff to consider a prediction "positive".

-        DOCS: https://spacy.io/api/multilabel_textcategorizer#init
+        DOCS: https://spacy.io/api/textcategorizer#init
        """
        self.vocab = vocab
        self.model = model
@ -149,7 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.

-        DOCS: https://spacy.io/api/multilabel_textcategorizer#initialize
+        DOCS: https://spacy.io/api/textcategorizer#initialize
        """
        validate_get_examples(get_examples, "MultiLabel_TextCategorizer.initialize")
        if labels is None:
@ -173,15 +172,15 @@ class MultiLabel_TextCategorizer(TextCategorizer):
        examples (Iterable[Example]): The examples to score.
        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.

-        DOCS: https://spacy.io/api/multilabel_textcategorizer#score
+        DOCS: https://spacy.io/api/textcategorizer#score
        """
        validate_examples(examples, "MultiLabel_TextCategorizer.score")
+        kwargs.setdefault("threshold", self.cfg["threshold"])
        return Scorer.score_cats(
            examples,
            "cats",
            labels=self.labels,
            multi_label=True,
-            threshold=self.cfg["threshold"],
            **kwargs,
        )

--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@ -11,7 +11,7 @@ from ..errors import Errors

 default_model_config = """
 [model]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):

    def initialize(self, get_examples, nlp=None, labels=None):
        validate_get_examples(get_examples, "Parser.initialize")
-        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
-        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
-            langs = ", ".join(util.LEXEME_NORM_LANGS)
-            util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
+        util.check_lexeme_norms(self.vocab, "parser or NER")
        if labels is not None:
            actions = dict(labels)
        else:
--- a/spacy/py.typed
+++ b/spacy/py.typed
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -20,10 +20,16 @@ MISSING_VALUES = frozenset([None, 0, ""])
 class PRFScore:
    """A precision / recall / F score."""

-    def __init__(self) -> None:
-        self.tp = 0
-        self.fp = 0
-        self.fn = 0
+    def __init__(
+        self,
+        *,
+        tp: int = 0,
+        fp: int = 0,
+        fn: int = 0,
+    ) -> None:
+        self.tp = tp
+        self.fp = fp
+        self.fn = fn

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn
@ -305,6 +311,8 @@ class Scorer:
        *,
        getter: Callable[[Doc, str], Iterable[Span]] = getattr,
        has_annotation: Optional[Callable[[Doc], bool]] = None,
+        labeled: bool = True,
+        allow_overlap: bool = False,
        **cfg,
    ) -> Dict[str, Any]:
        """Returns PRF scores for labeled spans.
@ -317,6 +325,11 @@ class Scorer:
        has_annotation (Optional[Callable[[Doc], bool]]) should return whether a `Doc`
            has annotation for this `attr`. Docs without annotation are skipped for
            scoring purposes.
+        labeled (bool): Whether or not to include label information in
+            the evaluation. If set to 'False', two spans will be considered
+            equal if their start and end match, irrespective of their label.
+        allow_overlap (bool): Whether or not to allow overlapping spans.
+            If set to 'False', the alignment will automatically resolve conflicts.
        RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
            the keys attr_p/r/f and the per-type PRF scores under attr_per_type.

@ -345,33 +358,42 @@ class Scorer:
            gold_spans = set()
            pred_spans = set()
            for span in getter(gold_doc, attr):
-                gold_span = (span.label_, span.start, span.end - 1)
+                if labeled:
+                    gold_span = (span.label_, span.start, span.end - 1)
+                else:
+                    gold_span = (span.start, span.end - 1)
                gold_spans.add(gold_span)
-                gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
+                gold_per_type[span.label_].add(gold_span)
            pred_per_type = {label: set() for label in labels}
-            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
-                pred_spans.add((span.label_, span.start, span.end - 1))
-                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
+            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr), allow_overlap):
+                if labeled:
+                    pred_span = (span.label_, span.start, span.end - 1)
+                else:
+                    pred_span = (span.start, span.end - 1)
+                pred_spans.add(pred_span)
+                pred_per_type[span.label_].add(pred_span)
            # Scores per label
-            for k, v in score_per_type.items():
-                if k in pred_per_type:
-                    v.score_set(pred_per_type[k], gold_per_type[k])
+            if labeled:
+                for k, v in score_per_type.items():
+                    if k in pred_per_type:
+                        v.score_set(pred_per_type[k], gold_per_type[k])
            # Score for all labels
            score.score_set(pred_spans, gold_spans)
-        if len(score) > 0:
-            return {
-                f"{attr}_p": score.precision,
-                f"{attr}_r": score.recall,
-                f"{attr}_f": score.fscore,
-                f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-            }
-        else:
-            return {
+        # Assemble final result
+        final_scores = {
                f"{attr}_p": None,
                f"{attr}_r": None,
                f"{attr}_f": None,
-                f"{attr}_per_type": None,
            }
+        if labeled:
+            final_scores[f"{attr}_per_type"] = None
+        if len(score) > 0:
+            final_scores[f"{attr}_p"] = score.precision
+            final_scores[f"{attr}_r"] = score.recall
+            final_scores[f"{attr}_f"] = score.fscore
+            if labeled:
+                final_scores[f"{attr}_per_type"] = {k: v.to_dict() for k, v in score_per_type.items()}
+        return final_scores

    @staticmethod
    def score_clusters(
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -223,7 +223,7 @@ cdef class StringStore:
            it doesn't exist. Paths may be either strings or Path-like objects.
        """
        path = util.ensure_path(path)
-        strings = list(self)
+        strings = sorted(self)
        srsly.write_json(path, strings)

    def from_disk(self, path):
@ -247,7 +247,7 @@ cdef class StringStore:

        RETURNS (bytes): The serialized form of the `StringStore` object.
        """
-        return srsly.json_dumps(list(self))
+        return srsly.json_dumps(sorted(self))

    def from_bytes(self, bytes_data, **kwargs):
        """Load state from a binary string.
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -6,12 +6,14 @@ import logging
 import mock

 from spacy.lang.xx import MultiLanguage
-from spacy.tokens import Doc, Span
+from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
 from spacy.lexeme import Lexeme
 from spacy.lang.en import English
 from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH

+from .test_underscore import clean_underscore  # noqa: F401
+

 def test_doc_api_init(en_vocab):
    words = ["a", "b", "c", "d"]
@ -347,15 +349,19 @@ def test_doc_from_array_morph(en_vocab):
    assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]


+@pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
    en_texts_without_empty = [t for t in en_texts if len(t)]
    de_text = "Wie war die Frage?"
    en_docs = [en_tokenizer(text) for text in en_texts]
-    docs_idx = en_texts[0].index("docs")
+    en_docs[0].spans["group"] = [en_docs[0][1:4]]
+    en_docs[2].spans["group"] = [en_docs[2][1:4]]
+    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
    de_doc = de_tokenizer(de_text)
-    expected = (True, None, None, None)
-    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected
+    Token.set_extension("is_ambiguous", default=False)
+    en_docs[0][2]._.is_ambiguous = True # docs
+    en_docs[2][3]._.is_ambiguous = True # think
    assert Doc.from_docs([]) is None
    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))
@ -372,11 +378,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
+    assert m_doc[2]._.is_ambiguous == True
    assert m_doc[9].idx == think_idx
-    with pytest.raises(AttributeError):
-        # not callable, because it was not set via set_extension
-        m_doc[2]._.is_ambiguous
-    assert len(m_doc.user_data) == len(en_docs[0].user_data)  # but it's there
+    assert m_doc[9]._.is_ambiguous == True
+    assert not any([t._.is_ambiguous for t in m_doc[3:8]])
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])

    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
@ -388,6 +395,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])

    m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
@ -399,6 +408,11 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+
+    # can merge empty docs
+    doc = Doc.from_docs([en_tokenizer("")] * 10)


 def test_doc_api_from_docs_ents(en_tokenizer):
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -452,3 +452,30 @@ def test_retokenize_disallow_zero_length(en_vocab):
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[1:1])
+
+
+def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer):
+    text = "displaCy is a parse tool built with Javascript"
+    sent_starts = [1, 0, 0, 0, 1, 0, 0, 0]
+    tokens = en_tokenizer(text)
+
+    # merging within a sentence keeps all sentence boundaries
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[1:3])
+    assert len(list(doc.sents)) == 2
+
+    # merging over a sentence boundary unsets it by default
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6])
+    assert doc[3].is_sent_start == None
+
+    # merging over a sentence boundary and setting sent_start
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6], attrs={"sent_start": True})
+    assert len(list(doc.sents)) == 2
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -1,9 +1,11 @@
 import pytest
 from spacy.attrs import ORTH, LENGTH
-from spacy.tokens import Doc, Span
+from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
 from spacy.util import filter_spans

+from .test_underscore import clean_underscore  # noqa: F401
+

@pytest.fixture
 def doc(en_tokenizer):
@ -12,9 +14,11 @@ def doc(en_tokenizer):
    heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
            "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
+    ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
+            "O", "O", "O", "O", "O"]
    # fmt: on
    tokens = en_tokenizer(text)
-    return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, ents=ents)


@pytest.fixture
@ -218,12 +222,26 @@ def test_span_as_doc(doc):
    assert span_doc is not doc
    assert span_doc[0].idx == 0

+    # partial initial entity is removed
+    assert len(span_doc.ents) == 0

+    # full entity is preserved
+    span_doc = doc[2:10].as_doc()
+    assert len(span_doc.ents) == 1
+
+    # partial final entity is removed
+    span_doc = doc[0:5].as_doc()
+    assert len(span_doc.ents) == 0
+
+
+@pytest.mark.usefixtures("clean_underscore")
 def test_span_as_doc_user_data(doc):
    """Test that the user_data can be preserved (but not by default). """
    my_key = "my_info"
    my_value = 342
    doc.user_data[my_key] = my_value
+    Token.set_extension("is_x", default=False)
+    doc[7]._.is_x = True

    span = doc[4:10]
    span_doc_with = span.as_doc(copy_user_data=True)
@ -232,6 +250,12 @@ def test_span_as_doc_user_data(doc):
    assert doc.user_data.get(my_key, None) is my_value
    assert span_doc_with.user_data.get(my_key, None) is my_value
    assert span_doc_without.user_data.get(my_key, None) is None
+    for i in range(len(span_doc_with)):
+        if i != 3:
+            assert span_doc_with[i]._.is_x is False
+        else:
+            assert span_doc_with[i]._.is_x is True
+    assert not any([t._.is_x for t in span_doc_without])


 def test_span_string_label_kb_id(doc):
--- a/spacy/tests/enable_gpu.py
+++ b/spacy/tests/enable_gpu.py
@ -0,0 +1,3 @@
+from spacy import require_gpu
+
+require_gpu()
--- a/spacy/tests/lang/en/test_text.py
+++ b/spacy/tests/lang/en/test_text.py
@ -56,7 +56,9 @@ def test_lex_attrs_like_number(en_tokenizer, text, match):
    assert tokens[0].like_num == match


-@pytest.mark.parametrize("word", ["third", "Millionth", "100th", "Hundredth"])
+@pytest.mark.parametrize(
+    "word", ["third", "Millionth", "100th", "Hundredth", "23rd", "52nd"]
+)
 def test_en_lex_attrs_like_number_for_ordinal(word):
    assert like_num(word)

--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@ -4,7 +4,9 @@ import re
 import copy
 from mock import Mock
 from spacy.matcher import DependencyMatcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token
+
+from ..doc.test_underscore import clean_underscore  # noqa: F401


@pytest.fixture
@ -344,3 +346,26 @@ def test_dependency_matcher_long_matches(en_vocab, doc):
    matcher = DependencyMatcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("pattern", [pattern])
+
+
+@pytest.mark.usefixtures("clean_underscore")
+def test_dependency_matcher_span_user_data(en_tokenizer):
+    doc = en_tokenizer("a b c d e")
+    for token in doc:
+        token.head = doc[0]
+        token.dep_ = "a"
+    get_is_c = lambda token: token.text in ("c",)
+    Token.set_extension("is_c", default=False)
+    doc[2]._.is_c = True
+    pattern = [
+        {"RIGHT_ID": "c", "RIGHT_ATTRS": {"_": {"is_c": True}}},
+    ]
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    matcher.add("C", [pattern])
+    doc_matches = matcher(doc)
+    offset = 1
+    span_matches = matcher(doc[offset:])
+    for doc_match, span_match in zip(sorted(doc_matches), sorted(span_matches)):
+        assert doc_match[0] == span_match[0]
+        for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
+            assert doc_t_i == span_t_i + offset
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -513,6 +513,12 @@ def test_matcher_as_spans(matcher):
    assert matches[1].text == "Java"
    assert matches[1].label_ == "Java"

+    matches = matcher(doc[1:], as_spans=True)
+    assert len(matches) == 1
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "Java"
+    assert matches[0].label_ == "Java"
+

 def test_matcher_deprecated(matcher):
    doc = Doc(matcher.vocab, words=["hello", "world"])
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@ -204,3 +204,90 @@ def test_matcher_remove():
    # removing again should throw an error
    with pytest.raises(ValueError):
        matcher.remove("Rule")
+
+
+def test_matcher_with_alignments_greedy_longest(en_vocab):
+    cases = [
+        ("aaab", "a* b", [0, 0, 0, 1]),
+        ("baab", "b a* b", [0, 1, 1, 2]),
+        ("aaab", "a a a b", [0, 1, 2, 3]),
+        ("aaab", "a+ b", [0, 0, 0, 1]),
+        ("aaba", "a+ b a+", [0, 0, 1, 2]),
+        ("aabaa", "a+ b a+", [0, 0, 1, 2, 2]),
+        ("aaba", "a+ b a*", [0, 0, 1, 2]),
+        ("aaaa", "a*", [0, 0, 0, 0]),
+        ("baab", "b a* b b*", [0, 1, 1, 2]),
+        ("aabb", "a* b* a*", [0, 0, 1, 1]),
+        ("aaab", "a+ a+ a b", [0, 1, 2, 3]),
+        ("aaab", "a+ a+ a+ b", [0, 1, 2, 3]),
+        ("aaab", "a+ a a b", [0, 1, 2, 3]),
+        ("aaab", "a+ a a", [0, 1, 2]),
+        ("aaab", "a+ a a?", [0, 1, 2]),
+        ("aaaa", "a a a a a?", [0, 1, 2, 3]),
+        ("aaab", "a+ a b", [0, 0, 1, 2]),
+        ("aaab", "a+ a+ b", [0, 0, 1, 2]),
+    ]
+    for string, pattern_str, result in cases:
+        matcher = Matcher(en_vocab)
+        doc = Doc(matcher.vocab, words=list(string))
+        pattern = []
+        for part in pattern_str.split():
+            if part.endswith("+"):
+                pattern.append({"ORTH": part[0], "OP": "+"})
+            elif part.endswith("*"):
+                pattern.append({"ORTH": part[0], "OP": "*"})
+            elif part.endswith("?"):
+                pattern.append({"ORTH": part[0], "OP": "?"})
+            else:
+                pattern.append({"ORTH": part})
+        matcher.add("PATTERN", [pattern], greedy="LONGEST")
+        matches = matcher(doc, with_alignments=True)
+        n_matches = len(matches)
+
+        _, s, e, expected = matches[0]
+
+        assert expected == result, (string, pattern_str, s, e, n_matches)
+
+
+def test_matcher_with_alignments_nongreedy(en_vocab):
+    cases = [
+        (0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
+        (1, "baab", "b a* b", [[0, 1, 1, 2]]),
+        (2, "aaab", "a a a b", [[0, 1, 2, 3]]),
+        (3, "aaab", "a+ b",   [[0, 1], [0, 0, 1], [0, 0, 0, 1]]),
+        (4, "aaba", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2]]),
+        (5, "aabaa", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2] ]),
+        (6, "aaba", "a+ b a*", [[0, 1], [0, 0, 1], [0, 0, 1, 2], [0, 1, 2]]),
+        (7, "aaaa", "a*", [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0]]),
+        (8, "baab", "b a* b b*", [[0, 1, 1, 2]]),
+        (9, "aabb", "a* b* a*", [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]]),
+        (10, "aaab", "a+ a+ a b", [[0, 1, 2, 3]]),
+        (11, "aaab", "a+ a+ a+ b", [[0, 1, 2, 3]]),
+        (12, "aaab", "a+ a a b", [[0, 1, 2, 3]]),
+        (13, "aaab", "a+ a a", [[0, 1, 2]]),
+        (14, "aaab", "a+ a a?", [[0, 1], [0, 1, 2]]),
+        (15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
+        (16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
+        (17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
+    ]
+    for case_id, string, pattern_str, results in cases:
+        matcher = Matcher(en_vocab)
+        doc = Doc(matcher.vocab, words=list(string))
+        pattern = []
+        for part in pattern_str.split():
+            if part.endswith("+"):
+                pattern.append({"ORTH": part[0], "OP": "+"})
+            elif part.endswith("*"):
+                pattern.append({"ORTH": part[0], "OP": "*"})
+            elif part.endswith("?"):
+                pattern.append({"ORTH": part[0], "OP": "?"})
+            else:
+                pattern.append({"ORTH": part})
+
+        matcher.add("PATTERN", [pattern])
+        matches = matcher(doc, with_alignments=True)
+        n_matches = len(matches)
+
+        for _, s, e, expected in matches:
+            assert expected in results, (case_id, string, pattern_str, s, e, n_matches)
+            assert len(expected) == e - s
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@ -6,15 +6,14 @@ def test_build_dependencies():
    # Check that library requirements are pinned exactly the same across different setup files.
    # TODO: correct checks for numpy rather than ignoring
    libs_ignore_requirements = [
-        "numpy",
        "pytest",
        "pytest-timeout",
        "mock",
        "flake8",
+        "hypothesis",
    ]
    # ignore language-specific packages that shouldn't be installed by all
    libs_ignore_setup = [
-        "numpy",
        "fugashi",
        "natto-py",
        "pythainlp",
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@ -8,7 +8,7 @@ from spacy.language import Language
 from spacy.lookups import Lookups
 from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.training import Example
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 import logging

@ -358,6 +358,26 @@ def test_overfitting_IO(use_upper):
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

+    # test that kb_id is preserved
+    test_text = "I like London and London."
+    doc = nlp.make_doc(test_text)
+    doc.ents = [Span(doc, 2, 3, label="LOC", kb_id=1234)]
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
+    assert ents[0].kb_id == 1234
+    doc = nlp.get_pipe("ner")(doc)
+    ents = doc.ents
+    assert len(ents) == 2
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
+    assert ents[0].kb_id == 1234
+    # ent added by ner has kb_id == 0
+    assert ents[1].text == "London"
+    assert ents[1].label_ == "LOC"
+    assert ents[1].kb_id == 0
+

 def test_beam_ner_scores():
    # Test that we can get confidence values out of the beam_ner pipe
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -230,7 +230,7 @@ def test_el_pipe_configuration(nlp):
    def get_lowercased_candidates(kb, span):
        return kb.get_alias_candidates(span.text.lower())

-    @registry.misc.register("spacy.LowercaseCandidateGenerator.v1")
+    @registry.misc("spacy.LowercaseCandidateGenerator.v1")
    def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
        return get_lowercased_candidates

--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@ -5,6 +5,7 @@ from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler
 from spacy.errors import MatchPatternError
+from thinc.api import NumpyOps, get_current_ops


@pytest.fixture
@ -201,13 +202,14 @@ def test_entity_ruler_overlapping_spans(nlp):

@pytest.mark.parametrize("n_process", [1, 2])
 def test_entity_ruler_multiprocessing(nlp, n_process):
-    texts = ["I enjoy eating Pizza Hut pizza."]
+    if isinstance(get_current_ops(), NumpyOps) or n_process < 2:  # only use >1 process with NumpyOps
+        texts = ["I enjoy eating Pizza Hut pizza."]

-    patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]
+        patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]

-    ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
+        ruler = nlp.add_pipe("entity_ruler")
+        ruler.add_patterns(patterns)

-    for doc in nlp.pipe(texts, n_process=2):
-        for ent in doc.ents:
-            assert ent.ent_id_ == "1234"
+        for doc in nlp.pipe(texts, n_process=n_process):  # honor the parametrized process count
+            for ent in doc.ents:
+                assert ent.ent_id_ == "1234"
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@ -1,6 +1,7 @@
 import pytest
 import logging
 import mock
+import pickle
 from spacy import util, registry
 from spacy.lang.en import English
 from spacy.lookups import Lookups
@ -106,6 +107,9 @@ def test_lemmatizer_serialize(nlp):
    doc2 = nlp2.make_doc("coping")
    doc2[0].pos_ = "VERB"
    assert doc2[0].lemma_ == ""
-    doc2 = lemmatizer(doc2)
+    doc2 = lemmatizer2(doc2)
    assert doc2[0].text == "coping"
    assert doc2[0].lemma_ == "cope"
+
+    # Make sure that lemmatizer cache can be pickled
+    b = pickle.dumps(lemmatizer2)
--- a/spacy/tests/pipeline/test_models.py
+++ b/spacy/tests/pipeline/test_models.py
@ -4,7 +4,7 @@ import numpy
 import pytest
 from numpy.testing import assert_almost_equal
 from spacy.vocab import Vocab
-from thinc.api import NumpyOps, Model, data_validation
+from thinc.api import Model, data_validation, get_current_ops
 from thinc.types import Array2d, Ragged

 from spacy.lang.en import English
@ -13,7 +13,7 @@ from spacy.ml._character_embed import CharacterEmbed
 from spacy.tokens import Doc


-OPS = NumpyOps()
+OPS = get_current_ops()

 texts = ["These are 4 words", "Here just three"]
 l0 = [[1, 2], [3, 4], [5, 6], [7, 8]]
@ -82,7 +82,7 @@ def util_batch_unbatch_docs_list(
        Y_batched = model.predict(in_data)
        Y_not_batched = [model.predict([u])[0] for u in in_data]
        for i in range(len(Y_batched)):
-            assert_almost_equal(Y_batched[i], Y_not_batched[i], decimal=4)
+            assert_almost_equal(OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4)


 def util_batch_unbatch_docs_array(
@ -91,7 +91,7 @@ def util_batch_unbatch_docs_array(
    with data_validation(True):
        model.initialize(in_data, out_data)
        Y_batched = model.predict(in_data).tolist()
-        Y_not_batched = [model.predict([u])[0] for u in in_data]
+        Y_not_batched = [model.predict([u])[0].tolist() for u in in_data]
        assert_almost_equal(Y_batched, Y_not_batched, decimal=4)


@ -100,8 +100,8 @@ def util_batch_unbatch_docs_ragged(
 ):
    with data_validation(True):
        model.initialize(in_data, out_data)
-        Y_batched = model.predict(in_data)
+        Y_batched = model.predict(in_data).data.tolist()
        Y_not_batched = []
        for u in in_data:
            Y_not_batched.extend(model.predict([u]).data.tolist())
-        assert_almost_equal(Y_batched.data, Y_not_batched, decimal=4)
+        assert_almost_equal(Y_batched, Y_not_batched, decimal=4)
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@ -1,4 +1,6 @@
 import pytest
+import mock
+import logging
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
@ -402,6 +404,38 @@ def test_pipe_factories_from_source():
        nlp.add_pipe("custom", source=source_nlp)


+def test_pipe_factories_from_source_language_subclass():
+    class CustomEnglishDefaults(English.Defaults):
+        stop_words = set(["custom", "stop"])
+
+    @registry.languages("custom_en")
+    class CustomEnglish(English):
+        lang = "custom_en"
+        Defaults = CustomEnglishDefaults
+
+    source_nlp = English()
+    source_nlp.add_pipe("tagger")
+
+    # custom subclass of the source language: the sourced tagger must be added
+    nlp = CustomEnglish()
+    nlp.add_pipe("tagger", source=source_nlp)
+    assert "tagger" in nlp.pipe_names
+
+    # non-subclass (German): the sourced tagger must still be added
+    nlp = German()
+    nlp.add_pipe("tagger", source=source_nlp)
+    assert "tagger" in nlp.pipe_names
+
+    # mismatched vectors: sourcing must emit a warning on the "spacy" logger
+    nlp = English()
+    nlp.vocab.vectors.resize((1, 4))
+    nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
+    logger = logging.getLogger("spacy")
+    with mock.patch.object(logger, "warning") as mock_warning:
+        nlp.add_pipe("tagger", source=source_nlp)
+        mock_warning.assert_called()
+
+
 def test_pipe_factories_from_source_custom():
    """Test adding components from a source model with custom components."""
    name = "test_pipe_factories_from_source_custom"
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -1,7 +1,7 @@
 import pytest
 import random
 import numpy.random
-from numpy.testing import assert_equal
+from numpy.testing import assert_almost_equal
 from thinc.api import fix_random_seed
 from spacy import util
 from spacy.lang.en import English
@ -222,8 +222,12 @@ def test_overfitting_IO():
    batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
-    assert_equal(batch_cats_1, batch_cats_2)
-    assert_equal(batch_cats_1, no_batch_cats)
+    for cats_1, cats_2 in zip(batch_cats_1, batch_cats_2):
+        for cat in cats_1:
+            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
+    for cats_1, cats_2 in zip(batch_cats_1, no_batch_cats):
+        for cat in cats_1:
+            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)


 def test_overfitting_IO_multi():
@ -270,8 +274,12 @@ def test_overfitting_IO_multi():
    batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]]
-    assert_equal(batch_deps_1, batch_deps_2)
-    assert_equal(batch_deps_1, no_batch_deps)
+    for cats_1, cats_2 in zip(batch_deps_1, batch_deps_2):
+        for cat in cats_1:
+            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
+    for cats_1, cats_2 in zip(batch_deps_1, no_batch_deps):
+        for cat in cats_1:
+            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)


 # fmt: off
@ -370,3 +378,51 @@ def test_textcat_evaluation():

    assert scores["cats_micro_p"] == 4 / 5
    assert scores["cats_micro_r"] == 4 / 6
+
+
+def test_textcat_threshold():
+    # Ensure the scorer accepts a custom threshold and positive_label via scorer_cfg
+    nlp = English()
+    nlp.add_pipe("textcat")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    # score the model (it's not actually trained but that doesn't matter)
+    scores = nlp.evaluate(train_examples)
+    assert 0 <= scores["cats_score"] <= 1
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0})
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 0
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
+    macro_f = scores["cats_score"]
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"})
+    pos_f = scores["cats_score"]
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
+    assert pos_f > macro_f
+
+
+def test_textcat_multi_threshold():
+    # Ensure the scorer of the "textcat_multilabel" pipe accepts a custom threshold
+    nlp = English()
+    nlp.add_pipe("textcat_multilabel")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    # score the model (it's not actually trained but that doesn't matter)
+    scores = nlp.evaluate(train_examples)
+    assert 0 <= scores["cats_score"] <= 1
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0})
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 0
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@ -8,8 +8,8 @@ from spacy.tokens import Doc
 from spacy.training import Example
 from spacy import util
 from spacy.lang.en import English
-from thinc.api import Config
-from numpy.testing import assert_equal
+from thinc.api import Config, get_current_ops
+from numpy.testing import assert_array_equal

 from ..util import get_batch, make_tempdir

@ -160,7 +160,8 @@ def test_tok2vec_listener():

    doc = nlp("Running the pipeline as a whole.")
    doc_tensor = tagger_tok2vec.predict([doc])[0]
-    assert_equal(doc.tensor, doc_tensor)
+    ops = get_current_ops()
+    assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))

    # TODO: should this warn or error?
    nlp.select_pipes(disable="tok2vec")
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ b/spacy/tests/regression/test_issue4501-5000.py
@ -9,6 +9,7 @@ from spacy.language import Language
 from spacy.util import ensure_path, load_model_from_path
 import numpy
 import pickle
+from thinc.api import NumpyOps, get_current_ops

 from ..util import make_tempdir

@ -169,21 +170,22 @@ def test_issue4725_1():


 def test_issue4725_2():
-    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
-    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
-    # or because of issues with pickling the NER (cf test_issue4725_1)
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
-    data = numpy.ndarray((5, 3), dtype="f")
-    data[0] = 1.0
-    data[1] = 2.0
-    vocab.set_vector("cat", data[0])
-    vocab.set_vector("dog", data[1])
-    nlp = English(vocab=vocab)
-    nlp.add_pipe("ner")
-    nlp.initialize()
-    docs = ["Kurt is in London."] * 10
-    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
-        pass
+    if isinstance(get_current_ops(), NumpyOps):  # call it: the check must see the ops instance
+        # ensures that this runs correctly and doesn't hang or crash because of the global vectors
+        # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
+        # or because of issues with pickling the NER (cf test_issue4725_1)
+        vocab = Vocab(vectors_name="test_vocab_add_vector")
+        data = numpy.ndarray((5, 3), dtype="f")
+        data[0] = 1.0
+        data[1] = 2.0
+        vocab.set_vector("cat", data[0])
+        vocab.set_vector("dog", data[1])
+        nlp = English(vocab=vocab)
+        nlp.add_pipe("ner")
+        nlp.initialize()
+        docs = ["Kurt is in London."] * 10
+        for _ in nlp.pipe(docs, batch_size=2, n_process=2):
+            pass


 def test_issue4849():
@ -204,10 +206,11 @@ def test_issue4849():
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
    # USING 2 PROCESSES
-    count_ents = 0
-    for doc in nlp.pipe([text], n_process=2):
-        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
-    assert count_ents == 2
+    if isinstance(get_current_ops(), NumpyOps):  # call it: the check must see the ops instance
+        count_ents = 0
+        for doc in nlp.pipe([text], n_process=2):
+            count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+        assert count_ents == 2


@Language.factory("my_pipe")
@ -239,10 +242,11 @@ def test_issue4903():
    nlp.add_pipe("sentencizer")
    nlp.add_pipe("my_pipe", after="sentencizer")
    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
-    docs = list(nlp.pipe(text, n_process=2))
-    assert docs[0].text == "I like bananas."
-    assert docs[1].text == "Do you like them?"
-    assert docs[2].text == "No, I prefer wasabi."
+    if isinstance(get_current_ops(), NumpyOps):
+        docs = list(nlp.pipe(text, n_process=2))
+        assert docs[0].text == "I like bananas."
+        assert docs[1].text == "Do you like them?"
+        assert docs[2].text == "No, I prefer wasabi."


 def test_issue4924():
--- a/spacy/tests/regression/test_issue5001-5500.py
+++ b/spacy/tests/regression/test_issue5001-5500.py
@ -6,6 +6,7 @@ from spacy.language import Language
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.vocab import Vocab
 import spacy
+from thinc.api import get_current_ops
 import pytest

 from ...util import make_tempdir
@ -54,16 +55,17 @@ def test_issue5082():
    ruler.add_patterns(patterns)
    parsed_vectors_1 = [t.vector for t in nlp(text)]
    assert len(parsed_vectors_1) == 4
-    numpy.testing.assert_array_equal(parsed_vectors_1[0], array1)
-    numpy.testing.assert_array_equal(parsed_vectors_1[1], array2)
-    numpy.testing.assert_array_equal(parsed_vectors_1[2], array3)
-    numpy.testing.assert_array_equal(parsed_vectors_1[3], array4)
+    ops = get_current_ops()
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4)
    nlp.add_pipe("merge_entities")
    parsed_vectors_2 = [t.vector for t in nlp(text)]
    assert len(parsed_vectors_2) == 3
-    numpy.testing.assert_array_equal(parsed_vectors_2[0], array1)
-    numpy.testing.assert_array_equal(parsed_vectors_2[1], array2)
-    numpy.testing.assert_array_equal(parsed_vectors_2[2], array34)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34)


 def test_issue5137():
--- a/spacy/tests/regression/test_issue5501-6000.py
+++ b/spacy/tests/regression/test_issue5501-6000.py
@ -1,5 +1,6 @@
 import pytest
-from thinc.api import Config, fix_random_seed
+from numpy.testing import assert_almost_equal
+from thinc.api import Config, fix_random_seed, get_current_ops

 from spacy.lang.en import English
 from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config
@ -44,11 +45,12 @@ def test_issue5551(textcat_config):
        nlp.update([Example.from_dict(doc, annots)])
        # Store the result of each iteration
        result = pipe.model.predict([doc])
-        results.append(list(result[0]))
+        results.append(result[0])
    # All results should be the same because of the fixed seed
    assert len(results) == 3
-    assert results[0] == results[1]
-    assert results[0] == results[2]
+    ops = get_current_ops()
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))


 def test_issue5838():
--- a/spacy/tests/regression/test_issue7065.py
+++ b/spacy/tests/regression/test_issue7065.py
@ -1,4 +1,6 @@
+from spacy.kb import KnowledgeBase
 from spacy.lang.en import English
+from spacy.training import Example


 def test_issue7065():
@ -16,3 +18,58 @@ def test_issue7065():
    ent = doc.ents[0]
    assert ent.start < sent0.end < ent.end
    assert sentences.index(ent.sent) == 0
+
+
+def test_issue7065_b():
+    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
+    nlp = English()
+    vector_length = 3
+    nlp.add_pipe("sentencizer")
+
+    text = "Mahler 's Symphony No. 8 was beautiful."
+    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
+    links = {(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
+             (10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
+    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
+    doc = nlp(text)
+    example = Example.from_dict(doc, {"entities": entities, "links": links, "sent_starts": sent_starts})
+    train_examples = [example]
+
+    def create_kb(vocab):
+        # create artificial KB
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="No. 8",
+            entities=["Q270853"],
+            probabilities=[1.0],
+        )
+        mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_alias(
+            alias="Mahler",
+            entities=["Q7304"],
+            probabilities=[1.0],
+        )
+        return mykb
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker.set_kb(create_kb)
+
+    # train the NEL pipe
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(2):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    # Add a custom rule-based component to mimic NER
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
+        {"label": "WORK", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    # test the trained model - this should not throw E148
+    doc = nlp(text)
+    assert doc
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@ -4,7 +4,7 @@ import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
 from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
-from spacy.util import registry, load_model_from_config, load_config
+from spacy.util import registry, load_model_from_config, load_config, load_config_from_str
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
@ -160,7 +160,7 @@ subword_features = false
 """


-@registry.architectures.register("my_test_parser")
+@registry.architectures("my_test_parser")
 def my_parser():
    tok2vec = build_Tok2Vec_model(
        MultiHashEmbed(
@ -293,7 +293,7 @@ def test_serialize_parser(parser_config_string):


 def test_config_nlp_roundtrip():
-    """Test that a config prduced by the nlp object passes training config
+    """Test that a config produced by the nlp object passes training config
    validation."""
    nlp = English()
    nlp.add_pipe("entity_ruler")
@ -465,3 +465,32 @@ def test_config_only_resolve_relevant_blocks():
        nlp.initialize()
    nlp.config["initialize"]["lookups"] = None
    nlp.initialize()
+
+
+def test_hyphen_in_config():
+    hyphen_config_str = """
+    [nlp]
+    lang = "en"
+    pipeline = ["my_punctual_component"]
+
+    [components]
+
+    [components.my_punctual_component]
+    factory = "my_punctual_component"
+    punctuation = ["?","-"]
+    """
+
+    @spacy.Language.factory("my_punctual_component")
+    class MyPunctualComponent(object):
+        name = "my_punctual_component"
+
+        def __init__(
+            self,
+            nlp,
+            name,
+            punctuation,
+        ):
+            self.punctuation = punctuation
+
+    nlp = English.from_config(load_config_from_str(hyphen_config_str))
+    assert nlp.get_pipe("my_punctual_component").punctuation == ['?', '-']  # the bare "-" item must reach the component unchanged
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@ -108,7 +108,7 @@ def test_serialize_subclassed_kb():
            super().__init__(vocab, entity_vector_length)
            self.custom_field = custom_field

-    @registry.misc.register("spacy.CustomKB.v1")
+    @registry.misc("spacy.CustomKB.v1")
    def custom_kb(
        entity_vector_length: int, custom_field: int
    ) -> Callable[["Vocab"], KnowledgeBase]:
--- a/Show More
+++ b/Show More