Merge remote-tracking branch 'upstream/master' into bugfix/revert-token-match

2025-11-07 11:27:37 +03:00 · 2020-05-05 09:25:57 +02:00 · 2020-05-05 09:25:57 +02:00 · 792c8af8cf
commit 792c8af8cf
parent 0c31f03ec5 c045a9c7f6
190 changed files with 11460 additions and 5158 deletions
--- a/.github/contributors/Baciccin.md
+++ b/.github/contributors/Baciccin.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                    |
+|------------------------------- | ------------------------ |
+| Name                           | Giovanni Battista Parodi |
+| Company name (if applicable)   |                          |
+| Title or role (if applicable)  |                          |
+| Date                           | 2020-03-19               |
+| GitHub username                | Baciccin                 |
+| Website (optional)             |                          |
--- a/.github/contributors/MiniLau.md
+++ b/.github/contributors/MiniLau.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           |  Desausoi Laurent    |
+| Company name (if applicable)   |          /           |
+| Title or role (if applicable)  |          /           |
+| Date                           |  22 November 2019    |
+| GitHub username                |        MiniLau       |
+| Website (optional)             |          /           |
--- a/.github/contributors/Mlawrence95.md
+++ b/.github/contributors/Mlawrence95.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           |   Mike Lawrence      |
+| Company name (if applicable)   |        NA            |
+| Title or role (if applicable)  |        NA            |
+| Date                           |     April 17, 2020   |
+| GitHub username                |     Mlawrence95      |
+| Website (optional)             |                      |
--- a/.github/contributors/Tiljander.md
+++ b/.github/contributors/Tiljander.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           |  Henrik Tiljander    |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           |   24/3/2020          |
+| GitHub username                |     Tiljander        |
+| Website (optional)             |                      |
--- a/.github/contributors/YohannesDatasci.md
+++ b/.github/contributors/YohannesDatasci.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [X] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           |     Yohannes         |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           |     2020-04-02       |
+| GitHub username                |   YohannesDatasci    |
+| Website (optional)             |                      |
--- a/.github/contributors/chopeen.md
+++ b/.github/contributors/chopeen.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Marek Grzenkowicz    |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020.04.10           |
+| GitHub username                | chopeen              |
+| Website (optional)             |                      |
--- a/.github/contributors/elben10
+++ b/.github/contributors/elben10
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Jakob Jul Elben      |
+| Company name (if applicable)   | N/A                  |
+| Title or role (if applicable)  | N/A                  |
+| Date                           | April 16th, 2020     |
+| GitHub username                | elben10              |
+| Website (optional)             | N/A                  |
--- a/.github/contributors/guerda.md
+++ b/.github/contributors/guerda.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Philip Gillißen      |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-03-24           |
+| GitHub username                | guerda               |
+| Website (optional)             |                      |
--- a/.github/contributors/jacse.md
+++ b/.github/contributors/jacse.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Jacob Lauritzen      |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-03-30           |
+| GitHub username                | jacse                |
+| Website (optional)             |                      |
--- a/.github/contributors/koaning.md
+++ b/.github/contributors/koaning.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your 
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                    |
+|------------------------------- | ------------------------ |
+| Name                           | Vincent D. Warmerdam     |
+| Company name (if applicable)   |                          |
+| Title or role (if applicable)  | Data Person              |
+| Date                           | 2020-03-01               |
+| GitHub username                | koaning                  |
+| Website (optional)             | https://koaning.io       |
--- a/.github/contributors/laszabine.md
+++ b/.github/contributors/laszabine.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Sabine Laszakovits                     |
+| Company name (if applicable)   | Austrian Academy of Sciences                     |
+| Title or role (if applicable)  | Data analyst                     |
+| Date                           | 2020-04-16                     |
+| GitHub username                | laszabine                     |
+| Website (optional)             | https://sabine.laszakovits.net                     |
--- a/.github/contributors/leicmi.md
+++ b/.github/contributors/leicmi.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Michael Leichtfried  |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 30.03.2020           |
+| GitHub username                | leicmi               |
+| Website (optional)             |                      |
--- a/.github/contributors/louisguitton.md
+++ b/.github/contributors/louisguitton.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Louis Guitton        |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-04-25           |
+| GitHub username                | louisguitton         |
+| Website (optional)             | https://guitton.co/  |
--- a/.github/contributors/merrcury.md
+++ b/.github/contributors/merrcury.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [X] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Himanshu Garg        |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-03-10           |
+| GitHub username                | merrcury             |
+| Website (optional)             |                      |
--- a/.github/contributors/michael-k.md
+++ b/.github/contributors/michael-k.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [X] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Michael Käufl        |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-04-23           |
+| GitHub username                | michael-k            |
+| Website (optional)             |                      |
--- a/.github/contributors/nikhilsaldanha.md
+++ b/.github/contributors/nikhilsaldanha.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [x] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Nikhil Saldanha      |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-03-17           |
+| GitHub username                | nikhilsaldanha       |
+| Website (optional)             |                      |
--- a/.github/contributors/paoloq.md
+++ b/.github/contributors/paoloq.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Paolo Arduin         |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 9 April 2020         |
+| GitHub username                | paoloq               |
+| Website (optional)             |                      |
--- a/.github/contributors/pinealan.md
+++ b/.github/contributors/pinealan.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Alan Chan            |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-03-15           |
+| GitHub username                | pinealan             |
+| Website (optional)             | http://pinealan.xyz  |
--- a/.github/contributors/punitvara.md
+++ b/.github/contributors/punitvara.md
@ -0,0 +1,107 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your 
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                    |
+|------------------------------- | ------------------------ |
+| Name                           | Punit Vara               |
+| Company name (if applicable)   |                          |
+| Title or role (if applicable)  |                          |
+| Date                           | 2020-04-26               |
+| GitHub username                | punitvara                |
+| Website (optional)             | https://punitvara.com    |
+
--- a/.github/contributors/sabiqueqb.md
+++ b/.github/contributors/sabiqueqb.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [ ] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [x] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Sabique Ahammed Lava |
+| Company name (if applicable)   | QBurst               |
+| Title or role (if applicable)  | Senior Engineer      |
+| Date                           | 24 Apr 2020          |
+| GitHub username                | sabiqueqb            |
+| Website (optional)             |                      |
--- a/.github/contributors/sebastienharinck.md
+++ b/.github/contributors/sebastienharinck.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [ ] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [x] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                                        |
+|------------------------------- | -------------------------------------------- |
+| Name                           | Sébastien Harinck                            |
+| Company name (if applicable)   | Odaxiom                                      |
+| Title or role (if applicable)  | ML Engineer                                  |
+| Date                           | 2020-04-15                                   |
+| GitHub username                | sebastienharinck                             |
+| Website (optional)             | [https://odaxiom.com](https://odaxiom.com)   |
--- a/.github/contributors/sloev.md
+++ b/.github/contributors/sloev.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your 
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                    |
+|------------------------------- | ------------------------ |
+| Name                           | Johannes Valbjørn        |
+| Company name (if applicable)   |                          |
+| Title or role (if applicable)  |                          |
+| Date                           | 2020-03-13               |
+| GitHub username                | sloev                    |
+| Website (optional)             | https://sloev.github.io  |
--- a/.github/contributors/thomasthiebaud.md
+++ b/.github/contributors/thomasthiebaud.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1.  The term "contribution" or "contributed materials" means any source code,
+    object code, patch, tool, sample, graphic, specification, manual,
+    documentation, or any other material posted or submitted by you to the project.
+
+2.  With respect to any worldwide copyrights, or copyright applications and
+    registrations, in your contribution:
+
+        * you hereby assign to us joint ownership, and to the extent that such
+        assignment is or becomes invalid, ineffective or unenforceable, you hereby
+        grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+        royalty-free, unrestricted license to exercise all rights under those
+        copyrights. This includes, at our option, the right to sublicense these same
+        rights to third parties through multiple levels of sublicensees or other
+        licensing arrangements;
+
+        * you agree that each of us can do all things in relation to your
+        contribution as if each of us were the sole owners, and if one of us makes
+        a derivative work of your contribution, the one who makes the derivative
+        work (or has it made) will be the sole owner of that derivative work;
+
+        * you agree that you will not assert any moral rights in your contribution
+        against us, our licensees or transferees;
+
+        * you agree that we may register a copyright in your contribution and
+        exercise all ownership rights associated with it; and
+
+        * you agree that neither of us has any duty to consult with, obtain the
+        consent of, pay or render an accounting to the other for any use or
+        distribution of your contribution.
+
+3.  With respect to any patents you own, or that you can license without payment
+    to any third party, you hereby grant to us a perpetual, irrevocable,
+    non-exclusive, worldwide, no-charge, royalty-free license to:
+
+        * make, have made, use, sell, offer to sell, import, and otherwise transfer
+        your contribution in whole or in part, alone or in combination with or
+        included in any product, work or materials arising out of the project to
+        which your contribution was submitted, and
+
+        * at our option, to sublicense these same rights to third parties through
+        multiple levels of sublicensees or other licensing arrangements.
+
+4.  Except as set out above, you keep all right, title, and interest in your
+    contribution. The rights that you grant to us under these terms are effective
+    on the date you first submitted a contribution to us, even if your submission
+    took place before the date you sign these terms.
+
+5.  You covenant, represent, warrant and agree that:
+
+    - Each contribution that you submit is and shall be an original work of
+      authorship and you can legally grant the rights set out in this SCA;
+
+    - to the best of your knowledge, each contribution will not violate any
+      third party's copyrights, trademarks, patents, or other intellectual
+      property rights; and
+
+    - each contribution shall be in compliance with U.S. export control laws and
+      other applicable export and import laws. You agree to notify us if you
+      become aware of any circumstance which would make any of the foregoing
+      representations inaccurate in any respect. We may publicly disclose your
+      participation in the project, including the fact that you have signed the SCA.
+
+6.  This SCA is governed by the laws of the State of California and applicable
+    U.S. Federal law. Any choice of law rules will not apply.
+
+7.  Please place an “x” on one of the applicable statements below. Please do NOT
+    mark both statements:
+
+        * [x] I am signing on behalf of myself as an individual and no other person
+        or entity, including my employer, has or will have rights with respect to my
+        contributions.
+
+        * [ ] I am signing on behalf of my employer or a legal entity and I have the
+        actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                         | Entry           |
+| ----------------------------- | --------------- |
+| Name                          | Thomas Thiebaud |
+| Company name (if applicable)  |                 |
+| Title or role (if applicable) |                 |
+| Date                          | 2020-04-07      |
+| GitHub username               | thomasthiebaud  |
+| Website (optional)            |                 |
--- a/.github/contributors/tommilligan.md
+++ b/.github/contributors/tommilligan.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1.  The term "contribution" or "contributed materials" means any source code,
+    object code, patch, tool, sample, graphic, specification, manual,
+    documentation, or any other material posted or submitted by you to the project.
+
+2.  With respect to any worldwide copyrights, or copyright applications and
+    registrations, in your contribution:
+
+        * you hereby assign to us joint ownership, and to the extent that such
+        assignment is or becomes invalid, ineffective or unenforceable, you hereby
+        grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+        royalty-free, unrestricted license to exercise all rights under those
+        copyrights. This includes, at our option, the right to sublicense these same
+        rights to third parties through multiple levels of sublicensees or other
+        licensing arrangements;
+
+        * you agree that each of us can do all things in relation to your
+        contribution as if each of us were the sole owners, and if one of us makes
+        a derivative work of your contribution, the one who makes the derivative
+        work (or has it made) will be the sole owner of that derivative work;
+
+        * you agree that you will not assert any moral rights in your contribution
+        against us, our licensees or transferees;
+
+        * you agree that we may register a copyright in your contribution and
+        exercise all ownership rights associated with it; and
+
+        * you agree that neither of us has any duty to consult with, obtain the
+        consent of, pay or render an accounting to the other for any use or
+        distribution of your contribution.
+
+3.  With respect to any patents you own, or that you can license without payment
+    to any third party, you hereby grant to us a perpetual, irrevocable,
+    non-exclusive, worldwide, no-charge, royalty-free license to:
+
+        * make, have made, use, sell, offer to sell, import, and otherwise transfer
+        your contribution in whole or in part, alone or in combination with or
+        included in any product, work or materials arising out of the project to
+        which your contribution was submitted, and
+
+        * at our option, to sublicense these same rights to third parties through
+        multiple levels of sublicensees or other licensing arrangements.
+
+4.  Except as set out above, you keep all right, title, and interest in your
+    contribution. The rights that you grant to us under these terms are effective
+    on the date you first submitted a contribution to us, even if your submission
+    took place before the date you sign these terms.
+
+5.  You covenant, represent, warrant and agree that:
+
+    - Each contribution that you submit is and shall be an original work of
+      authorship and you can legally grant the rights set out in this SCA;
+
+    - to the best of your knowledge, each contribution will not violate any
+      third party's copyrights, trademarks, patents, or other intellectual
+      property rights; and
+
+    - each contribution shall be in compliance with U.S. export control laws and
+      other applicable export and import laws. You agree to notify us if you
+      become aware of any circumstance which would make any of the foregoing
+      representations inaccurate in any respect. We may publicly disclose your
+      participation in the project, including the fact that you have signed the SCA.
+
+6.  This SCA is governed by the laws of the State of California and applicable
+    U.S. Federal law. Any choice of law rules will not apply.
+
+7.  Please place an “x” on one of the applicable statements below. Please do NOT
+    mark both statements:
+
+        * [x] I am signing on behalf of myself as an individual and no other person
+        or entity, including my employer, has or will have rights with respect to my
+        contributions.
+
+        * [ ] I am signing on behalf of my employer or a legal entity and I have the
+        actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                         | Entry        |
+| ----------------------------- | ------------ |
+| Name                          | Tom Milligan |
+| Company name (if applicable)  |              |
+| Title or role (if applicable) |              |
+| Date                          | 2020-03-24   |
+| GitHub username               | tommilligan  |
+| Website (optional)            |              |
--- a/.github/contributors/umarbutler.md
+++ b/.github/contributors/umarbutler.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your 
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                    |
+|------------------------------- | ------------------------ |
+| Name                           | Umar Butler              |
+| Company name (if applicable)   |                          |
+| Title or role (if applicable)  |                          |
+| Date                           | 2020-04-09               |
+| GitHub username                | umarbutler               |
+| Website (optional)             | https://umarbutler.com   |
--- a/.github/contributors/vondersam.md
+++ b/.github/contributors/vondersam.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                   |
+|------------------------------- | ------------------------|
+| Name                           | Samuel Rodríguez Medina |
+| Company name (if applicable)   |                         |
+| Title or role (if applicable)  | Computational linguist  |
+| Date                           | 28 April 2020           |
+| GitHub username                | vondersam               |
+| Website (optional)             |                         |
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,11 @@ corpora/
 keys/
 *.json.gz

+# Tests
+spacy/tests/package/setup.cfg
+spacy/tests/package/pyproject.toml
+spacy/tests/package/requirements.txt
+
 # Website
 website/.cache/
 website/public/
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (C) 2016-2019 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2020 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -48,7 +48,7 @@ jobs:
        imageName: 'vs2017-win2016'
        python.version: '3.6'
      Python36Mac:
-        imageName: 'macos-10.13'
+        imageName: 'macos-10.14'
        python.version: '3.6'
      # Don't test on 3.7 for now to speed up builds
      # Python37Linux:
@ -67,7 +67,7 @@ jobs:
        imageName: 'vs2017-win2016'
        python.version: '3.8'
      Python38Mac:
-        imageName: 'macos-10.13'
+        imageName: 'macos-10.14'
        python.version: '3.8'
    maxParallel: 4
  pool:
--- a/bin/wiki_entity_linking/README.md
+++ b/bin/wiki_entity_linking/README.md
@ -1,37 +0,0 @@
-## Entity Linking with Wikipedia and Wikidata
-
-### Step 1: Create a Knowledge Base (KB) and training data
-
-Run  `wikidata_pretrain_kb.py` 
-* This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file**
-  * WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/
-  * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language)
-* You can set the filtering parameters for KB construction:
-  * `max_per_alias` (`-a`): (max) number of candidate entities in the KB per alias/synonym
-  * `min_freq` (`-f`): threshold of number of times an entity should occur in the corpus to be included in the KB
-  * `min_pair` (`-c`): threshold of number of times an entity+alias combination should occur in the corpus to be included in the KB
-* Further parameters to set:
-  * `descriptions_from_wikipedia` (`-wp`): whether to parse descriptions from Wikipedia (`True`) or Wikidata (`False`)
-  * `entity_vector_length` (`-v`): length of the pre-trained entity description vectors
-  * `lang` (`-la`): language for which to fetch Wikidata information (as the dump contains all languages)
-
-Quick testing and rerunning: 
-* When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything. 
-  * e.g. set `-lt 20000 -lp 2000 -lw 3000 -f 1`
-* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed.
-
-
-### Step 2: Train an Entity Linking model
-
-Run  `wikidata_train_entity_linker.py` 
-* This takes the **KB directory** produced by Step 1, and trains an **Entity Linking model**
-* Specify the output directory (`-o`) in which the final, trained model will be saved
-* You can set the learning parameters for the EL training:
-  * `epochs` (`-e`): number of training iterations
-  * `dropout` (`-p`): dropout rate
-  * `lr` (`-n`): learning rate
-  * `l2` (`-r`): L2 regularization
-* Specify the number of training and dev testing articles with `train_articles` (`-t`) and `dev_articles` (`-d`) respectively
-  * If not specified, the full dataset will be processed - this may take a LONG time !
-* Further parameters to set:
-  * `labels_discard` (`-l`): NER label types to discard during training
--- a/bin/wiki_entity_linking/init.py
+++ b/bin/wiki_entity_linking/init.py
@ -1,12 +0,0 @@
-TRAINING_DATA_FILE = "gold_entities.jsonl"
-KB_FILE = "kb"
-KB_MODEL_DIR = "nlp_kb"
-OUTPUT_MODEL_DIR = "nlp"
-
-PRIOR_PROB_PATH = "prior_prob.csv"
-ENTITY_DEFS_PATH = "entity_defs.csv"
-ENTITY_FREQ_PATH = "entity_freq.csv"
-ENTITY_ALIAS_PATH = "entity_alias.csv"
-ENTITY_DESCR_PATH = "entity_descriptions.csv"
-
-LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
--- a/bin/wiki_entity_linking/entity_linker_evaluation.py
+++ b/bin/wiki_entity_linking/entity_linker_evaluation.py
@ -1,204 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import logging
-import random
-from tqdm import tqdm
-from collections import defaultdict
-
-logger = logging.getLogger(__name__)
-
-
-class Metrics(object):
-    true_pos = 0
-    false_pos = 0
-    false_neg = 0
-
-    def update_results(self, true_entity, candidate):
-        candidate_is_correct = true_entity == candidate
-
-        # Assume that we have no labeled negatives in the data (i.e. cases where true_entity is "NIL")
-        # Therefore, if candidate_is_correct then we have a true positive and never a true negative.
-        self.true_pos += candidate_is_correct
-        self.false_neg += not candidate_is_correct
-        if candidate and candidate not in {"", "NIL"}:
-            # A wrong prediction (e.g. Q42 != Q3) counts both as a FP as well as a FN.
-            self.false_pos += not candidate_is_correct
-
-    def calculate_precision(self):
-        if self.true_pos == 0:
-            return 0.0
-        else:
-            return self.true_pos / (self.true_pos + self.false_pos)
-
-    def calculate_recall(self):
-        if self.true_pos == 0:
-            return 0.0
-        else:
-            return self.true_pos / (self.true_pos + self.false_neg)
-
-    def calculate_fscore(self):
-        p = self.calculate_precision()
-        r = self.calculate_recall()
-        if p + r == 0:
-            return 0.0
-        else:
-            return 2 * p * r / (p + r)
-
-
-class EvaluationResults(object):
-    def __init__(self):
-        self.metrics = Metrics()
-        self.metrics_by_label = defaultdict(Metrics)
-
-    def update_metrics(self, ent_label, true_entity, candidate):
-        self.metrics.update_results(true_entity, candidate)
-        self.metrics_by_label[ent_label].update_results(true_entity, candidate)
-
-    def report_metrics(self, model_name):
-        model_str = model_name.title()
-        recall = self.metrics.calculate_recall()
-        precision = self.metrics.calculate_precision()
-        fscore = self.metrics.calculate_fscore()
-        return (
-            "{}: ".format(model_str)
-            + "F-score = {} | ".format(round(fscore, 3))
-            + "Recall = {} | ".format(round(recall, 3))
-            + "Precision = {} | ".format(round(precision, 3))
-            + "F-score by label = {}".format(
-                {k: v.calculate_fscore() for k, v in sorted(self.metrics_by_label.items())}
-            )
-        )
-
-
-class BaselineResults(object):
-    def __init__(self):
-        self.random = EvaluationResults()
-        self.prior = EvaluationResults()
-        self.oracle = EvaluationResults()
-
-    def report_performance(self, model):
-        results = getattr(self, model)
-        return results.report_metrics(model)
-
-    def update_baselines(
-        self,
-        true_entity,
-        ent_label,
-        random_candidate,
-        prior_candidate,
-        oracle_candidate,
-    ):
-        self.oracle.update_metrics(ent_label, true_entity, oracle_candidate)
-        self.prior.update_metrics(ent_label, true_entity, prior_candidate)
-        self.random.update_metrics(ent_label, true_entity, random_candidate)
-
-
-def measure_performance(dev_data, kb, el_pipe, baseline=True, context=True, dev_limit=None):
-    counts = dict()
-    baseline_results = BaselineResults()
-    context_results = EvaluationResults()
-    combo_results = EvaluationResults()
-
-    for doc, gold in tqdm(dev_data, total=dev_limit, leave=False, desc='Processing dev data'):
-        if len(doc) > 0:
-            correct_ents = dict()
-            for entity, kb_dict in gold.links.items():
-                start, end = entity
-                for gold_kb, value in kb_dict.items():
-                    if value:
-                        # only evaluating on positive examples
-                        offset = _offset(start, end)
-                        correct_ents[offset] = gold_kb
-
-            if baseline:
-                _add_baseline(baseline_results, counts, doc, correct_ents, kb)
-
-            if context:
-                # using only context
-                el_pipe.cfg["incl_context"] = True
-                el_pipe.cfg["incl_prior"] = False
-                _add_eval_result(context_results, doc, correct_ents, el_pipe)
-
-                # measuring combined accuracy (prior + context)
-                el_pipe.cfg["incl_context"] = True
-                el_pipe.cfg["incl_prior"] = True
-                _add_eval_result(combo_results, doc, correct_ents, el_pipe)
-
-    if baseline:
-        logger.info("Counts: {}".format({k: v for k, v in sorted(counts.items())}))
-        logger.info(baseline_results.report_performance("random"))
-        logger.info(baseline_results.report_performance("prior"))
-        logger.info(baseline_results.report_performance("oracle"))
-
-    if context:
-        logger.info(context_results.report_metrics("context only"))
-        logger.info(combo_results.report_metrics("context and prior"))
-
-
-def _add_eval_result(results, doc, correct_ents, el_pipe):
-    """
-    Evaluate the ent.kb_id_ annotations against the gold standard.
-    Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
-    """
-    try:
-        doc = el_pipe(doc)
-        for ent in doc.ents:
-            ent_label = ent.label_
-            start = ent.start_char
-            end = ent.end_char
-            offset = _offset(start, end)
-            gold_entity = correct_ents.get(offset, None)
-            # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
-            if gold_entity is not None:
-                pred_entity = ent.kb_id_
-                results.update_metrics(ent_label, gold_entity, pred_entity)
-
-    except Exception as e:
-        logging.error("Error assessing accuracy " + str(e))
-
-
-def _add_baseline(baseline_results, counts, doc, correct_ents, kb):
-    """
-    Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound.
-    Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
-    """
-    for ent in doc.ents:
-        ent_label = ent.label_
-        start = ent.start_char
-        end = ent.end_char
-        offset = _offset(start, end)
-        gold_entity = correct_ents.get(offset, None)
-
-        # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
-        if gold_entity is not None:
-            candidates = kb.get_candidates(ent.text)
-            oracle_candidate = ""
-            prior_candidate = ""
-            random_candidate = ""
-            if candidates:
-                scores = []
-
-                for c in candidates:
-                    scores.append(c.prior_prob)
-                    if c.entity_ == gold_entity:
-                        oracle_candidate = c.entity_
-
-                best_index = scores.index(max(scores))
-                prior_candidate = candidates[best_index].entity_
-                random_candidate = random.choice(candidates).entity_
-
-            current_count = counts.get(ent_label, 0)
-            counts[ent_label] = current_count+1
-
-            baseline_results.update_baselines(
-                gold_entity,
-                ent_label,
-                random_candidate,
-                prior_candidate,
-                oracle_candidate,
-            )
-
-
-def _offset(start, end):
-    return "{}_{}".format(start, end)
--- a/bin/wiki_entity_linking/kb_creator.py
+++ b/bin/wiki_entity_linking/kb_creator.py
@ -1,161 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import logging
-
-from spacy.kb import KnowledgeBase
-
-from bin.wiki_entity_linking.train_descriptions import EntityEncoder
-from bin.wiki_entity_linking import wiki_io as io
-
-
-logger = logging.getLogger(__name__)
-
-
-def create_kb(
-    nlp,
-    max_entities_per_alias,
-    min_entity_freq,
-    min_occ,
-    entity_def_path,
-    entity_descr_path,
-    entity_alias_path,
-    entity_freq_path,
-    prior_prob_path,
-    entity_vector_length,
-):
-    # Create the knowledge base from Wikidata entries
-    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)
-    entity_list, filtered_title_to_id = _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length)
-    _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path)
-    return kb
-
-
-def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length):
-    # read the mappings from file
-    title_to_id = io.read_title_to_id(entity_def_path)
-    id_to_descr = io.read_id_to_descr(entity_descr_path)
-
-    # check the length of the nlp vectors
-    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
-        input_dim = nlp.vocab.vectors_length
-        logger.info("Loaded pretrained vectors of size %s" % input_dim)
-    else:
-        raise ValueError(
-            "The `nlp` object should have access to pretrained word vectors, "
-            " cf. https://spacy.io/usage/models#languages."
-        )
-
-    logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
-    entity_frequencies = io.read_entity_to_count(entity_freq_path)
-    # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
-    filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
-        title_to_id,
-        id_to_descr,
-        entity_frequencies,
-        min_entity_freq
-    )
-    logger.info("Kept {} entities from the set of {}".format(len(description_list), len(title_to_id.keys())))
-
-    logger.info("Training entity encoder")
-    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
-    encoder.train(description_list=description_list, to_print=True)
-
-    logger.info("Getting entity embeddings")
-    embeddings = encoder.apply_encoder(description_list)
-
-    logger.info("Adding {} entities".format(len(entity_list)))
-    kb.set_entities(
-        entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
-    )
-    return entity_list, filtered_title_to_id
-
-
-def _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
-    logger.info("Adding aliases from Wikipedia and Wikidata")
-    _add_aliases(
-        kb,
-        entity_list=entity_list,
-        title_to_id=filtered_title_to_id,
-        max_entities_per_alias=max_entities_per_alias,
-        min_occ=min_occ,
-        prior_prob_path=prior_prob_path,
-    )
-
-
-def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies,
-                          min_entity_freq: int = 10):
-    filtered_title_to_id = dict()
-    entity_list = []
-    description_list = []
-    frequency_list = []
-    for title, entity in title_to_id.items():
-        freq = entity_frequencies.get(title, 0)
-        desc = id_to_descr.get(entity, None)
-        if desc and freq > min_entity_freq:
-            entity_list.append(entity)
-            description_list.append(desc)
-            frequency_list.append(freq)
-            filtered_title_to_id[title] = entity
-    return filtered_title_to_id, entity_list, description_list, frequency_list
-
-
-def _add_aliases(kb, entity_list, title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
-    wp_titles = title_to_id.keys()
-
-    # adding aliases with prior probabilities
-    # we can read this file sequentially, it's sorted by alias, and then by count
-    logger.info("Adding WP aliases")
-    with prior_prob_path.open("r", encoding="utf8") as prior_file:
-        # skip header
-        prior_file.readline()
-        line = prior_file.readline()
-        previous_alias = None
-        total_count = 0
-        counts = []
-        entities = []
-        while line:
-            splits = line.replace("\n", "").split(sep="|")
-            new_alias = splits[0]
-            count = int(splits[1])
-            entity = splits[2]
-
-            if new_alias != previous_alias and previous_alias:
-                # done reading the previous alias --> output
-                if len(entities) > 0:
-                    selected_entities = []
-                    prior_probs = []
-                    for ent_count, ent_string in zip(counts, entities):
-                        if ent_string in wp_titles:
-                            wd_id = title_to_id[ent_string]
-                            p_entity_givenalias = ent_count / total_count
-                            selected_entities.append(wd_id)
-                            prior_probs.append(p_entity_givenalias)
-
-                    if selected_entities:
-                        try:
-                            kb.add_alias(
-                                alias=previous_alias,
-                                entities=selected_entities,
-                                probabilities=prior_probs,
-                            )
-                        except ValueError as e:
-                            logger.error(e)
-                total_count = 0
-                counts = []
-                entities = []
-
-            total_count += count
-
-            if len(entities) < max_entities_per_alias and count >= min_occ:
-                counts.append(count)
-                entities.append(entity)
-            previous_alias = new_alias
-
-            line = prior_file.readline()
-
-
-def read_kb(nlp, kb_file):
-    kb = KnowledgeBase(vocab=nlp.vocab)
-    kb.load_bulk(kb_file)
-    return kb
--- a/bin/wiki_entity_linking/train_descriptions.py
+++ b/bin/wiki_entity_linking/train_descriptions.py
@ -1,152 +0,0 @@
-# coding: utf-8
-from random import shuffle
-
-import logging
-import numpy as np
-
-from spacy._ml import zero_init, create_default_optimizer
-from spacy.cli.pretrain import get_cossim_loss
-
-from thinc.v2v import Model
-from thinc.api import chain
-from thinc.neural._classes.affine import Affine
-
-logger = logging.getLogger(__name__)
-
-
-class EntityEncoder:
-    """
-    Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D).
-    This entity vector will be stored in the KB, for further downstream use in the entity model.
-    """
-
-    DROP = 0
-    BATCH_SIZE = 1000
-
-    # Set min. acceptable loss to avoid a 'mean of empty slice' warning by numpy
-    MIN_LOSS = 0.01
-
-    # Reasonable default to stop training when things are not improving
-    MAX_NO_IMPROVEMENT = 20
-
-    def __init__(self, nlp, input_dim, desc_width, epochs=5):
-        self.nlp = nlp
-        self.input_dim = input_dim
-        self.desc_width = desc_width
-        self.epochs = epochs
-
-    def apply_encoder(self, description_list):
-        if self.encoder is None:
-            raise ValueError("Can not apply encoder before training it")
-
-        batch_size = 100000
-
-        start = 0
-        stop = min(batch_size, len(description_list))
-        encodings = []
-
-        while start < len(description_list):
-            docs = list(self.nlp.pipe(description_list[start:stop]))
-            doc_embeddings = [self._get_doc_embedding(doc) for doc in docs]
-            enc = self.encoder(np.asarray(doc_embeddings))
-            encodings.extend(enc.tolist())
-
-            start = start + batch_size
-            stop = min(stop + batch_size, len(description_list))
-            logger.info("Encoded: {} entities".format(stop))
-
-        return encodings
-
-    def train(self, description_list, to_print=False):
-        processed, loss = self._train_model(description_list)
-        if to_print:
-            logger.info(
-                "Trained entity descriptions on {} ".format(processed) +
-                "(non-unique) descriptions across {} ".format(self.epochs) +
-                "epochs"
-            )
-            logger.info("Final loss: {}".format(loss))
-
-    def _train_model(self, description_list):
-        best_loss = 1.0
-        iter_since_best = 0
-        self._build_network(self.input_dim, self.desc_width)
-
-        processed = 0
-        loss = 1
-        # copy this list so that shuffling does not affect other functions
-        descriptions = description_list.copy()
-        to_continue = True
-
-        for i in range(self.epochs):
-            shuffle(descriptions)
-
-            batch_nr = 0
-            start = 0
-            stop = min(self.BATCH_SIZE, len(descriptions))
-
-            while to_continue and start < len(descriptions):
-                batch = []
-                for descr in descriptions[start:stop]:
-                    doc = self.nlp(descr)
-                    doc_vector = self._get_doc_embedding(doc)
-                    batch.append(doc_vector)
-
-                loss = self._update(batch)
-                if batch_nr % 25 == 0:
-                    logger.info("loss: {} ".format(loss))
-                processed += len(batch)
-
-                # in general, continue training if we haven't reached our ideal min yet
-                to_continue = loss > self.MIN_LOSS
-
-                # store the best loss and track how long it's been
-                if loss < best_loss:
-                    best_loss = loss
-                    iter_since_best = 0
-                else:
-                    iter_since_best += 1
-
-                # stop learning if we haven't seen improvement since the last few iterations
-                if iter_since_best > self.MAX_NO_IMPROVEMENT:
-                    to_continue = False
-
-                batch_nr += 1
-                start = start + self.BATCH_SIZE
-                stop = min(stop + self.BATCH_SIZE, len(descriptions))
-
-        return processed, loss
-
-    @staticmethod
-    def _get_doc_embedding(doc):
-        indices = np.zeros((len(doc),), dtype="i")
-        for i, word in enumerate(doc):
-            if word.orth in doc.vocab.vectors.key2row:
-                indices[i] = doc.vocab.vectors.key2row[word.orth]
-            else:
-                indices[i] = 0
-        word_vectors = doc.vocab.vectors.data[indices]
-        doc_vector = np.mean(word_vectors, axis=0)
-        return doc_vector
-
-    def _build_network(self, orig_width, hidden_with):
-        with Model.define_operators({">>": chain}):
-            # very simple encoder-decoder model
-            self.encoder = Affine(hidden_with, orig_width)
-            self.model = self.encoder >> zero_init(
-                Affine(orig_width, hidden_with, drop_factor=0.0)
-            )
-        self.sgd = create_default_optimizer(self.model.ops)
-
-    def _update(self, vectors):
-        predictions, bp_model = self.model.begin_update(
-            np.asarray(vectors), drop=self.DROP
-        )
-        loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors))
-        bp_model(d_scores, sgd=self.sgd)
-        return loss / len(vectors)
-
-    @staticmethod
-    def _get_loss(golds, scores):
-        loss, gradients = get_cossim_loss(scores, golds)
-        return loss, gradients
--- a/bin/wiki_entity_linking/wiki_io.py
+++ b/bin/wiki_entity_linking/wiki_io.py
@ -1,127 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import sys
-import csv
-
-# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/
-csv.field_size_limit(min(sys.maxsize, 2147483646))
-
-""" This class provides reading/writing methods for temp files """
-
-
-# Entity definition: WP title -> WD ID #
-def write_title_to_id(entity_def_output, title_to_id):
-    with entity_def_output.open("w", encoding="utf8") as id_file:
-        id_file.write("WP_title" + "|" + "WD_id" + "\n")
-        for title, qid in title_to_id.items():
-            id_file.write(title + "|" + str(qid) + "\n")
-
-
-def read_title_to_id(entity_def_output):
-    title_to_id = dict()
-    with entity_def_output.open("r", encoding="utf8") as id_file:
-        csvreader = csv.reader(id_file, delimiter="|")
-        # skip header
-        next(csvreader)
-        for row in csvreader:
-            title_to_id[row[0]] = row[1]
-    return title_to_id
-
-
-# Entity aliases from WD: WD ID -> WD alias #
-def write_id_to_alias(entity_alias_path, id_to_alias):
-    with entity_alias_path.open("w", encoding="utf8") as alias_file:
-        alias_file.write("WD_id" + "|" + "alias" + "\n")
-        for qid, alias_list in id_to_alias.items():
-            for alias in alias_list:
-                alias_file.write(str(qid) + "|" + alias + "\n")
-
-
-def read_id_to_alias(entity_alias_path):
-    id_to_alias = dict()
-    with entity_alias_path.open("r", encoding="utf8") as alias_file:
-        csvreader = csv.reader(alias_file, delimiter="|")
-        # skip header
-        next(csvreader)
-        for row in csvreader:
-            qid = row[0]
-            alias = row[1]
-            alias_list = id_to_alias.get(qid, [])
-            alias_list.append(alias)
-            id_to_alias[qid] = alias_list
-    return id_to_alias
-
-
-def read_alias_to_id_generator(entity_alias_path):
-    """ Read (aliases, qid) tuples """
-
-    with entity_alias_path.open("r", encoding="utf8") as alias_file:
-        csvreader = csv.reader(alias_file, delimiter="|")
-        # skip header
-        next(csvreader)
-        for row in csvreader:
-            qid = row[0]
-            alias = row[1]
-            yield alias, qid
-
-
-# Entity descriptions from WD: WD ID -> WD alias #
-def write_id_to_descr(entity_descr_output, id_to_descr):
-    with entity_descr_output.open("w", encoding="utf8") as descr_file:
-        descr_file.write("WD_id" + "|" + "description" + "\n")
-        for qid, descr in id_to_descr.items():
-            descr_file.write(str(qid) + "|" + descr + "\n")
-
-
-def read_id_to_descr(entity_desc_path):
-    id_to_desc = dict()
-    with entity_desc_path.open("r", encoding="utf8") as descr_file:
-        csvreader = csv.reader(descr_file, delimiter="|")
-        # skip header
-        next(csvreader)
-        for row in csvreader:
-            id_to_desc[row[0]] = row[1]
-    return id_to_desc
-
-
-# Entity counts from WP: WP title -> count #
-def write_entity_to_count(prior_prob_input, count_output):
-    # Write entity counts for quick access later
-    entity_to_count = dict()
-    total_count = 0
-
-    with prior_prob_input.open("r", encoding="utf8") as prior_file:
-        # skip header
-        prior_file.readline()
-        line = prior_file.readline()
-
-        while line:
-            splits = line.replace("\n", "").split(sep="|")
-            # alias = splits[0]
-            count = int(splits[1])
-            entity = splits[2]
-
-            current_count = entity_to_count.get(entity, 0)
-            entity_to_count[entity] = current_count + count
-
-            total_count += count
-
-            line = prior_file.readline()
-
-    with count_output.open("w", encoding="utf8") as entity_file:
-        entity_file.write("entity" + "|" + "count" + "\n")
-        for entity, count in entity_to_count.items():
-            entity_file.write(entity + "|" + str(count) + "\n")
-
-
-def read_entity_to_count(count_input):
-    entity_to_count = dict()
-    with count_input.open("r", encoding="utf8") as csvfile:
-        csvreader = csv.reader(csvfile, delimiter="|")
-        # skip header
-        next(csvreader)
-        for row in csvreader:
-            entity_to_count[row[0]] = int(row[1])
-
-    return entity_to_count
--- a/bin/wiki_entity_linking/wiki_namespaces.py
+++ b/bin/wiki_entity_linking/wiki_namespaces.py
@ -1,128 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# List of meta pages in Wikidata, should be kept out of the Knowledge base
-WD_META_ITEMS = [
-    "Q163875",
-    "Q191780",
-    "Q224414",
-    "Q4167836",
-    "Q4167410",
-    "Q4663903",
-    "Q11266439",
-    "Q13406463",
-    "Q15407973",
-    "Q18616576",
-    "Q19887878",
-    "Q22808320",
-    "Q23894233",
-    "Q33120876",
-    "Q42104522",
-    "Q47460393",
-    "Q64875536",
-    "Q66480449",
-]
-
-
-# TODO: add more cases from non-English WP's
-
-# List of prefixes that refer to Wikipedia "file" pages
-WP_FILE_NAMESPACE = ["Bestand", "File"]
-
-# List of prefixes that refer to Wikipedia "category" pages
-WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"]
-
-# List of prefixes that refer to Wikipedia "meta" pages
-# these will/should be matched ignoring case
-WP_META_NAMESPACE = (
-    WP_FILE_NAMESPACE
-    + WP_CATEGORY_NAMESPACE
-    + [
-        "b",
-        "betawikiversity",
-        "Book",
-        "c",
-        "Commons",
-        "d",
-        "dbdump",
-        "download",
-        "Draft",
-        "Education",
-        "Foundation",
-        "Gadget",
-        "Gadget definition",
-        "Gebruiker",
-        "gerrit",
-        "Help",
-        "Image",
-        "Incubator",
-        "m",
-        "mail",
-        "mailarchive",
-        "media",
-        "MediaWiki",
-        "MediaWiki talk",
-        "Mediawikiwiki",
-        "MediaZilla",
-        "Meta",
-        "Metawikipedia",
-        "Module",
-        "mw",
-        "n",
-        "nost",
-        "oldwikisource",
-        "otrs",
-        "OTRSwiki",
-        "Overleg gebruiker",
-        "outreach",
-        "outreachwiki",
-        "Portal",
-        "phab",
-        "Phabricator",
-        "Project",
-        "q",
-        "quality",
-        "rev",
-        "s",
-        "spcom",
-        "Special",
-        "species",
-        "Strategy",
-        "sulutil",
-        "svn",
-        "Talk",
-        "Template",
-        "Template talk",
-        "Testwiki",
-        "ticket",
-        "TimedText",
-        "Toollabs",
-        "tools",
-        "tswiki",
-        "User",
-        "User talk",
-        "v",
-        "voy",
-        "w",
-        "Wikibooks",
-        "Wikidata",
-        "wikiHow",
-        "Wikinvest",
-        "wikilivres",
-        "Wikimedia",
-        "Wikinews",
-        "Wikipedia",
-        "Wikipedia talk",
-        "Wikiquote",
-        "Wikisource",
-        "Wikispecies",
-        "Wikitech",
-        "Wikiversity",
-        "Wikivoyage",
-        "wikt",
-        "wiktionary",
-        "wmf",
-        "wmania",
-        "WP",
-    ]
-)
--- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py
+++ b/bin/wiki_entity_linking/wikidata_pretrain_kb.py
@ -1,179 +0,0 @@
-# coding: utf-8
-"""Script to process Wikipedia and Wikidata dumps and create a knowledge base (KB)
-with specific parameters. Intermediate files are written to disk.
-
-Running the full pipeline on a standard laptop, may take up to 13 hours of processing.
-Use the -p, -d and -s options to speed up processing using the intermediate files
-from a previous run.
-
-For the Wikidata dump: get the latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
-For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
-from https://dumps.wikimedia.org/enwiki/latest/
-
-"""
-from __future__ import unicode_literals
-
-import logging
-from pathlib import Path
-import plac
-
-from bin.wiki_entity_linking import wikipedia_processor as wp, wikidata_processor as wd
-from bin.wiki_entity_linking import wiki_io as io
-from bin.wiki_entity_linking import kb_creator
-from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_FILE, ENTITY_DESCR_PATH, KB_MODEL_DIR, LOG_FORMAT
-from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH, ENTITY_ALIAS_PATH
-import spacy
-from bin.wiki_entity_linking.kb_creator import read_kb
-
-logger = logging.getLogger(__name__)
-
-
-@plac.annotations(
-    wd_json=("Path to the downloaded WikiData JSON dump.", "positional", None, Path),
-    wp_xml=("Path to the downloaded Wikipedia XML dump.", "positional", None, Path),
-    output_dir=("Output directory", "positional", None, Path),
-    model=("Model name or path, should include pretrained vectors.", "positional", None, str),
-    max_per_alias=("Max. # entities per alias (default 10)", "option", "a", int),
-    min_freq=("Min. count of an entity in the corpus (default 20)", "option", "f", int),
-    min_pair=("Min. count of entity-alias pairs (default 5)", "option", "c", int),
-    entity_vector_length=("Length of entity vectors (default 64)", "option", "v", int),
-    loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path),
-    loc_entity_defs=("Location to file with entity definitions", "option", "d", Path),
-    loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path),
-    descr_from_wp=("Flag for using descriptions from WP instead of WD (default False)", "flag", "wp"),
-    limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int),
-    limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int),
-    limit_wd=("Threshold to limit lines read from WD", "option", "lw", int),
-    lang=("Optional language for which to get Wikidata titles. Defaults to 'en'", "option", "la", str),
-)
-def main(
-    wd_json,
-    wp_xml,
-    output_dir,
-    model,
-    max_per_alias=10,
-    min_freq=20,
-    min_pair=5,
-    entity_vector_length=64,
-    loc_prior_prob=None,
-    loc_entity_defs=None,
-    loc_entity_alias=None,
-    loc_entity_desc=None,
-    descr_from_wp=False,
-    limit_prior=None,
-    limit_train=None,
-    limit_wd=None,
-    lang="en",
-):
-    entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH
-    entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH
-    entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH
-    entity_freq_path = output_dir / ENTITY_FREQ_PATH
-    prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH
-    training_entities_path = output_dir / TRAINING_DATA_FILE
-    kb_path = output_dir / KB_FILE
-
-    logger.info("Creating KB with Wikipedia and WikiData")
-
-    # STEP 0: set up IO
-    if not output_dir.exists():
-        output_dir.mkdir(parents=True)
-
-    # STEP 1: Load the NLP object
-    logger.info("STEP 1: Loading NLP model {}".format(model))
-    nlp = spacy.load(model)
-
-    # check the length of the nlp vectors
-    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
-        raise ValueError(
-            "The `nlp` object should have access to pretrained word vectors, "
-            " cf. https://spacy.io/usage/models#languages."
-        )
-
-    # STEP 2: create prior probabilities from WP
-    if not prior_prob_path.exists():
-        # It takes about 2h to process 1000M lines of Wikipedia XML dump
-        logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path))
-        if limit_prior is not None:
-            logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior))
-        wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior)
-    else:
-        logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path))
-
-    # STEP 3: calculate entity frequencies
-    if not entity_freq_path.exists():
-        logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path))
-        io.write_entity_to_count(prior_prob_path, entity_freq_path)
-    else:
-        logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path))
-
-    # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
-    if (not entity_defs_path.exists()) or (not descr_from_wp and not entity_descr_path.exists()):
-        # It takes about 10h to process 55M lines of Wikidata JSON dump
-        logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path))
-        if limit_wd is not None:
-            logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd))
-        title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json(
-            wd_json,
-            limit_wd,
-            to_print=False,
-            lang=lang,
-            parse_descr=(not descr_from_wp),
-        )
-        io.write_title_to_id(entity_defs_path, title_to_id)
-
-        logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path))
-        io.write_id_to_alias(entity_alias_path, id_to_alias)
-
-        if not descr_from_wp:
-            logger.info("STEP 4c: Writing Wikidata entity descriptions to {}".format(entity_descr_path))
-            io.write_id_to_descr(entity_descr_path, id_to_descr)
-    else:
-        logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path))
-        logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path))
-        if not descr_from_wp:
-            logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path))
-
-    # STEP 5: Getting gold entities from Wikipedia
-    if (not training_entities_path.exists()) or (descr_from_wp and not entity_descr_path.exists()):
-        logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path))
-        if limit_train is not None:
-            logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train))
-        wp.create_training_and_desc(wp_xml, entity_defs_path, entity_descr_path,
-                                    training_entities_path, descr_from_wp, limit_train)
-        if descr_from_wp:
-            logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path))
-    else:
-        logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path))
-        if descr_from_wp:
-            logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path))
-
-    # STEP 6: creating the actual KB
-    # It takes ca. 30 minutes to pretrain the entity embeddings
-    if not kb_path.exists():
-        logger.info("STEP 6: Creating the KB at {}".format(kb_path))
-        kb = kb_creator.create_kb(
-            nlp=nlp,
-            max_entities_per_alias=max_per_alias,
-            min_entity_freq=min_freq,
-            min_occ=min_pair,
-            entity_def_path=entity_defs_path,
-            entity_descr_path=entity_descr_path,
-            entity_alias_path=entity_alias_path,
-            entity_freq_path=entity_freq_path,
-            prior_prob_path=prior_prob_path,
-            entity_vector_length=entity_vector_length,
-        )
-        kb.dump(kb_path)
-        logger.info("kb entities: {}".format(kb.get_size_entities()))
-        logger.info("kb aliases: {}".format(kb.get_size_aliases()))
-        nlp.to_disk(output_dir / KB_MODEL_DIR)
-    else:
-        logger.info("STEP 6: KB already exists at {}".format(kb_path))
-
-    logger.info("Done!")
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
-    plac.call(main)
--- a/bin/wiki_entity_linking/wikidata_processor.py
+++ b/bin/wiki_entity_linking/wikidata_processor.py
@ -1,154 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import bz2
-import json
-import logging
-
-from bin.wiki_entity_linking.wiki_namespaces import WD_META_ITEMS
-
-logger = logging.getLogger(__name__)
-
-
-def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descr=True):
-    # Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines.
-    # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
-
-    site_filter = '{}wiki'.format(lang)
-
-    # filter: currently defined as OR: one hit suffices to be removed from further processing
-    exclude_list = WD_META_ITEMS
-
-    # punctuation
-    exclude_list.extend(["Q1383557", "Q10617810"])
-
-    # letters etc
-    exclude_list.extend(["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"])
-
-    neg_prop_filter = {
-        'P31': exclude_list,    # instance of
-        'P279': exclude_list    # subclass
-    }
-
-    title_to_id = dict()
-    id_to_descr = dict()
-    id_to_alias = dict()
-
-    # parse appropriate fields - depending on what we need in the KB
-    parse_properties = False
-    parse_sitelinks = True
-    parse_labels = False
-    parse_aliases = True
-    parse_claims = True
-
-    with bz2.open(wikidata_file, mode='rb') as file:
-        for cnt, line in enumerate(file):
-            if limit and cnt >= limit:
-                break
-            if cnt % 500000 == 0 and cnt > 0:
-                logger.info("processed {} lines of WikiData JSON dump".format(cnt))
-            clean_line = line.strip()
-            if clean_line.endswith(b","):
-                clean_line = clean_line[:-1]
-            if len(clean_line) > 1:
-                obj = json.loads(clean_line)
-                entry_type = obj["type"]
-
-                if entry_type == "item":
-                    keep = True
-
-                    claims = obj["claims"]
-                    if parse_claims:
-                        for prop, value_set in neg_prop_filter.items():
-                            claim_property = claims.get(prop, None)
-                            if claim_property:
-                                for cp in claim_property:
-                                    cp_id = (
-                                        cp["mainsnak"]
-                                        .get("datavalue", {})
-                                        .get("value", {})
-                                        .get("id")
-                                    )
-                                    cp_rank = cp["rank"]
-                                    if cp_rank != "deprecated" and cp_id in value_set:
-                                        keep = False
-
-                    if keep:
-                        unique_id = obj["id"]
-
-                        if to_print:
-                            print("ID:", unique_id)
-                            print("type:", entry_type)
-
-                        # parsing all properties that refer to other entities
-                        if parse_properties:
-                            for prop, claim_property in claims.items():
-                                cp_dicts = [
-                                    cp["mainsnak"]["datavalue"].get("value")
-                                    for cp in claim_property
-                                    if cp["mainsnak"].get("datavalue")
-                                ]
-                                cp_values = [
-                                    cp_dict.get("id")
-                                    for cp_dict in cp_dicts
-                                    if isinstance(cp_dict, dict)
-                                    if cp_dict.get("id") is not None
-                                ]
-                                if cp_values:
-                                    if to_print:
-                                        print("prop:", prop, cp_values)
-
-                        found_link = False
-                        if parse_sitelinks:
-                            site_value = obj["sitelinks"].get(site_filter, None)
-                            if site_value:
-                                site = site_value["title"]
-                                if to_print:
-                                    print(site_filter, ":", site)
-                                title_to_id[site] = unique_id
-                                found_link = True
-
-                        if parse_labels:
-                            labels = obj["labels"]
-                            if labels:
-                                lang_label = labels.get(lang, None)
-                                if lang_label:
-                                    if to_print:
-                                        print(
-                                            "label (" + lang + "):", lang_label["value"]
-                                        )
-
-                        if found_link and parse_descr:
-                            descriptions = obj["descriptions"]
-                            if descriptions:
-                                lang_descr = descriptions.get(lang, None)
-                                if lang_descr:
-                                    if to_print:
-                                        print(
-                                            "description (" + lang + "):",
-                                            lang_descr["value"],
-                                        )
-                                    id_to_descr[unique_id] = lang_descr["value"]
-
-                        if parse_aliases:
-                            aliases = obj["aliases"]
-                            if aliases:
-                                lang_aliases = aliases.get(lang, None)
-                                if lang_aliases:
-                                    for item in lang_aliases:
-                                        if to_print:
-                                            print(
-                                                "alias (" + lang + "):", item["value"]
-                                            )
-                                        alias_list = id_to_alias.get(unique_id, [])
-                                        alias_list.append(item["value"])
-                                        id_to_alias[unique_id] = alias_list
-
-                        if to_print:
-                            print()
-
-    # log final number of lines processed
-    logger.info("Finished. Processed {} lines of WikiData JSON dump".format(cnt))
-    return title_to_id, id_to_descr, id_to_alias
-
-
--- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py
+++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
@ -1,172 +0,0 @@
-# coding: utf-8
-"""Script that takes a previously created Knowledge Base and trains an entity linking
-pipeline. The provided KB directory should hold the kb, the original nlp object and
-its vocab used to create the KB, and a few auxiliary files such as the entity definitions,
-as created by the script `wikidata_create_kb`.
-
-For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
-from https://dumps.wikimedia.org/enwiki/latest/
-"""
-from __future__ import unicode_literals
-
-import random
-import logging
-import spacy
-from pathlib import Path
-import plac
-from tqdm import tqdm
-
-from bin.wiki_entity_linking import wikipedia_processor
-from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_MODEL_DIR, KB_FILE, LOG_FORMAT, OUTPUT_MODEL_DIR
-from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance
-from bin.wiki_entity_linking.kb_creator import read_kb
-
-from spacy.util import minibatch, compounding
-
-logger = logging.getLogger(__name__)
-
-
-@plac.annotations(
-    dir_kb=("Directory with KB, NLP and related files", "positional", None, Path),
-    output_dir=("Output directory", "option", "o", Path),
-    loc_training=("Location to training data", "option", "k", Path),
-    epochs=("Number of training iterations (default 10)", "option", "e", int),
-    dropout=("Dropout to prevent overfitting (default 0.5)", "option", "p", float),
-    lr=("Learning rate (default 0.005)", "option", "n", float),
-    l2=("L2 regularization", "option", "r", float),
-    train_articles=("# training articles (default 90% of all)", "option", "t", int),
-    dev_articles=("# dev test articles (default 10% of all)", "option", "d", int),
-    labels_discard=("NER labels to discard (default None)", "option", "l", str),
-)
-def main(
-    dir_kb,
-    output_dir=None,
-    loc_training=None,
-    epochs=10,
-    dropout=0.5,
-    lr=0.005,
-    l2=1e-6,
-    train_articles=None,
-    dev_articles=None,
-    labels_discard=None
-):
-    if not output_dir:
-        logger.warning("No output dir specified so no results will be written, are you sure about this ?")
-
-    logger.info("Creating Entity Linker with Wikipedia and WikiData")
-
-    output_dir = Path(output_dir) if output_dir else dir_kb
-    training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE
-    nlp_dir = dir_kb / KB_MODEL_DIR
-    kb_path = dir_kb / KB_FILE
-    nlp_output_dir = output_dir / OUTPUT_MODEL_DIR
-
-    # STEP 0: set up IO
-    if not output_dir.exists():
-        output_dir.mkdir()
-
-    # STEP 1 : load the NLP object
-    logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
-    nlp = spacy.load(nlp_dir)
-    logger.info("Original NLP pipeline has following pipeline components: {}".format(nlp.pipe_names))
-
-    # check that there is a NER component in the pipeline
-    if "ner" not in nlp.pipe_names:
-        raise ValueError("The `nlp` object should have a pretrained `ner` component.")
-
-    logger.info("STEP 1b: Loading KB from {}".format(kb_path))
-    kb = read_kb(nlp, kb_path)
-
-    # STEP 2: read the training dataset previously created from WP
-    logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path))
-    train_indices, dev_indices = wikipedia_processor.read_training_indices(training_path)
-    logger.info("Training set has {} articles, limit set to roughly {} articles per epoch"
-                .format(len(train_indices), train_articles if train_articles else "all"))
-    logger.info("Dev set has {} articles, limit set to rougly {} articles for evaluation"
-                .format(len(dev_indices), dev_articles if dev_articles else "all"))
-    if dev_articles:
-        dev_indices = dev_indices[0:dev_articles]
-
-    # STEP 3: create and train an entity linking pipe
-    logger.info("STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(epochs))
-    if labels_discard:
-        labels_discard = [x.strip() for x in labels_discard.split(",")]
-        logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard))
-    else:
-        labels_discard = []
-
-    el_pipe = nlp.create_pipe(
-        name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name,
-                                      "labels_discard": labels_discard}
-    )
-    el_pipe.set_kb(kb)
-    nlp.add_pipe(el_pipe, last=True)
-
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
-    with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
-        optimizer = nlp.begin_training()
-        optimizer.learn_rate = lr
-        optimizer.L2 = l2
-
-    logger.info("Dev Baseline Accuracies:")
-    dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
-                                                      dev=True, line_ids=dev_indices,
-                                                      kb=kb, labels_discard=labels_discard)
-
-    measure_performance(dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices))
-
-    for itn in range(epochs):
-        random.shuffle(train_indices)
-        losses = {}
-        batches = minibatch(train_indices, size=compounding(8.0, 128.0, 1.001))
-        batchnr = 0
-        articles_processed = 0
-
-        # we either process the whole training file, or just a part each epoch
-        bar_total = len(train_indices)
-        if train_articles:
-            bar_total = train_articles
-
-        with tqdm(total=bar_total, leave=False, desc='Epoch ' + str(itn)) as pbar:
-            for batch in batches:
-                if not train_articles or articles_processed < train_articles:
-                    with nlp.disable_pipes("entity_linker"):
-                        train_batch = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
-                                                                             dev=False, line_ids=batch,
-                                                                             kb=kb, labels_discard=labels_discard)
-                        docs, golds = zip(*train_batch)
-                    try:
-                        with nlp.disable_pipes(*other_pipes):
-                            nlp.update(
-                                docs=docs,
-                                golds=golds,
-                                sgd=optimizer,
-                                drop=dropout,
-                                losses=losses,
-                            )
-                            batchnr += 1
-                            articles_processed += len(docs)
-                            pbar.update(len(docs))
-                    except Exception as e:
-                        logger.error("Error updating batch:" + str(e))
-        if batchnr > 0:
-            logging.info("Epoch {} trained on {} articles, train loss {}"
-                         .format(itn, articles_processed, round(losses["entity_linker"] / batchnr, 2)))
-            # re-read the dev_data (data is returned as a generator)
-            dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
-                                                              dev=True, line_ids=dev_indices,
-                                                              kb=kb, labels_discard=labels_discard)
-            measure_performance(dev_data, kb, el_pipe, baseline=False, context=True, dev_limit=len(dev_indices))
-
-    if output_dir:
-        # STEP 4: write the NLP pipeline (now including an EL model) to file
-        logger.info("Final NLP pipeline has following pipeline components: {}".format(nlp.pipe_names))
-        logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir))
-        nlp.to_disk(nlp_output_dir)
-
-        logger.info("Done!")
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
-    plac.call(main)
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ b/bin/wiki_entity_linking/wikipedia_processor.py
@ -1,565 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import bz2
-import logging
-import random
-import json
-
-from spacy.gold import GoldParse
-from bin.wiki_entity_linking import wiki_io as io
-from bin.wiki_entity_linking.wiki_namespaces import (
-    WP_META_NAMESPACE,
-    WP_FILE_NAMESPACE,
-    WP_CATEGORY_NAMESPACE,
-)
-
-"""
-Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
-Write these results to file for downstream KB and training data generation.
-
-Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
-"""
-
-ENTITY_FILE = "gold_entities.csv"
-
-map_alias_to_link = dict()
-
-logger = logging.getLogger(__name__)
-
-title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
-id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
-text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
-info_regex = re.compile(r"{[^{]*?}")
-html_regex = re.compile(r"&lt;!--[^-]*--&gt;")
-ref_regex = re.compile(r"&lt;ref.*?&gt;")  # non-greedy
-ref_2_regex = re.compile(r"&lt;/ref.*?&gt;")  # non-greedy
-
-# find the links
-link_regex = re.compile(r"\[\[[^\[\]]*\]\]")
-
-# match on interwiki links, e.g. `en:` or `:fr:`
-ns_regex = r":?" + "[a-z][a-z]" + ":"
-# match on Namespace: optionally preceded by a :
-for ns in WP_META_NAMESPACE:
-    ns_regex += "|" + ":?" + ns + ":"
-ns_regex = re.compile(ns_regex, re.IGNORECASE)
-
-files = r""
-for f in WP_FILE_NAMESPACE:
-    files += "\[\[" + f + ":[^[\]]+]]" + "|"
-files = files[0 : len(files) - 1]
-file_regex = re.compile(files)
-
-cats = r""
-for c in WP_CATEGORY_NAMESPACE:
-    cats += "\[\[" + c + ":[^\[]*]]" + "|"
-cats = cats[0 : len(cats) - 1]
-category_regex = re.compile(cats)
-
-
-def read_prior_probs(wikipedia_input, prior_prob_output, limit=None):
-    """
-    Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities.
-    The full file takes about 2-3h to parse 1100M lines.
-    It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from,
-    though dev test articles are excluded in order not to get an artificially strong baseline.
-    """
-    cnt = 0
-    read_id = False
-    current_article_id = None
-    with bz2.open(wikipedia_input, mode="rb") as file:
-        line = file.readline()
-        while line and (not limit or cnt < limit):
-            if cnt % 25000000 == 0 and cnt > 0:
-                logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
-            clean_line = line.strip().decode("utf-8")
-
-            # we attempt at reading the article's ID (but not the revision or contributor ID)
-            if "<revision>" in clean_line or "<contributor>" in clean_line:
-                read_id = False
-            if "<page>" in clean_line:
-                read_id = True
-
-            if read_id:
-                ids = id_regex.search(clean_line)
-                if ids:
-                    current_article_id = ids[0]
-
-            # only processing prior probabilities from true training (non-dev) articles
-            if not is_dev(current_article_id):
-                aliases, entities, normalizations = get_wp_links(clean_line)
-                for alias, entity, norm in zip(aliases, entities, normalizations):
-                    _store_alias(
-                        alias, entity, normalize_alias=norm, normalize_entity=True
-                    )
-
-            line = file.readline()
-            cnt += 1
-        logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
-    logger.info("Finished. processed {} lines of Wikipedia XML dump".format(cnt))
-
-    # write all aliases and their entities and count occurrences to file
-    with prior_prob_output.open("w", encoding="utf8") as outputfile:
-        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
-        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
-            s_dict = sorted(alias_dict.items(), key=lambda x: x[1], reverse=True)
-            for entity, count in s_dict:
-                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
-
-
-def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
-    alias = alias.strip()
-    entity = entity.strip()
-
-    # remove everything after # as this is not part of the title but refers to a specific paragraph
-    if normalize_entity:
-        # wikipedia titles are always capitalized
-        entity = _capitalize_first(entity.split("#")[0])
-    if normalize_alias:
-        alias = alias.split("#")[0]
-
-    if alias and entity:
-        alias_dict = map_alias_to_link.get(alias, dict())
-        entity_count = alias_dict.get(entity, 0)
-        alias_dict[entity] = entity_count + 1
-        map_alias_to_link[alias] = alias_dict
-
-
-def get_wp_links(text):
-    aliases = []
-    entities = []
-    normalizations = []
-
-    matches = link_regex.findall(text)
-    for match in matches:
-        match = match[2:][:-2].replace("_", " ").strip()
-
-        if ns_regex.match(match):
-            pass  # ignore the entity if it points to a "meta" page
-
-        # this is a simple [[link]], with the alias the same as the mention
-        elif "|" not in match:
-            aliases.append(match)
-            entities.append(match)
-            normalizations.append(True)
-
-        # in wiki format, the link is written as [[entity|alias]]
-        else:
-            splits = match.split("|")
-            entity = splits[0].strip()
-            alias = splits[1].strip()
-            # specific wiki format  [[alias (specification)|]]
-            if len(alias) == 0 and "(" in entity:
-                alias = entity.split("(")[0]
-                aliases.append(alias)
-                entities.append(entity)
-                normalizations.append(False)
-            else:
-                aliases.append(alias)
-                entities.append(entity)
-                normalizations.append(False)
-
-    return aliases, entities, normalizations
-
-
-def _capitalize_first(text):
-    if not text:
-        return None
-    result = text[0].capitalize()
-    if len(result) > 0:
-        result += text[1:]
-    return result
-
-
-def create_training_and_desc(
-    wp_input, def_input, desc_output, training_output, parse_desc, limit=None
-):
-    wp_to_id = io.read_title_to_id(def_input)
-    _process_wikipedia_texts(
-        wp_input, wp_to_id, desc_output, training_output, parse_desc, limit
-    )
-
-
-def _process_wikipedia_texts(
-    wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None
-):
-    """
-    Read the XML wikipedia data to parse out training data:
-    raw text data + positive instances
-    """
-
-    read_ids = set()
-
-    with output.open("a", encoding="utf8") as descr_file, training_output.open(
-        "w", encoding="utf8"
-    ) as entity_file:
-        if parse_descriptions:
-            _write_training_description(descr_file, "WD_id", "description")
-        with bz2.open(wikipedia_input, mode="rb") as file:
-            article_count = 0
-            article_text = ""
-            article_title = None
-            article_id = None
-            reading_text = False
-            reading_revision = False
-
-            for line in file:
-                clean_line = line.strip().decode("utf-8")
-
-                if clean_line == "<revision>":
-                    reading_revision = True
-                elif clean_line == "</revision>":
-                    reading_revision = False
-
-                # Start reading new page
-                if clean_line == "<page>":
-                    article_text = ""
-                    article_title = None
-                    article_id = None
-                # finished reading this page
-                elif clean_line == "</page>":
-                    if article_id:
-                        clean_text, entities = _process_wp_text(
-                            article_title, article_text, wp_to_id
-                        )
-                        if clean_text is not None and entities is not None:
-                            _write_training_entities(
-                                entity_file, article_id, clean_text, entities
-                            )
-
-                            if article_title in wp_to_id and parse_descriptions:
-                                description = " ".join(
-                                    clean_text[:1000].split(" ")[:-1]
-                                )
-                                _write_training_description(
-                                    descr_file, wp_to_id[article_title], description
-                                )
-                            article_count += 1
-                            if article_count % 10000 == 0 and article_count > 0:
-                                logger.info(
-                                    "Processed {} articles".format(article_count)
-                                )
-                            if limit and article_count >= limit:
-                                break
-                    article_text = ""
-                    article_title = None
-                    article_id = None
-                    reading_text = False
-                    reading_revision = False
-
-                # start reading text within a page
-                if "<text" in clean_line:
-                    reading_text = True
-
-                if reading_text:
-                    article_text += " " + clean_line
-
-                # stop reading text within a page (we assume a new page doesn't start on the same line)
-                if "</text" in clean_line:
-                    reading_text = False
-
-                # read the ID of this article (outside the revision portion of the document)
-                if not reading_revision:
-                    ids = id_regex.search(clean_line)
-                    if ids:
-                        article_id = ids[0]
-                        if article_id in read_ids:
-                            logger.info(
-                                "Found duplicate article ID", article_id, clean_line
-                            )  # This should never happen ...
-                        read_ids.add(article_id)
-
-                # read the title of this article (outside the revision portion of the document)
-                if not reading_revision:
-                    titles = title_regex.search(clean_line)
-                    if titles:
-                        article_title = titles[0].strip()
-    logger.info("Finished. Processed {} articles".format(article_count))
-
-
-def _process_wp_text(article_title, article_text, wp_to_id):
-    # ignore meta Wikipedia pages
-    if ns_regex.match(article_title):
-        return None, None
-
-    # remove the text tags
-    text_search = text_regex.search(article_text)
-    if text_search is None:
-        return None, None
-    text = text_search.group(0)
-
-    # stop processing if this is a redirect page
-    if text.startswith("#REDIRECT"):
-        return None, None
-
-    # get the raw text without markup etc, keeping only interwiki links
-    clean_text, entities = _remove_links(_get_clean_wp_text(text), wp_to_id)
-    return clean_text, entities
-
-
-def _get_clean_wp_text(article_text):
-    clean_text = article_text.strip()
-
-    # remove bolding & italic markup
-    clean_text = clean_text.replace("'''", "")
-    clean_text = clean_text.replace("''", "")
-
-    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
-    try_again = True
-    previous_length = len(clean_text)
-    while try_again:
-        clean_text = info_regex.sub(
-            "", clean_text
-        )  # non-greedy match excluding a nested {
-        if len(clean_text) < previous_length:
-            try_again = True
-        else:
-            try_again = False
-        previous_length = len(clean_text)
-
-    # remove HTML comments
-    clean_text = html_regex.sub("", clean_text)
-
-    # remove Category and File statements
-    clean_text = category_regex.sub("", clean_text)
-    clean_text = file_regex.sub("", clean_text)
-
-    # remove multiple =
-    while "==" in clean_text:
-        clean_text = clean_text.replace("==", "=")
-
-    clean_text = clean_text.replace(". =", ".")
-    clean_text = clean_text.replace(" = ", ". ")
-    clean_text = clean_text.replace("= ", ".")
-    clean_text = clean_text.replace(" =", "")
-
-    # remove refs (non-greedy match)
-    clean_text = ref_regex.sub("", clean_text)
-    clean_text = ref_2_regex.sub("", clean_text)
-
-    # remove additional wikiformatting
-    clean_text = re.sub(r"&lt;blockquote&gt;", "", clean_text)
-    clean_text = re.sub(r"&lt;/blockquote&gt;", "", clean_text)
-
-    # change special characters back to normal ones
-    clean_text = clean_text.replace(r"&lt;", "<")
-    clean_text = clean_text.replace(r"&gt;", ">")
-    clean_text = clean_text.replace(r"&quot;", '"')
-    clean_text = clean_text.replace(r"&amp;nbsp;", " ")
-    clean_text = clean_text.replace(r"&amp;", "&")
-
-    # remove multiple spaces
-    while "  " in clean_text:
-        clean_text = clean_text.replace("  ", " ")
-
-    return clean_text.strip()
-
-
-def _remove_links(clean_text, wp_to_id):
-    # read the text char by char to get the right offsets for the interwiki links
-    entities = []
-    final_text = ""
-    open_read = 0
-    reading_text = True
-    reading_entity = False
-    reading_mention = False
-    reading_special_case = False
-    entity_buffer = ""
-    mention_buffer = ""
-    for index, letter in enumerate(clean_text):
-        if letter == "[":
-            open_read += 1
-        elif letter == "]":
-            open_read -= 1
-        elif letter == "|":
-            if reading_text:
-                final_text += letter
-            # switch from reading entity to mention in the [[entity|mention]] pattern
-            elif reading_entity:
-                reading_text = False
-                reading_entity = False
-                reading_mention = True
-            else:
-                reading_special_case = True
-        else:
-            if reading_entity:
-                entity_buffer += letter
-            elif reading_mention:
-                mention_buffer += letter
-            elif reading_text:
-                final_text += letter
-            else:
-                raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])
-
-        if open_read > 2:
-            reading_special_case = True
-
-        if open_read == 2 and reading_text:
-            reading_text = False
-            reading_entity = True
-            reading_mention = False
-
-        # we just finished reading an entity
-        if open_read == 0 and not reading_text:
-            if "#" in entity_buffer or entity_buffer.startswith(":"):
-                reading_special_case = True
-            # Ignore cases with nested structures like File: handles etc
-            if not reading_special_case:
-                if not mention_buffer:
-                    mention_buffer = entity_buffer
-                start = len(final_text)
-                end = start + len(mention_buffer)
-                qid = wp_to_id.get(entity_buffer, None)
-                if qid:
-                    entities.append((mention_buffer, qid, start, end))
-                final_text += mention_buffer
-
-            entity_buffer = ""
-            mention_buffer = ""
-
-            reading_text = True
-            reading_entity = False
-            reading_mention = False
-            reading_special_case = False
-    return final_text, entities
-
-
-def _write_training_description(outputfile, qid, description):
-    if description is not None:
-        line = str(qid) + "|" + description + "\n"
-        outputfile.write(line)
-
-
-def _write_training_entities(outputfile, article_id, clean_text, entities):
-    entities_data = [
-        {"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]}
-        for ent in entities
-    ]
-    line = (
-        json.dumps(
-            {
-                "article_id": article_id,
-                "clean_text": clean_text,
-                "entities": entities_data,
-            },
-            ensure_ascii=False,
-        )
-        + "\n"
-    )
-    outputfile.write(line)
-
-
-def read_training_indices(entity_file_path):
-    """ This method creates two lists of indices into the training file: one with indices for the
-     training examples, and one for the dev examples."""
-    train_indices = []
-    dev_indices = []
-
-    with entity_file_path.open("r", encoding="utf8") as file:
-        for i, line in enumerate(file):
-            example = json.loads(line)
-            article_id = example["article_id"]
-            clean_text = example["clean_text"]
-
-            if is_valid_article(clean_text):
-                if is_dev(article_id):
-                    dev_indices.append(i)
-                else:
-                    train_indices.append(i)
-
-    return train_indices, dev_indices
-
-
-def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=None):
-    """ This method provides training/dev examples that correspond to the entity annotations found by the nlp object.
-     For training, it will include both positive and negative examples by using the candidate generator from the kb.
-     For testing (kb=None), it will include all positive examples only."""
-    if not labels_discard:
-        labels_discard = []
-
-    texts = []
-    entities_list = []
-
-    with entity_file_path.open("r", encoding="utf8") as file:
-        for i, line in enumerate(file):
-            if i in line_ids:
-                example = json.loads(line)
-                article_id = example["article_id"]
-                clean_text = example["clean_text"]
-                entities = example["entities"]
-
-                if dev != is_dev(article_id) or not is_valid_article(clean_text):
-                    continue
-
-                texts.append(clean_text)
-                entities_list.append(entities)
-
-    docs = nlp.pipe(texts, batch_size=50)
-
-    for doc, entities in zip(docs, entities_list):
-        gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
-        if gold and len(gold.links) > 0:
-            yield doc, gold
-
-
-def _get_gold_parse(doc, entities, dev, kb, labels_discard):
-    gold_entities = {}
-    tagged_ent_positions = {
-        (ent.start_char, ent.end_char): ent
-        for ent in doc.ents
-        if ent.label_ not in labels_discard
-    }
-
-    for entity in entities:
-        entity_id = entity["entity"]
-        alias = entity["alias"]
-        start = entity["start"]
-        end = entity["end"]
-
-        candidate_ids = []
-        if kb and not dev:
-            candidates = kb.get_candidates(alias)
-            candidate_ids = [cand.entity_ for cand in candidates]
-
-        tagged_ent = tagged_ent_positions.get((start, end), None)
-        if tagged_ent:
-            # TODO: check that alias == doc.text[start:end]
-            should_add_ent = (dev or entity_id in candidate_ids) and is_valid_sentence(
-                tagged_ent.sent.text
-            )
-
-            if should_add_ent:
-                value_by_id = {entity_id: 1.0}
-                if not dev:
-                    random.shuffle(candidate_ids)
-                    value_by_id.update(
-                        {kb_id: 0.0 for kb_id in candidate_ids if kb_id != entity_id}
-                    )
-                gold_entities[(start, end)] = value_by_id
-
-    return GoldParse(doc, links=gold_entities)
-
-
-def is_dev(article_id):
-    if not article_id:
-        return False
-    return article_id.endswith("3")
-
-
-def is_valid_article(doc_text):
-    # custom length cut-off
-    return 10 < len(doc_text) < 30000
-
-
-def is_valid_sentence(sent_text):
-    if not 10 < len(sent_text) < 3000:
-        # custom length cut-off
-        return False
-
-    if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"):
-        # remove 'enumeration' sentences (occurs often on Wikipedia)
-        return False
-
-    return True
--- a/examples/information_extraction/phrase_matcher.py
+++ b/examples/information_extraction/phrase_matcher.py
@ -88,8 +88,8 @@ def read_text(bz2_loc, n=10000):
                break


-def get_matches(tokenizer, phrases, texts, max_length=6):
-    matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
+def get_matches(tokenizer, phrases, texts):
+    matcher = PhraseMatcher(tokenizer.vocab)
    matcher.add("Phrase", None, *phrases)
    for text in texts:
        doc = tokenizer(text)
--- a/examples/training/pretrain_kb.py
+++ b/examples/training/pretrain_kb.py
@ -1,15 +1,15 @@
 #!/usr/bin/env python
 # coding: utf8

-"""Example of defining and (pre)training spaCy's knowledge base,
+"""Example of defining a knowledge base in spaCy,
 which is needed to implement entity linking functionality.

 For more details, see the documentation:
 * Knowledge base: https://spacy.io/api/kb
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking

-Compatible with: spaCy v2.2.3
-Last tested with: v2.2.3
+Compatible with: spaCy v2.2.4
+Last tested with: v2.2.4
 """
 from __future__ import unicode_literals, print_function

@ -20,24 +20,18 @@ from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase

-from bin.wiki_entity_linking.train_descriptions import EntityEncoder
-

 # Q2146908 (Russ Cochran): American golfer
 # Q7381115 (Russ Cochran): publisher
 ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}

-INPUT_DIM = 300  # dimension of pretrained input vectors
-DESC_WIDTH = 64  # dimension of output entity vectors
-

@plac.annotations(
    model=("Model name, should have pretrained word embeddings", "positional", None, str),
    output_dir=("Optional output directory", "option", "o", Path),
-    n_iter=("Number of training iterations", "option", "n", int),
 )
-def main(model=None, output_dir=None, n_iter=50):
-    """Load the model, create the KB and pretrain the entity encodings.
+def main(model=None, output_dir=None):
+    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

@ -51,33 +45,23 @@ def main(model=None, output_dir=None, n_iter=50):
            " cf. https://spacy.io/usage/models#languages."
        )

-    kb = KnowledgeBase(vocab=nlp.vocab)
+    # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
+    # For simplicity, we'll just use the original vector dimension here instead.
+    vectors_dim = nlp.vocab.vectors.shape[1]
+    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)

    # set up the data
    entity_ids = []
-    descriptions = []
+    descr_embeddings = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
-        descriptions.append(desc)
+        descr_embeddings.append(nlp(desc).vector)
        freqs.append(freq)

-    # training entity description encodings
-    # this part can easily be replaced with a custom entity encoder
-    encoder = EntityEncoder(
-        nlp=nlp,
-        input_dim=INPUT_DIM,
-        desc_width=DESC_WIDTH,
-        epochs=n_iter,
-    )
-    encoder.train(description_list=descriptions, to_print=True)
-
-    # get the pretrained entity vectors
-    embeddings = encoder.apply_encoder(descriptions)
-
    # set the entities, can also be done by calling `kb.add_entity` for each entity
-    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)
+    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
@ -113,8 +97,8 @@ def main(model=None, output_dir=None, n_iter=50):
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
-        _print_kb(kb2)
        print()
+        _print_kb(kb2)


 def _print_kb(kb):
@ -126,6 +110,5 @@ if __name__ == "__main__":
    plac.call(main)

    # Expected output:
-
    # 2 kb entities: ['Q2146908', 'Q7381115']
    # 1 kb aliases: ['Russ Cochran']
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@ -1,15 +1,15 @@
 #!/usr/bin/env python
 # coding: utf8

-"""Example of training spaCy's entity linker, starting off with an
-existing model and a pre-defined knowledge base.
+"""Example of training spaCy's entity linker, starting off with a predefined
+knowledge base and corresponding vocab, and a blank English model.

 For more details, see the documentation:
 * Training: https://spacy.io/usage/training
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking

-Compatible with: spaCy v2.2.3
-Last tested with: v2.2.3
+Compatible with: spaCy v2.2.4
+Last tested with: v2.2.4
 """
 from __future__ import unicode_literals, print_function

@ -17,13 +17,11 @@ import plac
 import random
 from pathlib import Path

-from spacy.symbols import PERSON
 from spacy.vocab import Vocab

 import spacy
 from spacy.kb import KnowledgeBase
 from spacy.pipeline import EntityRuler
-from spacy.tokens import Span
 from spacy.util import minibatch, compounding


--- a/netlify.toml
+++ b/netlify.toml
@ -7,42 +7,42 @@ redirects = [
    {from = "https://alpha.spacy.io/*", to = "https://spacy.io", force = true},
    {from = "http://alpha.spacy.io/*", to = "https://spacy.io", force = true},
    # Old demos
-    {from = "/demos/*", to = "https://explosion.ai/demos/:splat"},
+    {from = "/demos/*", to = "https://explosion.ai/demos/:splat", force = true},
    # Old blog
-    {from = "/blog/*", to = "https://explosion.ai/blog/:splat"},
-    {from = "/feed", to = "https://explosion.ai/feed"},
-    {from = "/feed.xml", to = "https://explosion.ai/feed"},
+    {from = "/blog/*", to = "https://explosion.ai/blog/:splat", force = true},
+    {from = "/feed", to = "https://explosion.ai/feed", force = true},
+    {from = "/feed.xml", to = "https://explosion.ai/feed", force = true},
    # Old documentation pages (1.x)
-    {from = "/docs/usage/processing-text", to = "/usage/linguistic-features"},
-    {from = "/docs/usage/deep-learning", to = "/usage/training"},
-    {from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging"},
-    {from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse"},
-    {from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities"},
-    {from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity"},
-    {from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization"},
-    {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines"},
-    {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines"},
-    {from = "/docs/usage/training-ner", to = "/usage/training#ner"},
-    {from = "/docs/usage/tutorials", to = "/usage/examples"},
-    {from = "/docs/usage/data-model", to = "/api"},
-    {from = "/docs/usage/cli", to = "/api/cli"},
-    {from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"},
-    {from = "/docs/api/language-models", to = "/usage/models#languages"},
-    {from = "/docs/api/spacy", to = "/docs/api/top-level"},
-    {from = "/docs/api/displacy", to = "/api/top-level#displacy"},
-    {from = "/docs/api/util", to = "/api/top-level#util"},
-    {from = "/docs/api/features", to = "/models/#architecture"},
-    {from = "/docs/api/philosophy", to = "/usage/spacy-101"},
-    {from = "/docs/usage/showcase", to = "/universe"},
-    {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom"},
-    {from = "/tutorials", to = "/usage/examples"},
+    {from = "/docs/usage/processing-text", to = "/usage/linguistic-features", force = true},
+    {from = "/docs/usage/deep-learning", to = "/usage/training", force = true},
+    {from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging", force = true},
+    {from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse", force = true},
+    {from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities", force = true},
+    {from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity", force = true},
+    {from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
+    {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
+    {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
+    {from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true},
+    {from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
+    {from = "/docs/usage/data-model", to = "/api", force = true},
+    {from = "/docs/usage/cli", to = "/api/cli", force = true},
+    {from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true},
+    {from = "/docs/api/language-models", to = "/usage/models#languages", force = true},
+    {from = "/docs/api/spacy", to = "/docs/api/top-level", force = true},
+    {from = "/docs/api/displacy", to = "/api/top-level#displacy", force = true},
+    {from = "/docs/api/util", to = "/api/top-level#util", force = true},
+    {from = "/docs/api/features", to = "/models/#architecture", force = true},
+    {from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true},
+    {from = "/docs/usage/showcase", to = "/universe", force = true},
+    {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true},
+    {from = "/tutorials", to = "/usage/examples", force = true},
    # Rewrite all other docs pages to /
    {from = "/docs/*", to = "/:splat"},
    # Updated documentation pages
-    {from = "/usage/resources", to = "/universe"},
-    {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"},
-    {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"},
-    {from = "/models/comparison", to = "/models"},
+    {from = "/usage/resources", to = "/universe", force = true},
+    {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true},
+    {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching", force = true},
+    {from = "/models/comparison", to = "/models", force = true},
    {from = "/api/#section-cython", to = "/api/cython", force = true},
    {from = "/api/#cython", to = "/api/cython", force = true},
    {from = "/api/sentencesegmenter", to="/api/sentencizer"},
--- a/setup.cfg
+++ b/setup.cfg
@ -30,7 +30,7 @@ zip_safe = false
 include_package_data = true
 scripts =
    bin/spacy
-python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
+python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*
 setup_requires =
    wheel
    cython>=0.25
@ -61,17 +61,21 @@ install_requires =
 lookups =
    spacy_lookups_data>=0.0.5,<0.2.0
 cuda =
-    cupy>=5.0.0b4
+    cupy>=5.0.0b4,<9.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4
+    cupy-cuda80>=5.0.0b4,<9.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4
+    cupy-cuda90>=5.0.0b4,<9.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4
+    cupy-cuda91>=5.0.0b4,<9.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4
+    cupy-cuda92>=5.0.0b4,<9.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4
+    cupy-cuda100>=5.0.0b4,<9.0.0
+cuda101 =
+    cupy-cuda101>=5.0.0b4,<9.0.0
+cuda102 =
+    cupy-cuda102>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
    fugashi>=0.1.3
--- a/setup.py
+++ b/setup.py
@ -31,7 +31,6 @@ PACKAGES = find_packages()


 MOD_NAMES = [
-    "spacy._align",
    "spacy.parts_of_speech",
    "spacy.strings",
    "spacy.lexeme",
--- a/spacy/init.py
+++ b/spacy/init.py
@ -13,7 +13,7 @@ from . import pipeline
 from .cli.info import info as cli_info
 from .glossary import explain
 from .about import __version__
-from .errors import Errors, Warnings, deprecation_warning
+from .errors import Errors, Warnings
 from . import util
 from .util import registry
 from .language import component
@ -26,7 +26,7 @@ if sys.maxunicode == 65535:
 def load(name, **overrides):
    depr_path = overrides.get("path")
    if depr_path not in (True, False, None):
-        deprecation_warning(Warnings.W001.format(path=depr_path))
+        warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning)
    return util.load_model(name, **overrides)


--- a/spacy/_align.pyx
+++ b/spacy/_align.pyx
@ -1,255 +0,0 @@
-# cython: infer_types=True
-'''Do Levenshtein alignment, for evaluation of tokenized input.
-
-Random notes:
-
-  r i n g
-  0 1 2 3 4
-r 1 0 1 2 3
-a 2 1 1 2 3
-n 3 2 2 1 2
-g 4 3 3 2 1
-
-0,0: (1,1)=min(0+0,1+1,1+1)=0 S
-1,0: (2,1)=min(1+1,0+1,2+1)=1 D
-2,0: (3,1)=min(2+1,3+1,1+1)=2 D
-3,0: (4,1)=min(3+1,4+1,2+1)=3 D
-0,1: (1,2)=min(1+1,2+1,0+1)=1 D
-1,1: (2,2)=min(0+1,1+1,1+1)=1 S
-2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
-3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
-0,2: (1,3)=min(2+1,3+1,1+1)=2 I
-1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
-2,2: (3,3)
-3,2: (4,3)
-At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
-
-We know the costs to transition:
-
-S[:i]   -> T[:j]   (at D[i,j])
-S[:i+1] -> T[:j]   (at D[i+1,j])
-S[:i]   -> T[:j+1] (at D[i,j+1])
-    
-Further, now we can transform:
-S[:i+1] -> S[:i] (DEL) for 1,
-T[:j+1] -> T[:j] (INS) for 1.
-S[i+1]  -> T[j+1] (SUB) for 0 or 1
-
-Therefore we have the costs:
-SUB: Cost(S[:i]->T[:j])   + Cost(S[i]->S[j])
-i.e. D[i, j] + S[i+1] != T[j+1]
-INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
-i.e. D[i+1,j] + 1
-DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) 
-i.e. D[i,j+1] + 1
-
-    Source string S has length m, with index i
-    Target string T has length n, with index j
-
-    Output two alignment vectors: i2j (length m) and j2i (length n)
-    # function LevenshteinDistance(char s[1..m], char t[1..n]):
-    # for all i and j, d[i,j] will hold the Levenshtein distance between
-    # the first i characters of s and the first j characters of t
-    # note that d has (m+1)*(n+1) values
-    # set each element in d to zero
-    ring rang
-      - r i n g
-    - 0 0 0 0 0
-    r 0 0 0 0 0
-    a 0 0 0 0 0
-    n 0 0 0 0 0
-    g 0 0 0 0 0
-
-    # source prefixes can be transformed into empty string by
-    # dropping all characters
-    # d[i, 0] := i
-    ring rang
-      - r i n g
-    - 0 0 0 0 0
-    r 1 0 0 0 0
-    a 2 0 0 0 0
-    n 3 0 0 0 0
-    g 4 0 0 0 0
-
-    # target prefixes can be reached from empty source prefix
-    # by inserting every character
-    # d[0, j] := j
-      - r i n g
-    - 0 1 2 3 4
-    r 1 0 0 0 0
-    a 2 0 0 0 0
-    n 3 0 0 0 0
-    g 4 0 0 0 0
-
-'''
-from __future__ import unicode_literals
-from libc.stdint cimport uint32_t
-import numpy
-cimport numpy as np
-from .compat import unicode_
-from murmurhash.mrmr cimport hash32
-
-
-def align(S, T):
-    cdef int m = len(S)
-    cdef int n = len(T)
-    cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
-    cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
-    cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
-
-    cdef np.ndarray S_arr = _convert_sequence(S)
-    cdef np.ndarray T_arr = _convert_sequence(T)
-
-    fill_matrix(<int*>matrix.data,
-        <const int*>S_arr.data, m, <const int*>T_arr.data, n)
-    fill_i2j(i2j, matrix)
-    fill_j2i(j2i, matrix)
-    for i in range(i2j.shape[0]):
-        if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
-            i2j[i] = -1
-    for j in range(j2i.shape[0]):
-        if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
-            j2i[j] = -1
-    return matrix[-1,-1], i2j, j2i, matrix
-
-
-def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
-    '''Let's say we had:
-
-    Guess: [aa bb cc dd]
-    Truth: [aa bbcc dd]
-    i2j: [0, None, -2, 2]
-    j2i: [0, -2, 3]
-
-    We want:
-
-    i2j_multi: {1: 1, 2: 1}
-    j2i_multi: {}
-    '''
-    i2j_miss = _get_regions(i2j, i_lengths)
-    j2i_miss = _get_regions(j2i, j_lengths)
-
-    i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
-    return i2j_multi, j2i_multi
-
-
-def _get_regions(alignment, lengths):
-    regions = {}
-    start = None
-    offset = 0
-    for i in range(len(alignment)):
-        if alignment[i] < 0:
-            if start is None:
-                start = offset
-                regions.setdefault(start, [])
-            regions[start].append(i)
-        else:
-            start = None
-        offset += lengths[i]
-    return regions
-
-
-def _get_mapping(miss1, miss2, lengths1, lengths2):
-    i2j = {}
-    j2i = {}
-    for start, region1 in miss1.items():
-        if not region1 or start not in miss2:
-            continue
-        region2 = miss2[start]
-        if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
-            j = region2.pop(0)
-            buff = []
-            # Consume tokens from region 1, until we meet the length of the
-            # first token in region2. If we do, align the tokens. If
-            # we exceed the length, break.
-            while region1:
-                buff.append(region1.pop(0))
-                if sum(lengths1[i] for i in buff) == lengths2[j]:
-                    for i in buff:
-                        i2j[i] = j
-                    j2i[j] = buff[-1]
-                    j += 1
-                    buff = []
-                elif sum(lengths1[i] for i in buff) > lengths2[j]:
-                    break
-            else:
-                if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
-                    for i in buff:
-                        i2j[i] = j
-                    j2i[j] = buff[-1]
-    return i2j, j2i
-
-
-def _convert_sequence(seq):
-    if isinstance(seq, numpy.ndarray):
-        return numpy.ascontiguousarray(seq, dtype='uint32_t')
-    cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
-    cdef bytes item_bytes
-    for i, item in enumerate(seq):
-        if item == "``":
-            item = '"'
-        elif item == "''":
-            item = '"'
-        if isinstance(item, unicode):
-            item_bytes = item.encode('utf8')
-        else:
-            item_bytes = item
-        output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
-    return output
-
-
-cdef void fill_matrix(int* D, 
-        const int* S, int m, const int* T, int n) nogil:
-    m1 = m+1
-    n1 = n+1
-    for i in range(m1*n1):
-        D[i] = 0
- 
-    for i in range(m1):
-        D[i*n1] = i
- 
-    for j in range(n1):
-        D[j] = j
- 
-    cdef int sub_cost, ins_cost, del_cost
-    for j in range(n):
-        for i in range(m):
-            i_j = i*n1 + j
-            i1_j1 = (i+1)*n1 + j+1
-            i1_j = (i+1)*n1 + j
-            i_j1 = i*n1 + j+1
-            if S[i] != T[j]:
-                sub_cost = D[i_j] + 1
-            else:
-                sub_cost = D[i_j]
-            del_cost = D[i_j1] + 1
-            ins_cost = D[i1_j] + 1
-            best = min(min(sub_cost, ins_cost), del_cost)
-            D[i1_j1] = best
-
-
-cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
-    j = D.shape[1]-2
-    cdef int i = D.shape[0]-2
-    while i >= 0:
-        while D[i+1, j] < D[i+1, j+1]:
-            j -= 1
-        if D[i, j+1] < D[i+1, j+1]:
-            i2j[i] = -1
-        else:
-            i2j[i] = j
-            j -= 1
-        i -= 1
-
-cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
-    i = D.shape[0]-2
-    cdef int j = D.shape[1]-2
-    while j >= 0:
-        while D[i, j+1] < D[i+1, j+1]:
-            i -= 1
-        if D[i+1, j] < D[i+1, j+1]:
-            j2i[j] = -1
-        else:
-            j2i[j] = i
-            i -= 1
-        j -= 1
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import numpy
+import warnings
 from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
 from thinc.t2t import ExtractWindow, ParametricAttention
 from thinc.t2v import Pooling, sum_pool, mean_pool
@ -22,7 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed
 import thinc.extra.load_nlp

 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
-from .errors import Errors, user_warning, Warnings
+from .errors import Errors, Warnings
 from . import util
 from . import ml as new_ml
 from .ml import _legacy_tok2vec
@ -283,13 +284,13 @@ def link_vectors_to_models(vocab):
    if vectors.name is None:
        vectors.name = VECTORS_KEY
        if vectors.data.size != 0:
-            user_warning(Warnings.W020.format(shape=vectors.data.shape))
+            warnings.warn(Warnings.W020.format(shape=vectors.data.shape))
    ops = Model.ops
    for word in vocab:
        if word.orth in vectors.key2row:
            word.rank = vectors.key2row[word.orth]
        else:
-            word.rank = 0
+            word.rank = util.OOV_RANK
    data = ops.asarray(vectors.data)
    # Set an entry here, so that vectors are accessed by StaticVectors
    # (unideal, I know)
@ -299,7 +300,7 @@ def link_vectors_to_models(vocab):
            # This is a hack to avoid the problem in #3853.
            old_name = vectors.name
            new_name = vectors.name + "_%d" % data.shape[0]
-            user_warning(Warnings.W019.format(old=old_name, new=new_name))
+            warnings.warn(Warnings.W019.format(old=old_name, new=new_name))
            vectors.name = new_name
            key = (ops.device, vectors.name)
    thinc.extra.load_nlp.VECTORS[key] = data
@ -693,9 +694,11 @@ def build_text_classifier(nr_class, width=64, **cfg):
        )

        linear_model = build_bow_text_classifier(
-            nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
+            nr_class,
+            ngram_size=cfg.get("ngram_size", 1),
+            exclusive_classes=cfg.get("exclusive_classes", False),
        )
-        if cfg.get("exclusive_classes"):
+        if cfg.get("exclusive_classes", False):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.2.4.dev0"
+__version__ = "2.2.4"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/analysis.py
+++ b/spacy/analysis.py
@ -1,11 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals

+import warnings
+
 from collections import OrderedDict
 from wasabi import Printer

 from .tokens import Doc, Token, Span
-from .errors import Errors, Warnings, user_warning
+from .errors import Errors, Warnings


 def analyze_pipes(pipeline, name, pipe, index, warn=True):
@ -34,7 +36,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
        if not fulfilled:
            problems.append(annot)
            if warn:
-                user_warning(Warnings.W025.format(name=name, attr=annot))
+                warnings.warn(Warnings.W025.format(name=name, attr=annot))
    return problems


--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -94,3 +94,4 @@ cdef enum attr_id_t:
    ENT_ID = symbols.ENT_ID

    IDX
+    SENT_END
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -88,6 +88,7 @@ IDS = {
    "ENT_KB_ID": ENT_KB_ID,
    "HEAD": HEAD,
    "SENT_START": SENT_START,
+    "SENT_END": SENT_END,
    "SPACY": SPACY,
    "PROB": PROB,
    "LANG": LANG,
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000


@plac.annotations(
+    # fmt: off
    lang=("model language", "positional", None, str),
    train_path=("location of JSON-formatted training data", "positional", None, Path),
    dev_path=("location of JSON-formatted development data", "positional", None, Path),
    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
    base_model=("name of model to update (optional)", "option", "b", str),
-    pipeline=(
-        "Comma-separated names of pipeline components to train",
-        "option",
-        "p",
-        str,
-    ),
+    pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
    ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
    verbose=("Print additional information and explanations", "flag", "V", bool),
    no_format=("Don't pretty-print the results", "flag", "NF", bool),
+    # fmt: on
 )
 def debug_data(
    lang,
@ -111,9 +108,11 @@ def debug_data(
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
-    gold_dev_data = _compile_gold(dev_docs, pipeline)
+    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_docs_unpreprocessed, pipeline, nlp
+    )
+    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]
@ -185,6 +184,16 @@ def debug_data(
                nlp.vocab.vectors_length,
            )
        )
+        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
+        msg.warn(
+            "{} words in training data without vectors ({:0.2f}%)".format(
+                n_missing_vectors,
+                n_missing_vectors / gold_train_data["n_words"],
+            ),
+        )
+        msg.text(
+            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+        )
    else:
        msg.info("No word vectors present in the model")

@ -235,13 +244,17 @@ def debug_data(

        if gold_train_data["ws_ents"]:
            msg.fail(
-                "{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
+                "{} invalid whitespace entity span(s)".format(
+                    gold_train_data["ws_ents"]
+                )
            )
            has_ws_ents_error = True

        if gold_train_data["punct_ents"]:
            msg.warn(
-                "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
+                "{} entity span(s) with punctuation".format(
+                    gold_train_data["punct_ents"]
+                )
            )
            has_punct_ents_warning = True

@ -561,7 +574,7 @@ def _load_file(file_path, msg):
    )


-def _compile_gold(train_docs, pipeline):
+def _compile_gold(train_docs, pipeline, nlp):
    data = {
        "ner": Counter(),
        "cats": Counter(),
@ -573,6 +586,7 @@ def _compile_gold(train_docs, pipeline):
        "punct_ents": 0,
        "n_words": 0,
        "n_misaligned_words": 0,
+        "words_missing_vectors": Counter(),
        "n_sents": 0,
        "n_nonproj": 0,
        "n_cycles": 0,
@ -585,6 +599,10 @@ def _compile_gold(train_docs, pipeline):
        data["n_words"] += len(valid_words)
        data["n_misaligned_words"] += len(gold.words) - len(valid_words)
        data["texts"].add(doc.text)
+        if len(nlp.vocab.vectors):
+            for word in valid_words:
+                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
+                    data["words_missing_vectors"].update([word])
        if "ner" in pipeline:
            for i, label in enumerate(gold.ner):
                if label is None:
@ -592,7 +610,13 @@ def _compile_gold(train_docs, pipeline):
                if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
                    # "Illegal" whitespace entity
                    data["ws_ents"] += 1
-                if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
+                if label.startswith(("B-", "U-", "L-")) and doc[i].text in [
+                    ".",
+                    "'",
+                    "!",
+                    "?",
+                    ",",
+                ]:
                    # punctuation entity: could be replaced by whitespace when training with noise,
                    # so add a warning to alert the user to this unexpected side effect.
                    data["punct_ents"] += 1
@ -629,7 +653,11 @@ def _format_labels(labels, counts=False):
 def _get_examples_without_label(data, label):
    count = 0
    for doc, gold in data:
-        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        labels = [
+            label.split("-")[1]
+            for label in gold.ner
+            if label is not None and label not in ("O", "-")
+        ]
        if label not in labels:
            count += 1
    return count
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -2,6 +2,7 @@
 from __future__ import unicode_literals, division, print_function

 import plac
+import spacy
 from timeit import default_timer as timer
 from wasabi import msg

@ -43,7 +44,10 @@ def evaluate(
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)
    corpus = GoldCorpus(data_path, data_path)
-    nlp = util.load_model(model)
+    if model.startswith("blank:"):
+        nlp = spacy.blank(model.replace("blank:", ""))
+    else:
+        nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
    begin = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@ -12,11 +12,12 @@ import tarfile
 import gzip
 import zipfile
 import srsly
+import warnings
 from wasabi import msg

 from ..vectors import Vectors
-from ..errors import Errors, Warnings, user_warning
-from ..util import ensure_path, get_lang_class
+from ..errors import Errors, Warnings
+from ..util import ensure_path, get_lang_class, OOV_RANK

 try:
    import ftfy
@ -34,6 +35,12 @@ DEFAULT_OOV_PROB = -20
    jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
    clusters_loc=("Optional location of brown clusters data", "option", "c", str),
    vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
+    truncate_vectors=(
+        "Optional number of vectors to truncate to when reading in vectors file",
+        "option",
+        "t",
+        int,
+    ),
    prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
    vectors_name=(
        "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
@ -50,6 +57,7 @@ def init_model(
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
+    truncate_vectors=0,
    prune_vectors=-1,
    vectors_name=None,
    model_name=None,
@ -87,7 +95,7 @@ def init_model(
        nlp = create_model(lang, lex_attrs, name=model_name)
    msg.good("Successfully created model")
    if vectors_loc is not None:
-        add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
+        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
@ -148,7 +156,7 @@ def create_model(lang, lex_attrs, name=None):
    lang_class = get_lang_class(lang)
    nlp = lang_class()
    for lexeme in nlp.vocab:
-        lexeme.rank = 0
+        lexeme.rank = OOV_RANK
    lex_added = 0
    for attrs in lex_attrs:
        if "settings" in attrs:
@ -168,7 +176,7 @@ def create_model(lang, lex_attrs, name=None):
    return nlp


-def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
+def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@ -178,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
    else:
        if vectors_loc:
            with msg.loading("Reading vectors from {}".format(vectors_loc)):
-                vectors_data, vector_keys = read_vectors(vectors_loc)
+                vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
            msg.good("Loaded vectors from {}".format(vectors_loc))
        else:
            vectors_data, vector_keys = (None, None)
@ -198,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
        nlp.vocab.prune_vectors(prune_vectors)


-def read_vectors(vectors_loc):
+def read_vectors(vectors_loc, truncate_vectors=0):
    f = open_file(vectors_loc)
    shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
    vectors_data = numpy.zeros(shape=shape, dtype="f")
    vectors_keys = []
    for i, line in enumerate(tqdm(f)):
@ -211,6 +221,8 @@ def read_vectors(vectors_loc):
            msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
        vectors_data[i] = numpy.asarray(pieces, dtype="f")
        vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
    return vectors_data, vectors_keys


@ -246,7 +258,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
 def read_clusters(clusters_loc):
    clusters = {}
    if ftfy is None:
-        user_warning(Warnings.W004)
+        warnings.warn(Warnings.W004)
    with clusters_loc.open() as f:
        for line in tqdm(f):
            try:
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -225,7 +225,9 @@ def train(
                            exits=1,
                        )
                msg.text("Extending component from base model '{}'".format(pipe))
-        disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
+        disabled_pipes = nlp.disable_pipes(
+            [p for p in nlp.pipe_names if p not in pipeline]
+        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
@ -361,7 +363,7 @@ def train(
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
-                    "exclusive classes, provide '--textcat_positive_label' for "
+                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
@ -415,10 +417,10 @@ def train(
                            losses=losses,
                        )
                    except ValueError as e:
-                        msg.warn("Error during training")
+                        err = "Error during training"
                        if init_tok2vec:
-                            msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
-                        msg.fail("Original error message: {}".format(e), exits=1)
+                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
+                        msg.fail(err, "Original error message: {}".format(e), exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
@ -452,22 +454,25 @@ def train(
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
-                        with Model.use_device("cpu"):
-                            nlp_loaded = util.load_model_from_path(epoch_model_path)
-                            for name, component in nlp_loaded.pipeline:
-                                if hasattr(component, "cfg"):
-                                    component.cfg["beam_width"] = beam_width
-                            dev_docs = list(
-                                corpus.dev_docs(
-                                    nlp_loaded,
-                                    gold_preproc=gold_preproc,
-                                    ignore_misaligned=True,
+                        # Only evaluate on CPU in the first iteration (for
+                        # timing) if GPU is enabled
+                        if i == 0:
+                            with Model.use_device("cpu"):
+                                nlp_loaded = util.load_model_from_path(epoch_model_path)
+                                for name, component in nlp_loaded.pipeline:
+                                    if hasattr(component, "cfg"):
+                                        component.cfg["beam_width"] = beam_width
+                                dev_docs = list(
+                                    corpus.dev_docs(
+                                        nlp_loaded,
+                                        gold_preproc=gold_preproc,
+                                        ignore_misaligned=True,
+                                    )
                                )
-                            )
-                            start_time = timer()
-                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
-                            end_time = timer()
-                            cpu_wps = nwords / (end_time - start_time)
+                                start_time = timer()
+                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+                                end_time = timer()
+                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

@ -546,7 +551,11 @@ def train(
                        )
                        break
    except Exception as e:
-        msg.warn("Aborting and saving the final best model. Encountered exception: {}".format(e))
+        msg.warn(
+            "Aborting and saving the final best model. "
+            "Encountered exception: {}".format(e),
+            exits=1,
+        )
    finally:
        best_pipes = nlp.pipe_names
        if disabled_pipes:
@ -561,15 +570,25 @@ def train(
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
+            meta.setdefault("speed", {})
+            meta["speed"].setdefault("cpu", None)
+            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
-                speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
+                speed = _get_total_speed(
+                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
+                )
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
-                speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
+                speed = _get_total_speed(
+                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
+                )
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
-            if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None:
+            if (
+                final_meta["speed"]["cpu"] is None
+                and final_meta["speed"]["gpu"] is None
+            ):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
@ -661,6 +680,8 @@ def _find_best(experiment_dir, component):
        if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
            accs = srsly.read_json(epoch_model / "accuracy.json")
            scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
+            # remove per_type dicts from score list for max() comparison
+            scores = [score for score in scores if isinstance(score, float)]
            accuracies.append((scores, epoch_model))
    if accuracies:
        return max(accuracies)[1]
--- a/spacy/displacy/init.py
+++ b/spacy/displacy/init.py
@ -7,10 +7,12 @@ USAGE: https://spacy.io/usage/visualizers
 """
 from __future__ import unicode_literals

+import warnings
+
 from .render import DependencyRenderer, EntityRenderer
 from ..tokens import Doc, Span
 from ..compat import b_to_str
-from ..errors import Errors, Warnings, user_warning
+from ..errors import Errors, Warnings
 from ..util import is_in_jupyter


@ -89,7 +91,7 @@ def serve(
    from wsgiref import simple_server

    if is_in_jupyter():
-        user_warning(Warnings.W011)
+        warnings.warn(Warnings.W011)

    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
    httpd = simple_server.make_server(host, port, app)
@ -119,7 +121,7 @@ def parse_deps(orig_doc, options={}):
    """
    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
    if not doc.is_parsed:
-        user_warning(Warnings.W005)
+        warnings.warn(Warnings.W005)
    if options.get("collapse_phrases", False):
        with doc.retokenize() as retokenizer:
            for np in list(doc.noun_chunks):
@ -146,9 +148,14 @@ def parse_deps(orig_doc, options={}):
                retokenizer.merge(span, attrs=attrs)
    fine_grained = options.get("fine_grained")
    add_lemma = options.get("add_lemma")
-    words = [{"text": w.text,
-              "tag": w.tag_ if fine_grained else w.pos_,
-              "lemma": w.lemma_ if add_lemma else None} for w in doc]
+    words = [
+        {
+            "text": w.text,
+            "tag": w.tag_ if fine_grained else w.pos_,
+            "lemma": w.lemma_ if add_lemma else None,
+        }
+        for w in doc
+    ]

    arcs = []
    for word in doc:
@ -179,7 +186,7 @@ def parse_ents(doc, options={}):
        for ent in doc.ents
    ]
    if not ents:
-        user_warning(Warnings.W006)
+        warnings.warn(Warnings.W006)
    title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
    settings = get_doc_settings(doc)
    return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@ -3,7 +3,13 @@ from __future__ import unicode_literals

 import uuid

-from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS
+from .templates import (
+    TPL_DEP_SVG,
+    TPL_DEP_WORDS,
+    TPL_DEP_WORDS_LEMMA,
+    TPL_DEP_ARCS,
+    TPL_ENTS,
+)
 from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
 from ..util import minify_html, escape_html, registry
 from ..errors import Errors
@ -83,7 +89,10 @@ class DependencyRenderer(object):
        self.width = self.offset_x + len(words) * self.distance
        self.height = self.offset_y + 3 * self.word_spacing
        self.id = render_id
-        words = [self.render_word(w["text"], w["tag"],  w.get("lemma", None), i) for i, w in enumerate(words)]
+        words = [
+            self.render_word(w["text"], w["tag"], w.get("lemma", None), i)
+            for i, w in enumerate(words)
+        ]
        arcs = [
            self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
            for i, a in enumerate(arcs)
@ -101,7 +110,9 @@ class DependencyRenderer(object):
            lang=self.lang,
        )

-    def render_word(self, text, tag, lemma, i,):
+    def render_word(
+        self, text, tag, lemma, i,
+    ):
        """Render individual word.

        text (unicode): Word text.
@ -115,7 +126,9 @@ class DependencyRenderer(object):
            x = self.width - x
        html_text = escape_html(text)
        if lemma is not None:
-            return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y)
+            return TPL_DEP_WORDS_LEMMA.format(
+                text=html_text, tag=tag, lemma=lemma, x=x, y=y
+            )
        return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)

    def render_arrow(self, label, start, end, direction, i):
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -1,11 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import os
-import warnings
-import inspect
-
-
 def add_codes(err_cls):
    """Add error codes to string messages via class attribute names."""

@ -93,8 +88,7 @@ class Warnings(object):
    W022 = ("Training a new part-of-speech tagger using a model with no "
            "lemmatization rules or data. This means that the trained model "
            "may not be able to lemmatize correctly. If this is intentional "
            "or the language you're using doesn't have lemmatization data, "
-            "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
+            "you can ignore this warning. "
            "If this is surprising, make sure you have the spacy-lookups-data "
            "package installed.")
    W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
@ -110,7 +104,8 @@ class Warnings(object):
    W028 = ("Doc.from_array was called with a vector of type '{type}', "
            "but is expecting one of type 'uint64' instead. This may result "
            "in problems with the vocab further on in the pipeline.")
-
+    W029 = ("Unable to align tokens with entities from character offsets. "
+            "Discarding entity annotation for the text: {text}.")


@add_codes
@ -552,6 +547,14 @@ class Errors(object):
            "array.")
    E191 = ("Invalid head: the head token must be from the same doc as the "
            "token itself.")
+    E192 = ("Unable to resize vectors in place with cupy.")
+    E193 = ("Unable to resize vectors in place if the resized vector dimension "
+            "({new_dim}) is not the same as the current vector dimension "
+            "({curr_dim}).")
+    E194 = ("Unable to align mismatched text '{text}' and words '{words}'.")
+    E195 = ("Matcher can be called on {good} only, got {got}.")
+    E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
+            "only be fixed with token.is_sent_start.")


@add_codes
@ -586,64 +589,3 @@ class MatchPatternError(ValueError):

 class AlignmentError(ValueError):
    pass
-
-
-class ModelsWarning(UserWarning):
-    pass
-
-
-WARNINGS = {
-    "user": UserWarning,
-    "deprecation": DeprecationWarning,
-    "models": ModelsWarning,
-}
-
-
-def _get_warn_types(arg):
-    if arg == "":  # don't show any warnings
-        return []
-    if not arg or arg == "all":  # show all available warnings
-        return WARNINGS.keys()
-    return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS]
-
-
-def _get_warn_excl(arg):
-    if not arg:
-        return []
-    return [w_id.strip() for w_id in arg.split(",")]
-
-
-SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER")
-SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES"))
-SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE"))
-
-
-def user_warning(message):
-    _warn(message, "user")
-
-
-def deprecation_warning(message):
-    _warn(message, "deprecation")
-
-
-def models_warning(message):
-    _warn(message, "models")
-
-
-def _warn(message, warn_type="user"):
-    """
-    message (unicode): The message to display.
-    category (Warning): The Warning to show.
-    """
-    if message.startswith("["):
-        w_id = message.split("[", 1)[1].split("]", 1)[0]  # get ID from string
-    else:
-        w_id = None
-    ignore_warning = w_id and w_id in SPACY_WARNING_IGNORE
-    if warn_type in SPACY_WARNING_TYPES and not ignore_warning:
-        category = WARNINGS[warn_type]
-        stack = inspect.stack()[-1]
-        with warnings.catch_warnings():
-            if SPACY_WARNING_FILTER:
-                warnings.simplefilter(SPACY_WARNING_FILTER, category)
-            warnings.warn_explicit(message, category, stack[1], stack[2])
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -10,10 +10,11 @@ import shutil
 import itertools
 from pathlib import Path
 import srsly
+import warnings

 from .syntax import nonproj
 from .tokens import Doc, Span
-from .errors import Errors, AlignmentError, user_warning, Warnings
+from .errors import Errors, AlignmentError, Warnings
 from .compat import path2str
 from . import util
 from .util import minibatch, itershuffle
@ -21,7 +22,6 @@ from .util import minibatch, itershuffle
 from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek


-USE_NEW_ALIGN = False
 punct_re = re.compile(r"\W")


@ -73,57 +73,8 @@ def merge_sents(sents):
    return [(m_deps, (m_cats, m_brackets))]


-_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
-
-
 def _normalize_for_alignment(tokens):
-    tokens = [w.replace(" ", "").lower() for w in tokens]
-    output = []
-    for token in tokens:
-        token = token.replace(" ", "").lower()
-        for before, after in _ALIGNMENT_NORM_MAP:
-            token = token.replace(before, after)
-        output.append(token)
-    return output
-
-
-def _align_before_v2_2_2(tokens_a, tokens_b):
-    """Calculate alignment tables between two tokenizations, using the Levenshtein
-    algorithm. The alignment is case-insensitive.
-
-    tokens_a (List[str]): The candidate tokenization.
-    tokens_b (List[str]): The reference tokenization.
-    RETURNS: (tuple): A 5-tuple consisting of the following information:
-      * cost (int): The number of misaligned tokens.
-      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
-        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
-        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
-        it has the value -1.
-      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
-      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
-        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
-        the same token of `tokens_b`.
-      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
-            direction.
-    """
-    from . import _align
-    if tokens_a == tokens_b:
-        alignment = numpy.arange(len(tokens_a))
-        return 0, alignment, alignment, {}, {}
-    tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
-    tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
-    cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
-    i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
-                                                        [len(w) for w in tokens_b])
-    for i, j in list(i2j_multi.items()):
-        if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
-            i2j[i] = j
-            i2j_multi.pop(i)
-    for j, i in list(j2i_multi.items()):
-        if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
-            j2i[j] = i
-            j2i_multi.pop(j)
-    return cost, i2j, j2i, i2j_multi, j2i_multi
+    return [w.replace(" ", "").lower() for w in tokens]


 def align(tokens_a, tokens_b):
@ -144,8 +95,6 @@ def align(tokens_a, tokens_b):
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
            direction.
    """
-    if not USE_NEW_ALIGN:
-        return _align_before_v2_2_2(tokens_a, tokens_b)
    tokens_a = _normalize_for_alignment(tokens_a)
    tokens_b = _normalize_for_alignment(tokens_b)
    cost = 0
@ -382,6 +331,8 @@ class GoldCorpus(object):
 def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
    if random.random() >= orth_variant_level:
        return raw, paragraph_tuples
+    raw_orig = str(raw)
+    lower = False
    if random.random() >= 0.5:
        lower = True
        if raw is not None:
@ -442,8 +393,11 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
            ids, words, tags, heads, labels, ner = sent_tuples
            for word in words:
                match_found = False
+                # skip whitespace words
+                if word.isspace():
+                    match_found = True
                # add identical word
-                if word not in variants and raw[raw_idx:].startswith(word):
+                elif word not in variants and raw[raw_idx:].startswith(word):
                    variant_raw += word
                    raw_idx += len(word)
                    match_found = True
@ -458,7 +412,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
                # something went wrong, abort
                # (add a warning message?)
                if not match_found:
-                    return raw, paragraph_tuples
+                    return raw_orig, paragraph_tuples
                # add following whitespace
                while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
                    variant_raw += raw[raw_idx]
@ -560,7 +514,7 @@ def _json_iterate(loc):
        py_raw = file_.read()
    cdef long file_length = len(py_raw)
    if file_length > 2 ** 30:
-        user_warning(Warnings.W027.format(size=file_length))
+        warnings.warn(Warnings.W027.format(size=file_length))

    raw = <char*>py_raw
    cdef int square_depth = 0
@ -700,6 +654,9 @@ cdef class GoldParse:
        # if self.length > 0, this is modified later.
        self.orig_annot = []

+        # temporary doc for aligning entity annotation
+        entdoc = None
+
        # avoid allocating memory if the doc does not contain any tokens
        if self.length > 0:
            if words is None:
@ -722,7 +679,25 @@ cdef class GoldParse:
                entities = [(ent if ent is not None else "-") for ent in entities]
                if not isinstance(entities[0], basestring):
                    # Assume we have entities specified by character offset.
-                    entities = biluo_tags_from_offsets(doc, entities)
+                    # Create a temporary Doc corresponding to provided words
+                    # (to preserve gold tokenization) and text (to preserve
+                    # character offsets).
+                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
+                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
+                    entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
+                    # There may be some additional whitespace tokens in the
+                    # temporary doc, so check that the annotations align with
+                    # the provided words while building a list of BILUO labels.
+                    entities = []
+                    words_offset = 0
+                    for i in range(len(entdoc_words)):
+                        if words[i + words_offset] == entdoc_words[i]:
+                            entities.append(entdoc_entities[i])
+                        else:
+                            words_offset -= 1
+                    if len(entities) != len(words):
+                        warnings.warn(Warnings.W029.format(text=doc.text))
+                        entities = ["-" for _ in words]

            # These are filled by the tagger/parser/entity recogniser
            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
@ -749,7 +724,8 @@ cdef class GoldParse:
            # If we under-segment, we'll have one predicted word that covers a
            # sequence of gold words.
            # If we "mis-segment", we'll have a sequence of predicted words covering
-            # a sequence of gold words. That's many-to-many -- we don't do that.
+            # a sequence of gold words. That's many-to-many -- we don't do that
+            # except for NER spans where the start and end can be aligned.
            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)

            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
@ -772,7 +748,6 @@ cdef class GoldParse:
                        self.tags[i] = tags[i2j_multi[i]]
                        self.morphology[i] = morphology[i2j_multi[i]]
                        is_last = i2j_multi[i] != i2j_multi.get(i+1)
-                        is_first = i2j_multi[i] != i2j_multi.get(i-1)
                        # Set next word in multi-token span as head, until last
                        if not is_last:
                            self.heads[i] = i+1
@ -782,30 +757,10 @@ cdef class GoldParse:
                            if head_i:
                                self.heads[i] = self.gold_to_cand[head_i]
                            self.labels[i] = deps[i2j_multi[i]]
-                        # Now set NER...This is annoying because if we've split
-                        # got an entity word split into two, we need to adjust the
-                        # BILUO tags. We can't have BB or LL etc.
-                        # Case 1: O -- easy.
                        ner_tag = entities[i2j_multi[i]]
-                        if ner_tag == "O":
-                            self.ner[i] = "O"
-                        # Case 2: U. This has to become a B I* L sequence.
-                        elif ner_tag.startswith("U-"):
-                            if is_first:
-                                self.ner[i] = ner_tag.replace("U-", "B-", 1)
-                            elif is_last:
-                                self.ner[i] = ner_tag.replace("U-", "L-", 1)
-                            else:
-                                self.ner[i] = ner_tag.replace("U-", "I-", 1)
-                        # Case 3: L. If not last, change to I.
-                        elif ner_tag.startswith("L-"):
-                            if is_last:
-                                self.ner[i] = ner_tag
-                            else:
-                                self.ner[i] = ner_tag.replace("L-", "I-", 1)
-                        # Case 4: I. Stays correct
-                        elif ner_tag.startswith("I-"):
-                            self.ner[i] = ner_tag
+                        # Assign O/- for many-to-one O/- NER tags
+                        if ner_tag in ("O", "-"):
+                             self.ner[i] = ner_tag
                else:
                    self.words[i] = words[gold_i]
                    self.tags[i] = tags[gold_i]
@ -816,6 +771,39 @@ cdef class GoldParse:
                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
                    self.labels[i] = deps[gold_i]
                    self.ner[i] = entities[gold_i]
+            # Assign O/- for one-to-many O/- NER tags
+            for j, cand_j in enumerate(self.gold_to_cand):
+                if cand_j is None:
+                    if j in j2i_multi:
+                        i = j2i_multi[j]
+                        ner_tag = entities[j]
+                        if ner_tag in ("O", "-"):
+                            self.ner[i] = ner_tag
+
+            # If there is entity annotation and some tokens remain unaligned,
+            # align all entities at the character level to account for all
+            # possible token misalignments within the entity spans
+            if any([e not in ("O", "-") for e in entities]) and None in self.ner:
+                # If the temporary entdoc wasn't created above, initialize it
+                if not entdoc:
+                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
+                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
+                # Get offsets based on gold words and BILUO entities
+                entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
+                aligned_offsets = []
+                aligned_spans = []
+                # Filter offsets to identify those that align with doc tokens
+                for offset in entdoc_offsets:
+                    span = doc.char_span(offset[0], offset[1])
+                    if span and not span.text.isspace():
+                        aligned_offsets.append(offset)
+                        aligned_spans.append(span)
+                # Convert back to BILUO for doc tokens and assign NER for all
+                # aligned spans
+                biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
+                for span in aligned_spans:
+                    for i in range(span.start, span.end):
+                        self.ner[i] = biluo_tags[i]

            # Prevent whitespace that isn't within entities from being tagged as
            # an entity.
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@ -1,7 +1,9 @@
 # cython: infer_types=True
 # cython: profile=True
 # coding: utf8
-from spacy.errors import Errors, Warnings, user_warning
+import warnings
+
+from spacy.errors import Errors, Warnings

 from pathlib import Path
 from cymem.cymem cimport Pool
@ -115,7 +117,7 @@ cdef class KnowledgeBase:

        # Return if this entity was added before
        if entity_hash in self._entry_index:
-            user_warning(Warnings.W018.format(entity=entity))
+            warnings.warn(Warnings.W018.format(entity=entity))
            return

        # Raise an error if the provided entity vector is not of the correct length
@ -147,7 +149,7 @@ cdef class KnowledgeBase:
            # only process this entity if its unique ID hadn't been added before
            entity_hash = self.vocab.strings.add(entity_list[i])
            if entity_hash in self._entry_index:
-                user_warning(Warnings.W018.format(entity=entity_list[i]))
+                warnings.warn(Warnings.W018.format(entity=entity_list[i]))

            else:
                entity_vector = vector_list[i]
@ -195,7 +197,7 @@ cdef class KnowledgeBase:

        # Check whether this alias was added before
        if alias_hash in self._alias_index:
-            user_warning(Warnings.W017.format(alias=alias))
+            warnings.warn(Warnings.W017.format(alias=alias))
            return

        cdef vector[int64_t] entry_indices
@ -252,7 +254,7 @@ cdef class KnowledgeBase:

        if is_present:
            if not ignore_warnings:
-                user_warning(Warnings.W024.format(entity=entity, alias=alias))
+                warnings.warn(Warnings.W024.format(entity=entity, alias=alias))
        else:
            entry_indices.push_back(int(entry_index))
            alias_entry.entry_indices = entry_indices
--- a/spacy/lang/da/examples.py
+++ b/spacy/lang/da/examples.py
@ -9,10 +9,13 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
-    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
-    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
-    "San Francisco overvejer at forbyde udbringningsrobotter på fortov",
-    "London er en stor by i Storbritannien",
+    "Apple overvejer at købe et britisk startup for 1 milliard dollar.",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne.",
+    "San Francisco overvejer at forbyde udbringningsrobotter på fortovet.",
+    "London er en storby i Storbritannien.",
+    "Hvor er du?",
+    "Hvem er Frankrigs præsident?",
+    "Hvad er hovedstaden i USA?",
+    "Hvornår blev Barack Obama født?",
 ]
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@ -70,6 +70,7 @@ for orth in [
    "A/S",
    "B.C.",
    "BK.",
+    "B.T.",
    "Dr.",
    "Boul.",
    "Chr.",
@ -79,6 +80,7 @@ for orth in [
    "Hf.",
    "i/s",
    "I/S",
+    "Inc.",
    "Kprs.",
    "L.A.",
    "Ll.",
@ -149,6 +151,7 @@ for orth in [
    "bygn.",
    "c/o",
    "ca.",
+    "cm.",
    "cand.",
    "d.d.",
    "d.m.",
@ -172,10 +175,12 @@ for orth in [
    "dl.",
    "do.",
    "dobb.",
+    "dr.",
    "dr.h.c",
    "dr.phil.",
    "ds.",
    "dvs.",
+    "d.v.s.",
    "e.b.",
    "e.l.",
    "e.o.",
@ -297,10 +302,14 @@ for orth in [
    "kap.",
    "kbh.",
    "kem.",
+    "kg.",
+    "kgs.",
    "kgl.",
    "kl.",
    "kld.",
+    "km.",
    "km/t",
+    "km/t.",
    "knsp.",
    "komm.",
    "kons.",
@ -311,6 +320,7 @@ for orth in [
    "kt.",
    "ktr.",
    "kv.",
+    "kvm.",
    "kvt.",
    "l.c.",
    "lab.",
@ -357,6 +367,7 @@ for orth in [
    "nto.",
    "nuv.",
    "o/m",
+    "o/m.",
    "o.a.",
    "o.fl.",
    "o.h.",
@ -526,6 +537,7 @@ for orth in [
    "vejl.",
    "vh.",
    "vha.",
+    "vind.",
    "vs.",
    "vsa.",
    "vær.",
--- a/spacy/lang/de/punctuation.py
+++ b/spacy/lang/de/punctuation.py
@ -2,12 +2,12 @@
 from __future__ import unicode_literals

 from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT
+from ..char_classes import CURRENCY, UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-from ..punctuation import _prefixes, _suffixes
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES


-_prefixes = ["``",] + list(_prefixes)
+_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES

 _suffixes = (
    ["''", "/"]
--- a/spacy/lang/es/init.py
+++ b/spacy/lang/es/init.py
@ -6,6 +6,7 @@ from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@ -23,6 +24,8 @@ class SpanishDefaults(Language.Defaults):
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS

--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@ -26,6 +26,15 @@ _num_words = [
    "dieciocho",
    "diecinueve",
    "veinte",
+    "veintiuno",
+    "veintidós",
+    "veintitrés",
+    "veinticuatro",
+    "veinticinco",
+    "veintiséis",
+    "veintisiete",
+    "veintiocho",
+    "veintinueve",
    "treinta",
    "cuarenta",
    "cincuenta",
--- a/spacy/lang/es/punctuation.py
+++ b/spacy/lang/es/punctuation.py
@ -0,0 +1,48 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
+from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import merge_chars
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
+
+
+# Unit symbols taken from LIST_UNITS with the percent sign filtered out.
+_list_units = [u for u in LIST_UNITS if u != "%"]
+# presumably merge_chars turns the space-separated symbols into a regex
+# alternation -- confirm against char_classes.merge_chars
+_units = merge_chars(" ".join(_list_units))
+# CONCAT_QUOTES extended with the em dash and the en dash.
+_concat_quotes = CONCAT_QUOTES + "—–"
+
+
+# Suffix rules: the two dashes, the shared punctuation / ellipsis / quote /
+# icon lists, then regexes that use a lookbehind on the preceding text.
+_suffixes = (
+    ["—", "–"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        # "+" directly after a digit
+        r"(?<=[0-9])\+",
+        # "." after a degree sign followed by one of F, C, K (either case)
+        r"(?<=°[FfCcKk])\.",
+        # a CURRENCY match directly after a digit
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        # one of the reduced unit symbols (no "%") directly after a digit
+        r"(?<=[0-9])(?:{u})".format(u=_units),
+        # "." preceded by one character of a class assembled from digits,
+        # ALPHA_LOWER, the characters % ² - +, PUNCT and the quote/dash set.
+        # NOTE(review): "(?:" and ")" sit inside the square brackets, so they
+        # are literal class members rather than a group -- confirm this
+        # matches the intent of the base suffix rules.
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=_concat_quotes, p=PUNCT
+        ),
+        # "." after two consecutive ALPHA_UPPER characters
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
+
+# Infix rules: ellipses and icons, then regexes bounded by a lookbehind and
+# a lookahead on the neighbouring characters.
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        # "+", "*" or "^" between a digit and a digit or hyphen
+        r"(?<=[0-9])[+\*^](?=[0-9-])",
+        # "." between an ALPHA_LOWER-or-quote/dash character and an
+        # ALPHA_UPPER-or-quote/dash character
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=_concat_quotes
+        ),
+        # "," between two ALPHA characters
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        # one of : < > = / after an ALPHA character or digit and before an
+        # ALPHA character
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+# Public names imported by the Spanish language defaults.
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@ -43,14 +43,16 @@ for orth in [
    "Av.",
    "Avda.",
    "Cía.",
+    "EE.UU.",
    "etc.",
+    "fig.",
    "Gob.",
    "Gral.",
    "Ing.",
    "J.C.",
+    "km/h",
    "Lic.",
    "m.n.",
-    "no.",
    "núm.",
    "P.D.",
    "Prof.",
--- a/spacy/lang/eu/examples.py
+++ b/spacy/lang/eu/examples.py
@ -10,5 +10,5 @@ Example sentences to test spaCy and its language models.

 sentences = [
    "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
-    "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira"
+    "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira",
 ]
--- a/spacy/lang/eu/lex_attrs.py
+++ b/spacy/lang/eu/lex_attrs.py
@ -59,7 +59,6 @@ behin
 """.split()


-
 def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
--- a/spacy/lang/eu/stop_words.py
+++ b/spacy/lang/eu/stop_words.py
@ -5,7 +5,7 @@ from __future__ import unicode_literals
 # https://www.ranks.nl/stopwords/basque
 # https://www.mustgo.com/worldlanguages/basque/
 STOP_WORDS = set(
-"""
+    """
 al
 anitz
 arabera
--- a/spacy/lang/fr/init.py
+++ b/spacy/lang/fr/init.py
@ -2,7 +2,8 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@ -27,6 +28,7 @@ class FrenchDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    token_match = TOKEN_MATCH
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@ -1,15 +1,26 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..punctuation import TOKENIZER_INFIXES
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
 from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import merge_chars


-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
-HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "")
+ELISION = "' ’".replace(" ", "")
+HYPHENS = r"- – — ‐ ‑".replace(" ", "")
+_prefixes_elision = "d l n"
+_prefixes_elision += " " + _prefixes_elision.upper()
+_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
+_hyphen_suffixes += " " + _hyphen_suffixes.upper()


+_prefixes = TOKENIZER_PREFIXES + [
+    r"(?:({pe})[{el}])(?=[{a}])".format(
+        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
+    )
+]
+
 _suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
@ -17,7 +28,6 @@ _suffixes = (
    + [
        r"(?<=[0-9])\+",
        r"(?<=°[FfCcKk])\.",  # °C. -> ["°C", "."]
-        r"(?<=[0-9])°[FfCcKk]",  # 4°C -> ["4", "°C"]
        r"(?<=[0-9])%",  # 4% -> ["4", "%"]
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
@ -25,14 +35,17 @@ _suffixes = (
            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+        r"(?<=[{a}])[{h}]({hs})".format(
+            a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
+        ),
    ]
 )

-
 _infixes = TOKENIZER_INFIXES + [
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
 ]


+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@ -6,7 +6,7 @@ import re
 from .punctuation import ELISION, HYPHENS
 from ..tokenizer_exceptions import URL_PATTERN
 from ..char_classes import ALPHA_LOWER, ALPHA
-from ...symbols import ORTH, LEMMA, TAG
+from ...symbols import ORTH, LEMMA

 # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
 # from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
@ -56,7 +56,28 @@ for exc_data in [
    _exc[exc_data[ORTH]] = [exc_data]


-for orth in ["etc."]:
+for orth in [
+    "après-midi",
+    "au-delà",
+    "au-dessus",
+    "celle-ci",
+    "celles-ci",
+    "celui-ci",
+    "cf.",
+    "ci-dessous",
+    "elle-même",
+    "en-dessous",
+    "etc.",
+    "jusque-là",
+    "lui-même",
+    "MM.",
+    "No.",
+    "peut-être",
+    "pp.",
+    "quelques-uns",
+    "rendez-vous",
+    "Vol.",
+]:
    _exc[orth] = [{ORTH: orth}]


@ -72,7 +93,7 @@ for verb, verb_lemma in [
        for pronoun in ["elle", "il", "on"]:
            token = "{}-t-{}".format(orth, pronoun)
            _exc[token] = [
-                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+                {LEMMA: verb_lemma, ORTH: orth},  # , TAG: "VERB"},
                {LEMMA: "t", ORTH: "-t"},
                {LEMMA: pronoun, ORTH: "-" + pronoun},
            ]
@ -81,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]:
    for orth in [verb, verb.title()]:
        token = "{}-ce".format(orth)
        _exc[token] = [
-            {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+            {LEMMA: verb_lemma, ORTH: orth},  # , TAG: "VERB"},
            {LEMMA: "ce", ORTH: "-ce"},
        ]

@ -89,12 +110,29 @@ for verb, verb_lemma in [("est", "être")]:
 for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
    for orth in [pre, pre.title()]:
        _exc["%sest-ce" % orth] = [
-            {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
-            {LEMMA: "être", ORTH: "est", TAG: "VERB"},
+            {LEMMA: pre_lemma, ORTH: orth},
+            {LEMMA: "être", ORTH: "est"},
            {LEMMA: "ce", ORTH: "-ce"},
        ]


+# Inverted question form "est-il" (lower- and upper-case): split into the
+# verb and a "-pronoun" token.
+# NOTE(review): the pronoun lemma is taken verbatim from the tuple, so the
+# upper-case entry gets lemma "IL" while the verb lemma is always "être" --
+# confirm whether a lower-cased lemma is intended.
+for verb, pronoun in [("est", "il"), ("EST", "IL")]:
+    token = "{}-{}".format(verb, pronoun)
+    _exc[token] = [
+        {LEMMA: "être", ORTH: verb},
+        {LEMMA: pronoun, ORTH: "-" + pronoun},
+    ]
+
+
+# Reflexive variant "s'est-il": three tokens -- the elided "s'" (lemma "se"),
+# the verb, and the "-pronoun" token. The same lemma caveat as above applies
+# to the upper-case entry.
+for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
+    token = "{}'{}-{}".format(s, verb, pronoun)
+    _exc[token] = [
+        {LEMMA: "se", ORTH: s + "'"},
+        {LEMMA: "être", ORTH: verb},
+        {LEMMA: pronoun, ORTH: "-" + pronoun},
+    ]
+
+
 _infixes_exc = []
 orig_elision = "'"
 orig_hyphen = "-"
@ -423,5 +461,5 @@ _regular_exp.append(URL_PATTERN)

 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
-    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
+        "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
 ).match
--- a/spacy/lang/gu/init.py
+++ b/spacy/lang/gu/init.py
@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+
+from ...language import Language
+
+
+class GujaratiDefaults(Language.Defaults):
+    # Only the stop-word list is overridden here; every other default is
+    # inherited from Language.Defaults.
+    stop_words = STOP_WORDS
+
+
+class Gujarati(Language):
+    # Language subclass with the code "gu", configured by GujaratiDefaults.
+    lang = "gu"
+    Defaults = GujaratiDefaults
+
+
+__all__ = ["Gujarati"]
--- a/spacy/lang/gu/examples.py
+++ b/spacy/lang/gu/examples.py
@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.gu.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.",
+    "તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું",
+    "કર્ણદેવ પહેલો સોલંકી વંશનો રાજા હતો",
+    "તેજપાળને બે પત્ની હતી",
+    "ગુજરાતમાં ભારતીય જનતા પક્ષનો ઉદય આ સમયગાળા દરમિયાન થયો",
+    "આંદોલનકારીઓએ ચીમનભાઇ પટેલના રાજીનામાની માંગણી કરી.",
+    "અહિયાં શું જોડાય છે?",
+    "મંદિરનો પૂર્વાભિમુખ ભાગ નાના મંડપ સાથે થોડો લંબચોરસ આકારનો છે.",
+]
--- a/spacy/lang/gu/stop_words.py
+++ b/spacy/lang/gu/stop_words.py
@ -0,0 +1,91 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+STOP_WORDS = set(
+    """
+એમ 
+આ
+એ
+રહી
+છે
+છો
+હતા
+હતું
+હતી
+હોય
+હતો
+શકે
+તે
+તેના
+તેનું
+તેને
+તેની
+તેઓ
+તેમને
+તેમના
+તેમણે
+તેમનું 
+તેમાં
+અને
+અહીં
+થી
+થઈ
+થાય
+જે
+ ને
+કે 
+ના
+ની
+નો
+ને
+નું 
+શું
+માં
+પણ
+પર
+જેવા
+જેવું
+જાય
+જેમ
+જેથી
+માત્ર
+માટે
+પરથી
+આવ્યું
+એવી
+આવી
+રીતે
+સુધી
+થાય
+થઈ
+સાથે
+લાગે
+હોવા
+છતાં
+રહેલા
+કરી
+કરે
+કેટલા
+કોઈ
+કેમ
+કર્યો
+કર્યુ 
+કરે
+સૌથી
+ત્યારબાદ 
+તથા
+દ્વારા 
+જુઓ
+જાઓ
+જ્યારે
+ત્યારે
+શકો
+નથી
+હવે
+અથવા
+થતો
+દર
+એટલો
+પરંતુ
+""".split()
+)
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@ -0,0 +1,25 @@
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from .tag_map import TAG_MAP
+
+
+from ...attrs import LANG
+from ...language import Language
+from ...tokens import Doc
+
+
+class ArmenianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: "hy"
+
+    lex_attr_getters.update(LEX_ATTRS)
+    stop_words = STOP_WORDS
+    tag_map = TAG_MAP
+
+
+class Armenian(Language):
+    lang = "hy"
+    Defaults = ArmenianDefaults
+
+
+__all__ = ["Armenian"]
--- a/spacy/lang/hy/examples.py
+++ b/spacy/lang/hy/examples.py
@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.hy.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
+    "Ո՞վ է Ֆրանսիայի նախագահը։",
+    "Որն է Միացյալ Նահանգների մայրաքաղաքը։",
+    "Ե՞րբ է ծնվել Բարաք Օբաման։",
+]
--- a/spacy/lang/hy/lex_attrs.py
+++ b/spacy/lang/hy/lex_attrs.py
@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = [
+    "զրօ",
+    "մէկ",
+    "երկու",
+    "երեք",
+    "չորս",
+    "հինգ",
+    "վեց",
+    "յոթ",
+    "ութ",
+    "ինը",
+    "տասը",
+    "տասնմեկ",
+    "տասներկու",
+    "տասներեք",
+    "տասնչորս",
+    "տասնհինգ",
+    "տասնվեց",
+    "տասնյոթ",
+    "տասնութ",
+    "տասնինը",
+    "քսան", "երեսուն",
+    "քառասուն",
+    "հիսուն",
+    "վաթսուն",
+    "յոթանասուն",
+    "ութսուն",
+    "իննսուն",
+    "հարյուր",
+    "հազար",
+    "միլիոն",
+    "միլիարդ",
+    "տրիլիոն",
+    "քվինտիլիոն",
+]
+
+
+def like_num(text):
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text.lower() in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/hy/stop_words.py
+++ b/spacy/lang/hy/stop_words.py
@ -0,0 +1,110 @@
+from __future__ import unicode_literals
+
+
+STOP_WORDS = set(
+    """
+նա
+ողջը
+այստեղ
+ենք
+նա
+էիր
+որպես
+ուրիշ
+բոլորը
+այն
+այլ
+նույնչափ
+էի
+մի
+և
+ողջ
+ես
+ոմն
+հետ
+նրանք
+ամենքը
+ըստ
+ինչ-ինչ
+այսպես
+համայն
+մի
+նաև
+նույնքան
+դա
+ովևէ
+համար
+այնտեղ
+էին
+որոնք
+սույն
+ինչ-որ
+ամենը
+նույնպիսի
+ու
+իր
+որոշ
+միևնույն
+ի
+այնպիսի
+մենք
+ամեն ոք
+նույն
+երբևէ
+այն
+որևէ
+ին
+այդպես
+նրա
+որը
+վրա
+դու
+էինք
+այդպիսի
+էիք
+յուրաքանչյուրը
+եմ
+պիտի
+այդ
+ամբողջը
+հետո
+եք
+ամեն
+այլ
+կամ
+այսքան
+որ
+այնպես
+այսինչ
+բոլոր
+է
+մեկնումեկը
+այդչափ
+այնքան
+ամբողջ
+երբևիցե
+այնչափ
+ամենայն
+մյուս
+այնինչ
+իսկ
+այդտեղ
+այս
+սա
+են
+ամեն ինչ
+որևիցե
+ում
+մեկը
+այդ
+դուք
+այսչափ
+այդքան
+այսպիսի
+էր
+յուրաքանչյուր
+այս
+մեջ
+թ	
+""".split()
+)
--- a/spacy/lang/hy/tag_map.py
+++ b/spacy/lang/hy/tag_map.py
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@ -4,7 +4,7 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@ -22,6 +22,7 @@ class ItalianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
+    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES


--- a/spacy/lang/it/punctuation.py
+++ b/spacy/lang/it/punctuation.py
@ -1,15 +1,32 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER


-ELISION = " ' ’ ".strip().replace(" ", "")
+ELISION = "'’"


-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_prefixes = [r"'[0-9][0-9]", r"[0-9]+°"] + BASE_TOKENIZER_PREFIXES

+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
+        r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
+    ]
+)
+
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/it/tokenizer_exceptions.py
+++ b/spacy/lang/it/tokenizer_exceptions.py
@ -2,6 +2,56 @@
 from __future__ import unicode_literals
 from ...symbols import ORTH, LEMMA

-_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
+_exc = {
+    "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
+    "dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}],
+    "dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}],
+    "L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
+    "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
+    "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
+    "po'": [{ORTH: "po'", LEMMA: "poco"}],
+    "sett..": [{ORTH: "sett."}, {ORTH: "."}],
+}
+
+for orth in [
+    "..",
+    "....",
+    "al.",
+    "all-path",
+    "art.",
+    "Art.",
+    "artt.",
+    "att.",
+    "by-pass",
+    "c.d.",
+    "centro-sinistra",
+    "check-up",
+    "Civ.",
+    "cm.",
+    "Cod.",
+    "col.",
+    "Cost.",
+    "d.C.",
+    'de"',
+    "distr.",
+    "E'",
+    "ecc.",
+    "e-mail",
+    "e/o",
+    "etc.",
+    "Jr.",
+    "n°",
+    "nord-est",
+    "pag.",
+    "Proc.",
+    "prof.",
+    "sett.",
+    "s.p.a.",
+    "ss.",
+    "St.",
+    "tel.",
+    "week-end",
+]:
+    _exc[orth] = [{ORTH: orth}]

 TOKENIZER_EXCEPTIONS = _exc
--- a/spacy/lang/kn/examples.py
+++ b/spacy/lang/kn/examples.py
@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.kn.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "ಆಪಲ್ ಒಂದು ಯು.ಕೆ. ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.",
+    "ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.",
+    "ಕಾಲುದಾರಿ ವಿತರಣಾ ರೋಬೋಟ್‌ಗಳನ್ನು ನಿಷೇಧಿಸುವುದನ್ನು ಸ್ಯಾನ್ ಫ್ರಾನ್ಸಿಸ್ಕೊ ಪರಿಗಣಿಸುತ್ತದೆ.",
+    "ಲಂಡನ್ ಯುನೈಟೆಡ್ ಕಿಂಗ್‌ಡಂನ ದೊಡ್ಡ ನಗರ.",
+    "ನೀನು ಎಲ್ಲಿದಿಯಾ?",
+    "ಫ್ರಾನ್ಸಾದ ಅಧ್ಯಕ್ಷರು ಯಾರು?",
+    "ಯುನೈಟೆಡ್ ಸ್ಟೇಟ್ಸ್ನ ರಾಜಧಾನಿ ಯಾವುದು?",
+    "ಬರಾಕ್ ಒಬಾಮ ಯಾವಾಗ ಜನಿಸಿದರು?",
+]
--- a/spacy/lang/lij/__init__.py
+++ b/spacy/lang/lij/__init__.py
@ -0,0 +1,31 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups
+
+
+class LigurianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: "lij"
+    lex_attr_getters[NORM] = add_lookups(
+        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
+    )
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = STOP_WORDS
+    infixes = TOKENIZER_INFIXES
+
+
+class Ligurian(Language):
+    lang = "lij"
+    Defaults = LigurianDefaults
+
+
+__all__ = ["Ligurian"]
--- a/spacy/lang/lij/examples.py
+++ b/spacy/lang/lij/examples.py
@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.lij.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Sciusciâ e sciorbî no se peu.",
+    "Graçie di çetroin, che me son arrivæ.",
+    "Vegnime apreuvo, che ve fasso pescâ di òmmi.",
+    "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
+]
--- a/spacy/lang/lij/punctuation.py
+++ b/spacy/lang/lij/punctuation.py
@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_INFIXES
+from ..char_classes import ALPHA
+
+
+ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
+
+
+_infixes = TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+]
+
+TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/lij/stop_words.py
+++ b/spacy/lang/lij/stop_words.py
@ -0,0 +1,43 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+STOP_WORDS = set(
+    """
+a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
+
+bella belle belli bello ben
+
+ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
+
+d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
+
+é e ê ea ean emmo en ëse
+
+fin fiña
+
+gh' ghe guæei
+
+i î in insemme int' inta inte inti into
+
+l' lê lì lô
+
+m' ma manco me megio meno mezo mi
+
+na n' ne ni ninte nisciun nisciuña no
+
+o ò ô oua
+
+parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio
+
+quæ quand' quande quarche quella quelle quelli quello
+
+s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
+
+tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
+
+un uña unn' unna
+
+za zu
+""".split()
+)
--- a/spacy/lang/lij/tokenizer_exceptions.py
+++ b/spacy/lang/lij/tokenizer_exceptions.py
@ -0,0 +1,52 @@
+# coding: utf8
+from __future__ import unicode_literals
+from ...symbols import ORTH, LEMMA
+
+_exc = {}
+
+for raw, lemma in [
+    ("a-a", "a-o"),
+    ("a-e", "a-o"),
+    ("a-o", "a-o"),
+    ("a-i", "a-o"),
+    ("co-a", "co-o"),
+    ("co-e", "co-o"),
+    ("co-i", "co-o"),
+    ("co-o", "co-o"),
+    ("da-a", "da-o"),
+    ("da-e", "da-o"),
+    ("da-i", "da-o"),
+    ("da-o", "da-o"),
+    ("pe-a", "pe-o"),
+    ("pe-e", "pe-o"),
+    ("pe-i", "pe-o"),
+    ("pe-o", "pe-o"),
+]:
+    for orth in [raw, raw.capitalize()]:
+        _exc[orth] = [{ORTH: orth, LEMMA: lemma}]
+
+# Prefix + prepositions with à (e.g. "sott'a-o")
+
+for prep, prep_lemma in [
+    ("a-a", "a-o"),
+    ("a-e", "a-o"),
+    ("a-o", "a-o"),
+    ("a-i", "a-o"),
+]:
+    for prefix, prefix_lemma in [
+        ("sott'", "sotta"),
+        ("sott’", "sotta"),
+        ("contr'", "contra"),
+        ("contr’", "contra"),
+        ("ch'", "che"),
+        ("ch’", "che"),
+        ("s'", "se"),
+        ("s’", "se"),
+    ]:
+        for prefix_orth in [prefix, prefix.capitalize()]:
+            _exc[prefix_orth + prep] = [
+                {ORTH: prefix_orth, LEMMA: prefix_lemma},
+                {ORTH: prep, LEMMA: prep_lemma},
+            ]
+
+TOKENIZER_EXCEPTIONS = _exc
--- a/spacy/lang/lt/__init__.py
+++ b/spacy/lang/lt/__init__.py
@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@ -26,7 +27,13 @@ class LithuanianDefaults(Language.Defaults):
    )
    lex_attr_getters.update(LEX_ATTRS)

-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    mod_base_exceptions = {
+        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+    }
+    del mod_base_exceptions["8)"]
+    tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    morph_rules = MORPH_RULES
--- a/spacy/lang/lt/punctuation.py
+++ b/spacy/lang/lt/punctuation.py
@ -0,0 +1,29 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..char_classes import LIST_ICONS, LIST_ELLIPSES
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import HYPHENS
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+
+_suffixes = [r"\."] + list(TOKENIZER_SUFFIXES)
+
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
--- a/spacy/lang/lt/tokenizer_exceptions.py
+++ b/spacy/lang/lt/tokenizer_exceptions.py
@ -6,262 +6,264 @@ from ...symbols import ORTH
 _exc = {}

 for orth in [
-    "G.",
-    "J. E.",
-    "J. Em.",
-    "J.E.",
-    "J.Em.",
-    "K.",
-    "N.",
-    "V.",
-    "Vt.",
-    "a.",
-    "a.k.",
-    "a.s.",
-    "adv.",
-    "akad.",
-    "aklg.",
-    "akt.",
-    "al.",
-    "ang.",
-    "angl.",
-    "aps.",
-    "apskr.",
-    "apyg.",
-    "arbat.",
-    "asist.",
-    "asm.",
-    "asm.k.",
-    "asmv.",
-    "atk.",
-    "atsak.",
-    "atsisk.",
-    "atsisk.sąsk.",
-    "atv.",
-    "aut.",
-    "avd.",
-    "b.k.",
-    "baud.",
-    "biol.",
-    "bkl.",
-    "bot.",
-    "bt.",
-    "buv.",
-    "ch.",
-    "chem.",
-    "corp.",
-    "d.",
-    "dab.",
-    "dail.",
-    "dek.",
-    "deš.",
-    "dir.",
-    "dirig.",
-    "doc.",
-    "dol.",
-    "dr.",
-    "drp.",
-    "dvit.",
-    "dėst.",
-    "dš.",
-    "dž.",
-    "e.b.",
-    "e.bankas",
-    "e.p.",
-    "e.parašas",
-    "e.paštas",
-    "e.v.",
-    "e.valdžia",
-    "egz.",
-    "eil.",
-    "ekon.",
-    "el.",
-    "el.bankas",
-    "el.p.",
-    "el.parašas",
-    "el.paštas",
-    "el.valdžia",
-    "etc.",
-    "ež.",
-    "fak.",
-    "faks.",
-    "feat.",
-    "filol.",
-    "filos.",
-    "g.",
-    "gen.",
-    "geol.",
-    "gerb.",
-    "gim.",
-    "gr.",
-    "gv.",
-    "gyd.",
-    "gyv.",
-    "habil.",
-    "inc.",
-    "insp.",
-    "inž.",
-    "ir pan.",
-    "ir t. t.",
-    "isp.",
-    "istor.",
-    "it.",
-    "just.",
-    "k.",
-    "k. a.",
-    "k.a.",
-    "kab.",
-    "kand.",
-    "kart.",
-    "kat.",
-    "ketv.",
-    "kh.",
-    "kl.",
-    "kln.",
-    "km.",
-    "kn.",
-    "koresp.",
-    "kpt.",
-    "kr.",
-    "kt.",
-    "kub.",
-    "kun.",
-    "kv.",
-    "kyš.",
-    "l. e. p.",
-    "l.e.p.",
-    "lenk.",
-    "liet.",
-    "lot.",
-    "lt.",
-    "ltd.",
-    "ltn.",
-    "m.",
-    "m.e..",
-    "m.m.",
-    "mat.",
-    "med.",
-    "mgnt.",
-    "mgr.",
-    "min.",
-    "mjr.",
-    "ml.",
-    "mln.",
-    "mlrd.",
-    "mob.",
-    "mok.",
-    "moksl.",
-    "mokyt.",
-    "mot.",
-    "mr.",
-    "mst.",
-    "mstl.",
-    "mėn.",
-    "nkt.",
-    "no.",
-    "nr.",
-    "ntk.",
-    "nuotr.",
-    "op.",
-    "org.",
-    "orig.",
-    "p.",
-    "p.d.",
-    "p.m.e.",
-    "p.s.",
-    "pab.",
-    "pan.",
-    "past.",
-    "pav.",
-    "pavad.",
-    "per.",
-    "perd.",
-    "pirm.",
-    "pl.",
-    "plg.",
-    "plk.",
-    "pr.",
-    "pr.Kr.",
-    "pranc.",
-    "proc.",
-    "prof.",
-    "prom.",
-    "prot.",
-    "psl.",
-    "pss.",
-    "pvz.",
-    "pšt.",
-    "r.",
-    "raj.",
-    "red.",
-    "rez.",
-    "rež.",
-    "rus.",
-    "rš.",
-    "s.",
-    "sav.",
-    "saviv.",
-    "sek.",
-    "sekr.",
-    "sen.",
-    "sh.",
-    "sk.",
-    "skg.",
-    "skv.",
-    "skyr.",
-    "sp.",
-    "spec.",
-    "sr.",
-    "st.",
-    "str.",
-    "stud.",
-    "sąs.",
-    "t.",
-    "t. p.",
-    "t. y.",
-    "t.p.",
-    "t.t.",
-    "t.y.",
-    "techn.",
-    "tel.",
-    "teol.",
-    "th.",
-    "tir.",
-    "trit.",
-    "trln.",
-    "tšk.",
-    "tūks.",
-    "tūkst.",
-    "up.",
-    "upl.",
-    "v.s.",
-    "vad.",
-    "val.",
-    "valg.",
-    "ved.",
-    "vert.",
-    "vet.",
-    "vid.",
-    "virš.",
-    "vlsč.",
-    "vnt.",
-    "vok.",
-    "vs.",
-    "vtv.",
-    "vv.",
-    "vyr.",
-    "vyresn.",
-    "zool.",
-    "Įn",
-    "įl.",
-    "š.m.",
-    "šnek.",
-    "šv.",
-    "švč.",
-    "ž.ū.",
-    "žin.",
-    "žml.",
-    "žr.",
+    "n-tosios",
+    "?!",
+    #    "G.",
+    #    "J. E.",
+    #    "J. Em.",
+    #    "J.E.",
+    #    "J.Em.",
+    #    "K.",
+    #    "N.",
+    #    "V.",
+    #    "Vt.",
+    #    "a.",
+    #    "a.k.",
+    #    "a.s.",
+    #    "adv.",
+    #    "akad.",
+    #    "aklg.",
+    #    "akt.",
+    #    "al.",
+    #    "ang.",
+    #    "angl.",
+    #    "aps.",
+    #    "apskr.",
+    #    "apyg.",
+    #    "arbat.",
+    #    "asist.",
+    #    "asm.",
+    #    "asm.k.",
+    #    "asmv.",
+    #    "atk.",
+    #    "atsak.",
+    #    "atsisk.",
+    #    "atsisk.sąsk.",
+    #    "atv.",
+    #    "aut.",
+    #    "avd.",
+    #    "b.k.",
+    #    "baud.",
+    #    "biol.",
+    #    "bkl.",
+    #    "bot.",
+    #    "bt.",
+    #    "buv.",
+    #    "ch.",
+    #    "chem.",
+    #    "corp.",
+    #    "d.",
+    #    "dab.",
+    #    "dail.",
+    #    "dek.",
+    #    "deš.",
+    #    "dir.",
+    #    "dirig.",
+    #    "doc.",
+    #    "dol.",
+    #    "dr.",
+    #    "drp.",
+    #    "dvit.",
+    #    "dėst.",
+    #    "dš.",
+    #    "dž.",
+    #    "e.b.",
+    #    "e.bankas",
+    #    "e.p.",
+    #    "e.parašas",
+    #    "e.paštas",
+    #    "e.v.",
+    #    "e.valdžia",
+    #    "egz.",
+    #    "eil.",
+    #    "ekon.",
+    #    "el.",
+    #    "el.bankas",
+    #    "el.p.",
+    #    "el.parašas",
+    #    "el.paštas",
+    #    "el.valdžia",
+    #    "etc.",
+    #    "ež.",
+    #    "fak.",
+    #    "faks.",
+    #    "feat.",
+    #    "filol.",
+    #    "filos.",
+    #    "g.",
+    #    "gen.",
+    #    "geol.",
+    #    "gerb.",
+    #    "gim.",
+    #    "gr.",
+    #    "gv.",
+    #    "gyd.",
+    #    "gyv.",
+    #    "habil.",
+    #    "inc.",
+    #    "insp.",
+    #    "inž.",
+    #    "ir pan.",
+    #    "ir t. t.",
+    #    "isp.",
+    #    "istor.",
+    #    "it.",
+    #    "just.",
+    #    "k.",
+    #    "k. a.",
+    #    "k.a.",
+    #    "kab.",
+    #    "kand.",
+    #    "kart.",
+    #    "kat.",
+    #    "ketv.",
+    #    "kh.",
+    #    "kl.",
+    #    "kln.",
+    #    "km.",
+    #    "kn.",
+    #    "koresp.",
+    #    "kpt.",
+    #    "kr.",
+    #    "kt.",
+    #    "kub.",
+    #    "kun.",
+    #    "kv.",
+    #    "kyš.",
+    #    "l. e. p.",
+    #    "l.e.p.",
+    #    "lenk.",
+    #    "liet.",
+    #    "lot.",
+    #    "lt.",
+    #    "ltd.",
+    #    "ltn.",
+    #    "m.",
+    #    "m.e..",
+    #    "m.m.",
+    #    "mat.",
+    #    "med.",
+    #    "mgnt.",
+    #    "mgr.",
+    #    "min.",
+    #    "mjr.",
+    #    "ml.",
+    #    "mln.",
+    #    "mlrd.",
+    #    "mob.",
+    #    "mok.",
+    #    "moksl.",
+    #    "mokyt.",
+    #    "mot.",
+    #    "mr.",
+    #    "mst.",
+    #    "mstl.",
+    #    "mėn.",
+    #    "nkt.",
+    #    "no.",
+    #    "nr.",
+    #    "ntk.",
+    #    "nuotr.",
+    #    "op.",
+    #    "org.",
+    #    "orig.",
+    #    "p.",
+    #    "p.d.",
+    #    "p.m.e.",
+    #    "p.s.",
+    #    "pab.",
+    #    "pan.",
+    #    "past.",
+    #    "pav.",
+    #    "pavad.",
+    #    "per.",
+    #    "perd.",
+    #    "pirm.",
+    #    "pl.",
+    #    "plg.",
+    #    "plk.",
+    #    "pr.",
+    #    "pr.Kr.",
+    #    "pranc.",
+    #    "proc.",
+    #    "prof.",
+    #    "prom.",
+    #    "prot.",
+    #    "psl.",
+    #    "pss.",
+    #    "pvz.",
+    #    "pšt.",
+    #    "r.",
+    #    "raj.",
+    #    "red.",
+    #    "rez.",
+    #    "rež.",
+    #    "rus.",
+    #    "rš.",
+    #    "s.",
+    #    "sav.",
+    #    "saviv.",
+    #    "sek.",
+    #    "sekr.",
+    #    "sen.",
+    #    "sh.",
+    #    "sk.",
+    #    "skg.",
+    #    "skv.",
+    #    "skyr.",
+    #    "sp.",
+    #    "spec.",
+    #    "sr.",
+    #    "st.",
+    #    "str.",
+    #    "stud.",
+    #    "sąs.",
+    #    "t.",
+    #    "t. p.",
+    #    "t. y.",
+    #    "t.p.",
+    #    "t.t.",
+    #    "t.y.",
+    #    "techn.",
+    #    "tel.",
+    #    "teol.",
+    #    "th.",
+    #    "tir.",
+    #    "trit.",
+    #    "trln.",
+    #    "tšk.",
+    #    "tūks.",
+    #    "tūkst.",
+    #    "up.",
+    #    "upl.",
+    #    "v.s.",
+    #    "vad.",
+    #    "val.",
+    #    "valg.",
+    #    "ved.",
+    #    "vert.",
+    #    "vet.",
+    #    "vid.",
+    #    "virš.",
+    #    "vlsč.",
+    #    "vnt.",
+    #    "vok.",
+    #    "vs.",
+    #    "vtv.",
+    #    "vv.",
+    #    "vyr.",
+    #    "vyresn.",
+    #    "zool.",
+    #    "Įn",
+    #    "įl.",
+    #    "š.m.",
+    #    "šnek.",
+    #    "šv.",
+    #    "švč.",
+    #    "ž.ū.",
+    #    "žin.",
+    #    "žml.",
+    #    "žr.",
 ]:
    _exc[orth] = [{ORTH: orth}]

--- a/spacy/lang/ml/__init__.py
+++ b/spacy/lang/ml/__init__.py
@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+
+from ...language import Language
+
+
+class MalayalamDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
+
+
+class Malayalam(Language):
+    lang = "ml"
+    Defaults = MalayalamDefaults
+
+
+__all__ = ["Malayalam"]
--- a/spacy/lang/ml/examples.py
+++ b/spacy/lang/ml/examples.py
@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ml.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക",
+    "പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി",
+    "എന്താണ്‌ കവാടങ്ങൾ?",
+    "ചുരുക്കത്തിൽ വിക്കിപീഡിയയുടെ ഉള്ളടക്കത്തിലേക്കുള്ള പടിപ്പുരകളാണ്‌‌ കവാടങ്ങൾ. അവ ലളിതവും വായനക്കാരനെ ആകർഷിക്കുന്നതുമായിരിക്കും",
+    "പതിനൊന്നുപേർ വീതമുള്ള രണ്ടു ടീമുകൾ കളിക്കുന്ന സംഘകായിക വിനോദമാണു ക്രിക്കറ്റ്",
+]
--- a/spacy/lang/ml/lex_attrs.py
+++ b/spacy/lang/ml/lex_attrs.py
@ -0,0 +1,80 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+# reference 2: https://www.omniglot.com/language/numbers/malayalam.htm
+
+_num_words = [
+    "പൂജ്യം",
+    "ഒന്ന്",
+    "രണ്ട്",
+    "മൂന്ന്",
+    "നാല്‌",
+    "അഞ്ച്",
+    "ആറ്",
+    "ഏഴ്",
+    "എട്ട്",
+    "ഒന്‍പത്",
+    "പത്ത്",
+    "പതിനൊന്ന്",
+    "പന്ത്രണ്ട്",
+    "പതി മൂന്നു",
+    "പതിനാല്",
+    "പതിനഞ്ച്",
+    "പതിനാറ്",
+    "പതിനേഴ്",
+    "പതിനെട്ട്",
+    "പത്തൊമ്പതു",
+    "ഇരുപത്",
+    "ഇരുപത്തിഒന്ന്",
+    "ഇരുപത്തിരണ്ട്‌",
+    "ഇരുപത്തിമൂന്ന്",
+    "ഇരുപത്തിനാല്",
+    "ഇരുപത്തിഅഞ്ചു",
+    "ഇരുപത്തിആറ്",
+    "ഇരുപത്തിഏഴ്",
+    "ഇരുപത്തിഎട്ടു",
+    "ഇരുപത്തിഒന്‍പത്",
+    "മുപ്പത്",
+    "മുപ്പത്തിഒന്ന്",
+    "മുപ്പത്തിരണ്ട്",
+    "മുപ്പത്തിമൂന്ന്",
+    "മുപ്പത്തിനാല്",
+    "മുപ്പത്തിഅഞ്ചു",
+    "മുപ്പത്തിആറ്",
+    "മുപ്പത്തിഏഴ്",
+    "മുപ്പത്തിഎട്ട്",
+    "മുപ്പത്തിഒന്‍പതു",
+    "നാല്‍പത്‌",
+    "അന്‍പത്",
+    "അറുപത്",
+    "എഴുപത്",
+    "എണ്‍പത്",
+    "തൊണ്ണൂറ്",
+    "നുറ്",
+    "ആയിരം",
+    "പത്തുലക്ഷം"
+]
+
+
+def like_num(text):
+    """
+    Check if text resembles a number
+    """
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/ml/stop_words.py
+++ b/spacy/lang/ml/stop_words.py
@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+STOP_WORDS = set(
+
+    """
+അത്
+ഇത്
+ആയിരുന്നു
+ആകുന്നു
+വരെ
+അന്നേരം
+അന്ന്
+ഇന്ന്
+ആണ്
+""".split()
+)
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@ -2,6 +2,8 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .morph_rules import MORPH_RULES
 from .syntax_iterators import SYNTAX_ITERATORS
@ -21,6 +23,9 @@ class NorwegianDefaults(Language.Defaults):
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    prefixes = TOKENIZER_PREFIXES
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    morph_rules = MORPH_RULES
    tag_map = TAG_MAP
--- a/Show More
+++ b/Show More