Mirror of https://github.com/explosion/spaCy.git, synced 2025-02-03 21:24:11 +03:00

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in commit 4aa1002546.
.flake8 (12 changed lines)

@@ -1,4 +1,14 @@
 [flake8]
-ignore = E203, E266, E501, W503
+ignore = E203, E266, E501, E731, W503
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
+exclude =
+    .env,
+    .git,
+    __pycache__,
+    lemmatizer.py,
+    lookup.py,
+    _tokenizer_exceptions_list.py,
+    spacy/lang/fr/lemmatizer,
+    spacy/lang/nb/lemmatizer
+    spacy/__init__.py
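As a rough illustration (not taken from the spaCy code base), the sketch below shows the pattern that the newly ignored E731 check targets: binding a lambda to a name instead of defining a function with `def`. Under the updated config, flake8 would no longer report the first assignment.

```python
# E731 normally warns about assigning a lambda expression to a name.
# With `ignore = E203, E266, E501, E731, W503`, flake8 skips this check.

# Hypothetical example: a lambda bound to a name (what E731 would flag).
get_lower = lambda text: text.lower()

# The equivalent `def` form that E731 would otherwise ask for.
def get_lower_def(text):
    return text.lower()

if __name__ == "__main__":
    assert get_lower("SpaCy") == get_lower_def("SpaCy") == "spacy"
```

The added `exclude` block works alongside this, keeping generated or data-heavy modules such as `_tokenizer_exceptions_list.py` and the lemmatizer lookup files out of linting entirely.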
.github/ISSUE_TEMPLATE.md (vendored, 2 changed lines)

@@ -1,7 +1,7 @@
 <!--- Please provide a summary in the title and describe your issue here.
 Is this a bug or feature request? If a bug, include all the steps that led to the issue.

-If you're looking for help with your code, consider posting a question on StackOverflow instead:
+If you're looking for help with your code, consider posting a question on Stack Overflow instead:
 http://stackoverflow.com/questions/tagged/spacy -->
.github/ISSUE_TEMPLATE/05_other.md (vendored, 4 changed lines)

@@ -1,11 +1,11 @@
 ---
 name: "\U0001F4AC Anything else?"
 about: For general usage questions or help with your code, please consider
-  posting on StackOverflow instead.
+  posting on Stack Overflow instead.

 ---

-<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and feature requests. If you're looking for help with your code, consider posting a question on StackOverflow instead: http://stackoverflow.com/questions/tagged/spacy -->
+<!-- Describe your issue here. Please keep in mind that the GitHub issue tracker is mostly intended for reports related to the spaCy code base and source, and for bugs and feature requests. If you're looking for help with your code, consider posting a question on Stack Overflow instead: http://stackoverflow.com/questions/tagged/spacy -->

 ## Your Environment
 <!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
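The template's suggestion to run `python -m spacy info --markdown` can also be scripted. A minimal sketch, assuming spaCy is installed in the active environment, that captures the Markdown-formatted environment report:

```python
import subprocess
import sys

# Run the command suggested in the issue template and capture its Markdown
# output (assumes the `spacy` package is installed in this interpreter's env).
result = subprocess.run(
    [sys.executable, "-m", "spacy", "info", "--markdown"],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)
```

The output can then be pasted directly into the "Your Environment" section of an issue.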
.github/contributors/ALSchwalm.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

    * you hereby assign to us joint ownership, and to the extent that such
    assignment is or becomes invalid, ineffective or unenforceable, you hereby
    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
    royalty-free, unrestricted license to exercise all rights under those
    copyrights. This includes, at our option, the right to sublicense these same
    rights to third parties through multiple levels of sublicensees or other
    licensing arrangements;

    * you agree that each of us can do all things in relation to your
    contribution as if each of us were the sole owners, and if one of us makes
    a derivative work of your contribution, the one who makes the derivative
    work (or has it made) will be the sole owner of that derivative work;

    * you agree that you will not assert any moral rights in your contribution
    against us, our licensees or transferees;

    * you agree that we may register a copyright in your contribution and
    exercise all ownership rights associated with it; and

    * you agree that neither of us has any duty to consult with, obtain the
    consent of, pay or render an accounting to the other for any use or
    distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

    * make, have made, use, sell, offer to sell, import, and otherwise transfer
    your contribution in whole or in part, alone or in combination with or
    included in any product, work or materials arising out of the project to
    which your contribution was submitted, and

    * at our option, to sublicense these same rights to third parties through
    multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

    * each contribution that you submit is and shall be an original work of
    authorship and you can legally grant the rights set out in this SCA;

    * to the best of your knowledge, each contribution will not violate any
    third party's copyrights, trademarks, patents, or other intellectual
    property rights; and

    * each contribution shall be in compliance with U.S. export control laws and
    other applicable export and import laws. You agree to notify us if you
    become aware of any circumstance which would make any of the foregoing
    representations inaccurate in any respect. We may publicly disclose your
    participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” next to one of the applicable statements below. Please do
NOT mark both statements:

    * [x] I am signing on behalf of myself as an individual and no other person
    or entity, including my employer, has or will have rights with respect to my
    contributions.

    * [ ] I am signing on behalf of my employer or a legal entity and I have the
    actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
| ------------------------------ | --------------------- |
| Name                           | Adam Schwalm          |
| Company name (if applicable)   | Star Lab              |
| Title or role (if applicable)  | Software Engineer     |
| Date                           | 2018-11-28            |
| GitHub username                | ALSchwalm             |
| Website (optional)             | https://alschwalm.com |
.github/contributors/BramVanroy.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                 |
| ------------------------------ | --------------------- |
| Name                           | Bram Vanroy           |
| Company name (if applicable)   |                       |
| Title or role (if applicable)  |                       |
| Date                           | October 19, 2018      |
| GitHub username                | BramVanroy            |
| Website (optional)             | https://bramvanroy.be |
.github/contributors/Cinnamy.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry          |
| ------------------------------ | -------------- |
| Name                           | Marina Lysyuk  |
| Company name (if applicable)   |                |
| Title or role (if applicable)  |                |
| Date                           | 13.10.2018     |
| GitHub username                | Cinnamy        |
| Website (optional)             |                |
.github/contributors/JKhakpour.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry           |
| ------------------------------ | --------------- |
| Name                           | Ja'far Khakpour |
| Company name (if applicable)   |                 |
| Title or role (if applicable)  |                 |
| Date                           | 2018-09-24      |
| GitHub username                | JKhakpour       |
| Website (optional)             |                 |
.github/contributors/aniruddha-adhikary.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Aniruddha Adhikary   |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2018-09-05           |
| GitHub username                | aniruddha-adhikary   |
| Website (optional)             | https://adhikary.net |
.github/contributors/aongko.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry               |
| ------------------------------ | ------------------- |
| Name                           | Andrew Ongko        |
| Company name (if applicable)   | Kurio               |
| Title or role (if applicable)  | Senior Data Science |
| Date                           | Sep 10, 2018        |
| GitHub username                | aongko              |
| Website (optional)             |                     |
.github/contributors/aryaprabhudesai.md (vendored, new file, 54 lines)

@@ -0,0 +1,54 @@
spaCy contributor agreement

(A plain-text copy of the standard SCA text shown in ALSchwalm.md above, without Markdown formatting or numbering.)

[X] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

Contributor Details

Field                           Entry
Name                            Arya Prabhudesai
Company name (if applicable)    -
Title or role (if applicable)   -
Date                            2018-08-17
GitHub username                 aryaprabhudesai
Website (optional)              -
.github/contributors/charlax.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry             |
| ------------------------------ | ----------------- |
| Name                           | Charles-Axel Dein |
| Company name (if applicable)   | Skrib             |
| Title or role (if applicable)  | CEO               |
| Date                           | 27/09/2018        |
| GitHub username                | charlax           |
| Website (optional)             | www.dein.fr       |
.github/contributors/cicorias.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

(Standard SCA text, identical to the agreement reproduced in ALSchwalm.md above.)

* [X] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                       |
| ------------------------------ | --------------------------- |
| Name                           | Shawn Cicoria               |
| Company name (if applicable)   | Microsoft                   |
| Title or role (if applicable)  | Principal Software Engineer |
| Date                           | November 20, 2018           |
| GitHub username                | cicorias                    |
| Website (optional)             | www.cicoria.com             |
106
.github/contributors/darindf.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Darin DeForest |
| Company name (if applicable) | Ipro Tech |
| Title or role (if applicable) | Senior Software Engineer |
| Date | 2018-09-26 |
| GitHub username | darindf |
| Website (optional) | |
106
.github/contributors/filipecaixeta.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Filipe Caixeta |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 09.12.2018 |
| GitHub username | filipecaixeta |
| Website (optional) | filipecaixeta.com.br |
106
.github/contributors/frascuchon.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Francisco Aranda |
| Company name (if applicable) | recognai |
| Title or role (if applicable) | |
| Date | |
| GitHub username | frascuchon |
| Website (optional) | https://recogn.ai |
106
.github/contributors/free-variation.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [ ] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | John Stewart |
| Company name (if applicable) | Amplify |
| Title or role (if applicable) | SVP Research |
| Date | 14/09/2018 |
| GitHub username | free-variation |
| Website (optional) | |
106
.github/contributors/gavrieltal.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Gavriel Loria |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | Nov 29, 2018 |
| GitHub username | gavrieltal |
| Website (optional) | |
106
.github/contributors/grivaz.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | C. Grivaz |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 08.22.2018 |
| GitHub username | grivaz |
| Website (optional) | |
106
.github/contributors/jacopofar.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [X] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Jacopo Farina |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2018-10-12 |
| GitHub username | jacopofar |
| Website (optional) | jacopofarina.eu |
106
.github/contributors/keshan.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Keshan Sodimana |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | Sep 21, 2018 |
| GitHub username | keshan |
| Website (optional) | |
106
.github/contributors/mbkupfer.md
vendored
Normal file
@ -0,0 +1,106 @@
# spaCy contributor agreement

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | Maxim Kupfer |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | Sep 6, 2018 |
| GitHub username | mbkupfer |
| Website (optional) | |
106 .github/contributors/mikelibg.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Michael Liberman     |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2018-11-08           |
| GitHub username                | mikelibg             |
| Website (optional)             |                      |
106 .github/contributors/mpuig.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Marc Puig            |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2018-11-17           |
| GitHub username                | mpuig                |
| Website (optional)             |                      |
106 .github/contributors/phojnacki.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                                 |
| ------------------------------ | ------------------------------------- |
| Name                           | Przemysław Hojnacki                   |
| Company name (if applicable)   |                                       |
| Title or role (if applicable)  |                                       |
| Date                           | 12/09/2018                            |
| GitHub username                | phojnacki                             |
| Website (optional)             | https://about.me/przemyslaw.hojnacki  |
106 .github/contributors/pzelasko.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Piotr Żelasko        |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 04-09-2018           |
| GitHub username                | pzelasko             |
| Website (optional)             |                      |
106 .github/contributors/sainathadapa.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Sainath Adapa        |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2018-09-06           |
| GitHub username                | sainathadapa         |
| Website (optional)             |                      |
106 .github/contributors/tyburam.md vendored Normal file

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the [Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). The SCA applies to any contribution that you make to any product or project managed by us (the **"project"**), and sets out the intellectual property rights you grant to us in the contributed materials. The term **"us"** shall mean [ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term **"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below and include the filled-in version with your first pull request, under the folder [`.github/contributors/`](/.github/contributors/). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements;
   * you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
   * you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
   * you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
   * you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
   * at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this SCA;
   * to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
   * each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. We may publicly disclose your participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” in one of the applicable statements below. Please do NOT mark both statements:

   * [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
   * [ ] I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Mateusz Tybura       |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 08.09.2018           |
| GitHub username                | tyburam              |
| Website (optional)             |                      |
102 CONTRIBUTING.md

@@ -26,7 +26,7 @@ also check the [troubleshooting guide](https://spacy.io/usage/#troubleshooting)
to see if your problem is already listed there.

If you're looking for help with your code, consider posting a question on
[StackOverflow](http://stackoverflow.com/questions/tagged/spacy) instead. If you
[Stack Overflow](http://stackoverflow.com/questions/tagged/spacy) instead. If you
tag it `spacy` and `python`, more people will see it and hopefully be able to
help. Please understand that we won't be able to provide individual support via
email. We also believe that help is much more valuable if it's **shared publicly**,
@@ -186,13 +186,99 @@ sure your test passes and reference the issue in your commit message.

## Code conventions

Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/).

Regular line length is **80 characters**, with some tolerance for lines up to 90 characters if the alternative would be worse — for instance, if your list comprehension comes to 82 characters, it's better not to split it over two lines. You can also use a linter like [`flake8`](https://pypi.python.org/pypi/flake8) or [`frosted`](https://pypi.python.org/pypi/frosted) – just keep in mind that it won't work very well for `.pyx` files and will complain about Cython syntax like `<int*>` or `cimport`.

As of `v2.1.0`, spaCy uses [`black`](https://github.com/ambv/black) for code formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its Python modules. If you've built spaCy from source, you'll already have both tools installed.

**⚠️ Note that formatting and linting is currently only possible for Python modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
### Code formatting

[`black`](https://github.com/ambv/black) is an opinionated Python code formatter, optimised to produce readable code and small diffs. You can run `black` from the command-line, or via your code editor. For example, if you're using [Visual Studio Code](https://code.visualstudio.com/), you can add the following to your `settings.json` to use `black` for formatting and auto-format your files on save:

```json
{
    "python.formatting.provider": "black",
    "[python]": {
        "editor.formatOnSave": true
    }
}
```

[See here](https://github.com/ambv/black#editor-integration) for the full list of available editor integrations.
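If you'd rather not rely on editor integration, a small helper along these lines works too. This is only a sketch: it assumes `black` is installed and on your PATH, and the script name `check_format.py` is made up, not part of the repo.

```python
# check_format.py - illustrative sketch, not part of the repository
import subprocess
import sys


def run_black(path="spacy", check_only=False):
    """Format `path` with black, or only report what would change."""
    cmd = ["black", path]
    if check_only:
        cmd.append("--check")  # non-zero exit code if any file would be reformatted
    return subprocess.run(cmd).returncode


if __name__ == "__main__":
    sys.exit(run_black(check_only="--check" in sys.argv))
```

Running it with `--check` reports the files that would change without touching them, which can be handy before opening a pull request.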
#### Disabling formatting

There are a few cases where auto-formatting doesn't improve readability – for example, in some of the language data files like the `tag_map.py`, or in the tests that construct `Doc` objects from lists of words and other labels. Wrapping a block in `# fmt: off` and `# fmt: on` lets you disable formatting for that particular code. Here's an example:

```python
# fmt: off
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
        "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
        "poss", "nsubj", "ccomp", "punct"]
# fmt: on
```
### Code linting

[`flake8`](http://flake8.pycqa.org/en/latest/) is a tool for enforcing code style. It scans one or more files and outputs errors and warnings. This feedback can help you stick to general standards and conventions, and can be very useful for spotting potential mistakes and inconsistencies in your code. The most important things to watch out for are syntax errors and undefined names, but you also want to keep an eye on unused declared variables or repeated (i.e. overwritten) dictionary keys. If your code was formatted with `black` (see above), you shouldn't see any formatting-related warnings.
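As a purely hypothetical illustration of the kinds of problems `flake8` reports (none of these names come from the codebase, and `doc` is assumed to be a processed spaCy `Doc`):

```python
import os  # flagged as an unused import (F401), since `os` is never referenced


def count_entities(doc):
    unused_total = 0  # assigned but never used, another typical warning
    counts = {
        "ORG": 0,
        "ORG": 1,  # repeated dictionary key: the first value is silently overwritten
    }
    for ent in doc.ents:
        counts[ent.label_] = counts.get(ent.label_, 0) + 1
    return counts
```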
The [`.flake8`](.flake8) config defines the configuration we use for this codebase. For example, we're not super strict about the line length, and we're excluding very large files like lemmatization and tokenizer exception tables.

Ideally, running the following command from within the repo directory should not return any errors or warnings:

```bash
flake8 spacy
```
#### Disabling linting

Sometimes, you explicitly want to write code that's not compatible with our rules. For example, a module's `__init__.py` might import a function so other modules can import it from there, but `flake8` will complain about an unused import. And although it's generally discouraged, there might be cases where it makes sense to use a bare `except`.

To ignore a given line, you can add a comment like `# noqa: F401`, specifying the code of the error or warning we want to ignore. It's also possible to ignore several comma-separated codes at once, e.g. `# noqa: E731,E123`. Here are some examples:

```python
# The imported class isn't used in this file, but imported here, so it can be
# imported *from* here by another module.
from .submodule import SomeClass  # noqa: F401

try:
    do_something()
except:  # noqa: E722
    # This bare except is justified, for some specific reason
    do_something_else()
```
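One more concrete case for the codes mentioned above: E731 fires when a `lambda` is assigned to a name, and can be silenced on a single line. The `get_label` helper here is made up, purely for illustration:

```python
# Silences only the lambda-assignment warning on this line; token.ent_type_ is a spaCy attribute.
get_label = lambda token: token.ent_type_  # noqa: E731
```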
### Python conventions

@@ -1,83 +0,0 @@
# 👥 Contributors

This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work!

* Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer)
* Alexey Kim, [@yuukos](https://github.com/yuukos)
* Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman)
* Ali Zarezade, [@azarezade](https://github.com/azarezade)
* Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv)
* Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th)
* Aniruddha Adhikary, [@aniruddha-adhikary](https://github.com/aniruddha-adhikary)
* Anto Binish Kaspar, [@binishkaspar](https://github.com/binishkaspar)
* Avadh Patel, [@avadhpatel](https://github.com/avadhpatel)
* Ben Eyal, [@beneyal](https://github.com/beneyal)
* Bhargav Srinivasa, [@bhargavvader](https://github.com/bhargavvader)
* Bruno P. Kinoshita, [@kinow](https://github.com/kinow)
* Canbey Bilgili, [@cbilgili](https://github.com/cbilgili)
* Chris DuBois, [@chrisdubois](https://github.com/chrisdubois)
* Christoph Schwienheer, [@chssch](https://github.com/chssch)
* Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk)
* Daniel Rapp, [@rappdw](https://github.com/rappdw)
* Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo)
* Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi)
* Eric Zhao, [@ericzhao28](https://github.com/ericzhao28)
* Francisco Aranda, [@frascuchon](https://github.com/frascuchon)
* Greg Baker, [@solresol](https://github.com/solresol)
* Greg Dubbin, [@GregDubbin](https://github.com/GregDubbin)
* Grégory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard)
* György Orosz, [@oroszgy](https://github.com/oroszgy)
* Henning Peters, [@henningpeters](https://github.com/henningpeters)
* Iddo Berger, [@iddoberger](https://github.com/iddoberger)
* Ines Montani, [@ines](https://github.com/ines)
* J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading)
* Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan)
* Jim Geovedi, [@geovedi](https://github.com/geovedi)
* Jim Regan, [@jimregan](https://github.com/jimregan)
* Jeffrey Gerard, [@IamJeffG](https://github.com/IamJeffG)
* Jordan Suchow, [@suchow](https://github.com/suchow)
* Josh Reeter, [@jreeter](https://github.com/jreeter)
* Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks)
* Kendrick Tan, [@kendricktan](https://github.com/kendricktan)
* Kyle P. Johnson, [@kylepjohnson](https://github.com/kylepjohnson)
* Leif Uwe Vogelsang, [@luvogels](https://github.com/luvogels)
* Liling Tan, [@alvations](https://github.com/alvations)
* Magnus Burton, [@magnusburton](https://github.com/magnusburton)
* Mark Amery, [@ExplodingCabbage](https://github.com/ExplodingCabbage)
* Matthew Honnibal, [@honnibal](https://github.com/honnibal)
* Maxim Samsonov, [@maxirmx](https://github.com/maxirmx)
* Michael Wallin, [@wallinm1](https://github.com/wallinm1)
* Miguel Almeida, [@mamoit](https://github.com/mamoit)
* Motoki Wu, [@tokestermw](https://github.com/tokestermw)
* Ole Henrik Skogstrøm, [@ohenrik](https://github.com/ohenrik)
* Oleg Zd, [@olegzd](https://github.com/olegzd)
* Orhan Bilgin, [@melanuria](https://github.com/melanuria)
* Orion Montoya, [@mdcclv](https://github.com/mdcclv)
* Paul O'Leary McCann, [@polm](https://github.com/polm)
* Pokey Rule, [@pokey](https://github.com/pokey)
* Ramanan Balakrishnan, [@ramananbalakrishnan](https://github.com/ramananbalakrishnan)
* Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202)
* Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort)
* Roman Domrachev, [@ligser](https://github.com/ligser)
* Roman Inflianskas, [@rominf](https://github.com/rominf)
* Sam Bozek, [@sambozek](https://github.com/sambozek)
* Sasho Savkov, [@savkov](https://github.com/savkov)
* Shuvanon Razik, [@shuvanon](https://github.com/shuvanon)
* Søren Lind Kristiansen, [@sorenlind](https://github.com/sorenlind)
* Swier, [@swierh](https://github.com/swierh)
* Thomas Tanon, [@Tpt](https://github.com/Tpt)
* Thomas Opsomer, [@thomasopsomer](https://github.com/thomasopsomer)
* Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues)
* Vadim Mazaev, [@GreenRiverRUS](https://github.com/GreenRiverRUS)
* Vimos Tan, [@Vimos](https://github.com/Vimos)
* Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov)
* Wah Loon Keng, [@kengz](https://github.com/kengz)
* Wannaphong Phatthiyaphaibun, [@wannaphongcom](https://github.com/wannaphongcom)
* Willem van Hage, [@wrvhage](https://github.com/wrvhage)
* Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker)
* Yam, [@hscspring](https://github.com/hscspring)
* Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang)
* Yasuaki Uechi, [@uetchy](https://github.com/uetchy)
* Yu-chun Huang, [@galaxyh](https://github.com/galaxyh)
* Yubing Dong, [@tomtung](https://github.com/tomtung)
* Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter)
@@ -35,41 +35,49 @@ import subprocess
import argparse


HASH_FILE = 'cythonize.json'
HASH_FILE = "cythonize.json"


def process_pyx(fromfile, tofile, language_level='-2'):
def process_pyx(fromfile, tofile, language_level="-2"):
    print('Processing %s' % fromfile)
    print("Processing %s" % fromfile)
    try:
        from Cython.Compiler.Version import version as cython_version
        from distutils.version import LooseVersion
        if LooseVersion(cython_version) < LooseVersion('0.19'):
            raise Exception('Require Cython >= 0.19')
        if LooseVersion(cython_version) < LooseVersion("0.19"):
            raise Exception("Require Cython >= 0.19")

    except ImportError:
        pass

    flags = ['--fast-fail', language_level]
    flags = ["--fast-fail", language_level]
    if tofile.endswith('.cpp'):
    if tofile.endswith(".cpp"):
        flags += ['--cplus']
        flags += ["--cplus"]

    try:
        try:
            r = subprocess.call(['cython'] + flags + ['-o', tofile, fromfile],
                                env=os.environ)  # See Issue #791
            r = subprocess.call(
                ["cython"] + flags + ["-o", tofile, fromfile], env=os.environ
            )  # See Issue #791
            if r != 0:
                raise Exception('Cython failed')
                raise Exception("Cython failed")
        except OSError:
            # There are ways of installing Cython that don't result in a cython
            # executable on the path, see gh-2397.
            r = subprocess.call([sys.executable, '-c',
                'import sys; from Cython.Compiler.Main import '
                'setuptools_main as main; sys.exit(main())'] + flags +
                ['-o', tofile, fromfile])
            r = subprocess.call(
                [
                    sys.executable,
                    "-c",
                    "import sys; from Cython.Compiler.Main import "
                    "setuptools_main as main; sys.exit(main())",
                ]
                + flags
                + ["-o", tofile, fromfile]
            )
            if r != 0:
                raise Exception('Cython failed')
                raise Exception("Cython failed")
        except OSError:
            raise OSError('Cython needs to be installed')
            raise OSError("Cython needs to be installed")


def preserve_cwd(path, func, *args):

@@ -89,12 +97,12 @@ def load_hashes(filename):


def save_hashes(hash_db, filename):
    with open(filename, 'w') as f:
    with open(filename, "w") as f:
        f.write(json.dumps(hash_db))


def get_hash(path):
    return hashlib.md5(open(path, 'rb').read()).hexdigest()
    return hashlib.md5(open(path, "rb").read()).hexdigest()


def hash_changed(base, path, db):

@@ -109,25 +117,27 @@ def hash_add(base, path, db):


def process(base, filename, db):
    root, ext = os.path.splitext(filename)
    if ext in ['.pyx', '.cpp']:
    if ext in [".pyx", ".cpp"]:
        if hash_changed(base, filename, db) or not os.path.isfile(os.path.join(base, root + '.cpp')):
            preserve_cwd(base, process_pyx, root + '.pyx', root + '.cpp')
            hash_add(base, root + '.cpp', db)
            hash_add(base, root + '.pyx', db)
        if hash_changed(base, filename, db) or not os.path.isfile(
            os.path.join(base, root + ".cpp")
        ):
            preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
            hash_add(base, root + ".cpp", db)
            hash_add(base, root + ".pyx", db)


def check_changes(root, db):
    res = False
    new_db = {}

    setup_filename = 'setup.py'
    setup_filename = "setup.py"
    hash_add('.', setup_filename, new_db)
    hash_add(".", setup_filename, new_db)
    if hash_changed('.', setup_filename, db):
    if hash_changed(".", setup_filename, db):
        res = True

    for base, _, files in os.walk(root):
        for filename in files:
            if filename.endswith('.pxd'):
            if filename.endswith(".pxd"):
                hash_add(base, filename, new_db)
                if hash_changed(base, filename, db):
                    res = True

@@ -150,8 +160,10 @@ def run(root):
    save_hashes(db, HASH_FILE)


if __name__ == '__main__':
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Cythonize pyx files into C++ files as needed')
    parser.add_argument('root', help='root directory')
    parser = argparse.ArgumentParser(
        description="Cythonize pyx files into C++ files as needed"
    )
    parser.add_argument("root", help="root directory")
    args = parser.parse_args()
    run(args.root)
@ -15,12 +15,13 @@ _unset = object()
|
||||||
|
|
||||||
class Reddit(object):
|
class Reddit(object):
|
||||||
"""Stream cleaned comments from Reddit."""
|
"""Stream cleaned comments from Reddit."""
|
||||||
pre_format_re = re.compile(r'^[\`\*\~]')
|
|
||||||
post_format_re = re.compile(r'[\`\*\~]$')
|
|
||||||
url_re = re.compile(r'\[([^]]+)\]\(%%URL\)')
|
|
||||||
link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)')
|
|
||||||
|
|
||||||
def __init__(self, file_path, meta_keys={'subreddit': 'section'}):
|
pre_format_re = re.compile(r"^[\`\*\~]")
|
||||||
|
post_format_re = re.compile(r"[\`\*\~]$")
|
||||||
|
url_re = re.compile(r"\[([^]]+)\]\(%%URL\)")
|
||||||
|
link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)")
|
||||||
|
|
||||||
|
def __init__(self, file_path, meta_keys={"subreddit": "section"}):
|
||||||
"""
|
"""
|
||||||
file_path (unicode / Path): Path to archive or directory of archives.
|
file_path (unicode / Path): Path to archive or directory of archives.
|
||||||
meta_keys (dict): Meta data key included in the Reddit corpus, mapped
|
meta_keys (dict): Meta data key included in the Reddit corpus, mapped
|
||||||
|
@ -45,28 +46,30 @@ class Reddit(object):
|
||||||
continue
|
continue
|
||||||
comment = ujson.loads(line)
|
comment = ujson.loads(line)
|
||||||
if self.is_valid(comment):
|
if self.is_valid(comment):
|
||||||
text = self.strip_tags(comment['body'])
|
text = self.strip_tags(comment["body"])
|
||||||
yield {'text': text}
|
yield {"text": text}
|
||||||
|
|
||||||
def get_meta(self, item):
|
def get_meta(self, item):
|
||||||
return {name: item.get(key, 'n/a') for key, name in self.meta.items()}
|
return {name: item.get(key, "n/a") for key, name in self.meta.items()}
|
||||||
|
|
||||||
def iter_files(self):
|
def iter_files(self):
|
||||||
for file_path in self.files:
|
for file_path in self.files:
|
||||||
yield file_path
|
yield file_path
|
||||||
|
|
||||||
def strip_tags(self, text):
|
def strip_tags(self, text):
|
||||||
text = self.link_re.sub(r'\1', text)
|
text = self.link_re.sub(r"\1", text)
|
||||||
text = text.replace('>', '>').replace('<', '<')
|
text = text.replace(">", ">").replace("<", "<")
|
||||||
text = self.pre_format_re.sub('', text)
|
text = self.pre_format_re.sub("", text)
|
||||||
text = self.post_format_re.sub('', text)
|
text = self.post_format_re.sub("", text)
|
||||||
text = re.sub(r'\s+', ' ', text)
|
text = re.sub(r"\s+", " ", text)
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
def is_valid(self, comment):
|
def is_valid(self, comment):
|
||||||
return comment['body'] is not None \
|
return (
|
||||||
and comment['body'] != '[deleted]' \
|
comment["body"] is not None
|
||||||
and comment['body'] != '[removed]'
|
and comment["body"] != "[deleted]"
|
||||||
|
and comment["body"] != "[removed]"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main(path):
|
def main(path):
|
||||||
|
@ -75,16 +78,18 @@ def main(path):
|
||||||
print(ujson.dumps(comment))
|
print(ujson.dumps(comment))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
import socket
|
import socket
|
||||||
|
|
||||||
try:
|
try:
|
||||||
BrokenPipeError
|
BrokenPipeError
|
||||||
except NameError:
|
except NameError:
|
||||||
BrokenPipeError = socket.error
|
BrokenPipeError = socket.error
|
||||||
try:
|
try:
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
except BrokenPipeError:
|
except BrokenPipeError:
|
||||||
import os, sys
|
import os, sys
|
||||||
|
|
||||||
# Python flushes standard streams on exit; redirect remaining output
|
# Python flushes standard streams on exit; redirect remaining output
|
||||||
# to devnull to avoid another BrokenPipeError at shutdown
|
# to devnull to avoid another BrokenPipeError at shutdown
|
||||||
devnull = os.open(os.devnull, os.O_WRONLY)
|
devnull = os.open(os.devnull, os.O_WRONLY)
|
||||||
|
|
|
@ -7,6 +7,7 @@ git diff-index --quiet HEAD
|
||||||
|
|
||||||
git checkout $1
|
git checkout $1
|
||||||
git pull origin $1
|
git pull origin $1
|
||||||
|
|
||||||
version=$(grep "__version__ = " spacy/about.py)
|
version=$(grep "__version__ = " spacy/about.py)
|
||||||
version=${version/__version__ = }
|
version=${version/__version__ = }
|
||||||
version=${version/\'/}
|
version=${version/\'/}
|
||||||
|
|
|
@ -92,11 +92,13 @@ def get_features(docs, max_length):
|
||||||
def train(train_texts, train_labels, dev_texts, dev_labels,
|
def train(train_texts, train_labels, dev_texts, dev_labels,
|
||||||
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
|
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
|
||||||
nb_epoch=5, by_sentence=True):
|
nb_epoch=5, by_sentence=True):
|
||||||
|
|
||||||
print("Loading spaCy")
|
print("Loading spaCy")
|
||||||
nlp = spacy.load('en_vectors_web_lg')
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||||
embeddings = get_embeddings(nlp.vocab)
|
embeddings = get_embeddings(nlp.vocab)
|
||||||
model = compile_lstm(embeddings, lstm_shape, lstm_settings)
|
model = compile_lstm(embeddings, lstm_shape, lstm_settings)
|
||||||
|
|
||||||
print("Parsing texts...")
|
print("Parsing texts...")
|
||||||
train_docs = list(nlp.pipe(train_texts))
|
train_docs = list(nlp.pipe(train_texts))
|
||||||
dev_docs = list(nlp.pipe(dev_texts))
|
dev_docs = list(nlp.pipe(dev_texts))
|
||||||
|
@ -107,7 +109,7 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
|
||||||
train_X = get_features(train_docs, lstm_shape['max_length'])
|
train_X = get_features(train_docs, lstm_shape['max_length'])
|
||||||
dev_X = get_features(dev_docs, lstm_shape['max_length'])
|
dev_X = get_features(dev_docs, lstm_shape['max_length'])
|
||||||
model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
|
model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
|
||||||
nb_epoch=nb_epoch, batch_size=batch_size)
|
epochs=nb_epoch, batch_size=batch_size)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
@ -138,15 +140,9 @@ def get_embeddings(vocab):
|
||||||
|
|
||||||
|
|
||||||
def evaluate(model_dir, texts, labels, max_length=100):
|
def evaluate(model_dir, texts, labels, max_length=100):
|
||||||
def create_pipeline(nlp):
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
'''
|
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||||
This could be a lambda, but named functions are easier to read in Python.
|
nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length))
|
||||||
'''
|
|
||||||
return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
|
|
||||||
max_length=max_length)]
|
|
||||||
|
|
||||||
nlp = spacy.load('en')
|
|
||||||
nlp.pipeline = create_pipeline(nlp)
|
|
||||||
|
|
||||||
correct = 0
|
correct = 0
|
||||||
i = 0
|
i = 0
|
||||||
|
@ -186,7 +182,7 @@ def main(model_dir=None, train_dir=None, dev_dir=None,
|
||||||
is_runtime=False,
|
is_runtime=False,
|
||||||
nr_hidden=64, max_length=100, # Shape
|
nr_hidden=64, max_length=100, # Shape
|
||||||
dropout=0.5, learn_rate=0.001, # General NN config
|
dropout=0.5, learn_rate=0.001, # General NN config
|
||||||
nb_epoch=5, batch_size=100, nr_examples=-1): # Training params
|
nb_epoch=5, batch_size=256, nr_examples=-1): # Training params
|
||||||
if model_dir is not None:
|
if model_dir is not None:
|
||||||
model_dir = pathlib.Path(model_dir)
|
model_dir = pathlib.Path(model_dir)
|
||||||
if train_dir is None or dev_dir is None:
|
if train_dir is None or dev_dir is None:
|
||||||
|
@ -219,7 +215,7 @@ def main(model_dir=None, train_dir=None, dev_dir=None,
|
||||||
if model_dir is not None:
|
if model_dir is not None:
|
||||||
with (model_dir / 'model').open('wb') as file_:
|
with (model_dir / 'model').open('wb') as file_:
|
||||||
pickle.dump(weights[1:], file_)
|
pickle.dump(weights[1:], file_)
|
||||||
with (model_dir / 'config.json').open('wb') as file_:
|
with (model_dir / 'config.json').open('w') as file_:
|
||||||
file_.write(lstm.to_json())
|
file_.write(lstm.to_json())
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,11 +2,7 @@
|
||||||
|
|
||||||
# A decomposable attention model for Natural Language Inference
|
# A decomposable attention model for Natural Language Inference
|
||||||
**by Matthew Honnibal, [@honnibal](https://github.com/honnibal)**
|
**by Matthew Honnibal, [@honnibal](https://github.com/honnibal)**
|
||||||
|
**Updated for spaCy 2.0+ and Keras 2.2.2+ by John Stewart, [@free-variation](https://github.com/free-variation)**
|
||||||
> ⚠️ **IMPORTANT NOTE:** This example is currently only compatible with spaCy
|
|
||||||
> v1.x. We're working on porting the example over to Keras v2.x and spaCy v2.x.
|
|
||||||
> See [#1445](https://github.com/explosion/spaCy/issues/1445) for details –
|
|
||||||
> contributions welcome!
|
|
||||||
|
|
||||||
This directory contains an implementation of the entailment prediction model described
|
This directory contains an implementation of the entailment prediction model described
|
||||||
by [Parikh et al. (2016)](https://arxiv.org/pdf/1606.01933.pdf). The model is notable
|
by [Parikh et al. (2016)](https://arxiv.org/pdf/1606.01933.pdf). The model is notable
|
||||||
|
@ -21,19 +17,25 @@ hook is installed to customise the `.similarity()` method of spaCy's `Doc`
|
||||||
and `Span` objects:
|
and `Span` objects:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def demo(model_dir):
|
def demo(shape):
|
||||||
nlp = spacy.load('en', path=model_dir,
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
create_pipeline=create_similarity_pipeline)
|
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
||||||
doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
|
|
||||||
doc2 = nlp(u'The milkshakes are good. The fries are bad.')
|
doc1 = nlp(u'The king of France is bald.')
|
||||||
print(doc1.similarity(doc2))
|
doc2 = nlp(u'France has no king.')
|
||||||
sent1a, sent1b = doc1.sents
|
|
||||||
print(sent1a.similarity(sent1b))
|
print("Sentence 1:", doc1)
|
||||||
print(sent1a.similarity(doc2))
|
print("Sentence 2:", doc2)
|
||||||
print(sent1b.similarity(doc2))
|
|
||||||
|
entailment_type, confidence = doc1.similarity(doc2)
|
||||||
|
print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Which gives the output `Entailment type: contradiction (Confidence: 0.60604566)`, showing that
|
||||||
|
the system has definite opinions about Bertrand Russell's [famous conundrum](https://users.drew.edu/jlenz/br-on-denoting.html)!
|
||||||
|
|
||||||
I'm working on a blog post to explain Parikh et al.'s model in more detail.
|
I'm working on a blog post to explain Parikh et al.'s model in more detail.
|
||||||
|
A [notebook](https://github.com/free-variation/spaCy/blob/master/examples/notebooks/Decompositional%20Attention.ipynb) is available that briefly explains this implementation.
|
||||||
I think it is a very interesting example of the attention mechanism, which
|
I think it is a very interesting example of the attention mechanism, which
|
||||||
I didn't understand very well before working through this paper. There are
|
I didn't understand very well before working through this paper. There are
|
||||||
lots of ways to extend the model.
|
lots of ways to extend the model.
|
||||||
|
@ -43,7 +45,7 @@ lots of ways to extend the model.
|
||||||
| File | Description |
|
| File | Description |
|
||||||
| --- | --- |
|
| --- | --- |
|
||||||
| `__main__.py` | The script that will be executed. Defines the CLI, the data reading, etc — all the boring stuff. |
|
| `__main__.py` | The script that will be executed. Defines the CLI, the data reading, etc — all the boring stuff. |
|
||||||
| `spacy_hook.py` | Provides a class `SimilarityShim` that lets you use an arbitrary function to customize spaCy's `doc.similarity()` method. Instead of the default average-of-vectors algorithm, when you call `doc1.similarity(doc2)`, you'll get the result of `your_model(doc1, doc2)`. |
|
| `spacy_hook.py` | Provides a class `KerasSimilarityShim` that lets you use an arbitrary function to customize spaCy's `doc.similarity()` method. Instead of the default average-of-vectors algorithm, when you call `doc1.similarity(doc2)`, you'll get the result of `your_model(doc1, doc2)`; see the sketch after this table. |
|
||||||
| `keras_decomposable_attention.py` | Defines the neural network model. |
|
| `keras_decomposable_attention.py` | Defines the neural network model. |
|
||||||
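The hook mechanism that `spacy_hook.py` relies on is spaCy's `user_hooks` dict on the `Doc`. A minimal sketch of that mechanism, assuming a blank English model and a hypothetical `always_one` function standing in for `your_model(doc1, doc2)`:

```python
import spacy

def always_one(doc1, doc2):
    # hypothetical stand-in for your_model(doc1, doc2)
    return 1.0

nlp = spacy.blank('en')
doc1 = nlp(u'The king of France is bald.')
doc2 = nlp(u'France has no king.')

# registering a hook replaces the default average-of-vectors similarity
doc1.user_hooks['similarity'] = always_one
print(doc1.similarity(doc2))  # 1.0
```

`KerasSimilarityShim` does the same thing, registering its `predict` method as the hook.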
|
|
||||||
## Setting up
|
## Setting up
|
||||||
|
@ -52,17 +54,13 @@ First, install [Keras](https://keras.io/), [spaCy](https://spacy.io) and the spa
|
||||||
English models (about 1GB of data):
|
English models (about 1GB of data):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install https://github.com/fchollet/keras/archive/1.2.2.zip
|
pip install keras
|
||||||
pip install spacy
|
pip install spacy
|
||||||
python -m spacy.en.download
|
python -m spacy download en_vectors_web_lg
|
||||||
```
|
```
|
||||||
|
|
||||||
⚠️ **Important:** In order for the example to run, you'll need to install Keras from
|
You'll also want to get Keras working on your GPU, and you will need a backend, such as TensorFlow or Theano.
|
||||||
the 1.2.2 release (and not via `pip install keras`). For more info on this, see
|
This will depend on your setup, so you're mostly on your own for this step. If you're using AWS, try the
|
||||||
[#727](https://github.com/explosion/spaCy/issues/727).
|
|
||||||
|
|
||||||
You'll also want to get Keras working on your GPU. This will depend on your
|
|
||||||
set up, so you're mostly on your own for this step. If you're using AWS, try the
|
|
||||||
[NVidia AMI](https://aws.amazon.com/marketplace/pp/B00FYCDDTE). It made things pretty easy.
|
[NVidia AMI](https://aws.amazon.com/marketplace/pp/B00FYCDDTE). It made things pretty easy.
|
||||||
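For example, a minimal way to get a backend installed (assuming TensorFlow; the exact package and version depend on your GPU and CUDA setup):

```bash
pip install tensorflow        # CPU-only
pip install tensorflow-gpu    # with CUDA support
```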
|
|
||||||
Once you've installed the dependencies, you can run a small preliminary test of
|
Once you've installed the dependencies, you can run a small preliminary test of
|
||||||
|
@ -80,22 +78,35 @@ Finally, download the [Stanford Natural Language Inference corpus](http://nlp.st
|
||||||
## Running the example
|
## Running the example
|
||||||
|
|
||||||
You can run the `keras_parikh_entailment/` directory as a script, which executes the file
|
You can run the `keras_parikh_entailment/` directory as a script, which executes the file
|
||||||
[`keras_parikh_entailment/__main__.py`](__main__.py). The first thing you'll want to do is train the model:
|
[`keras_parikh_entailment/__main__.py`](__main__.py). If you run the script without arguments,
|
||||||
|
the usage is shown. Running it with `-h` explains the command line arguments.
|
||||||
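For example, to print the full usage message:

```bash
python keras_parikh_entailment/ -h
```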
|
|
||||||
|
The first thing you'll want to do is train the model:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python keras_parikh_entailment/ train <train_directory> <dev_directory>
|
python keras_parikh_entailment/ train -t <path to SNLI train JSON> -s <path to SNLI dev JSON>
|
||||||
```
|
```
|
||||||
|
|
||||||
Training takes about 300 epochs for full accuracy, and I haven't rerun the full
|
Training takes about 300 epochs for full accuracy, and I haven't rerun the full
|
||||||
experiment since refactoring things to publish this example — please let me
|
experiment since refactoring things to publish this example — please let me
|
||||||
know if I've broken something. You should get to at least 85% on the development data.
|
know if I've broken something. You should get to at least 85% on the development data even after 10-15 epochs.
|
||||||
|
|
||||||
The other two modes demonstrate run-time usage. I never like relying on the accuracy printed
|
The other two modes demonstrate run-time usage. I never like relying on the accuracy printed
|
||||||
by `.fit()` methods. I never really feel confident until I've run a new process that loads
|
by `.fit()` methods. I never really feel confident until I've run a new process that loads
|
||||||
the model and starts making predictions, without access to the gold labels. I've therefore
|
the model and starts making predictions, without access to the gold labels. I've therefore
|
||||||
included an `evaluate` mode. Finally, there's also a little demo, which mostly exists to show
|
included an `evaluate` mode.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python keras_parikh_entailment/ evaluate -s <path to SNLI dev or test JSON>
|
||||||
|
```
|
||||||
|
|
||||||
|
Finally, there's also a little demo, which mostly exists to show
|
||||||
you how run-time usage will eventually look.
|
you how run-time usage will eventually look.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python keras_parikh_entailment/ demo
|
||||||
|
```
|
||||||
|
|
||||||
## Getting updates
|
## Getting updates
|
||||||
|
|
||||||
We should have the blog post explaining the model ready before the end of the week. To get
|
We should have the blog post explaining the model ready before the end of the week. To get
|
||||||
|
|
|
@ -1,82 +1,104 @@
|
||||||
from __future__ import division, unicode_literals, print_function
|
import numpy as np
|
||||||
import spacy
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
import ujson as json
|
import ujson as json
|
||||||
import numpy
|
from keras.utils import to_categorical
|
||||||
from keras.utils.np_utils import to_categorical
|
import plac
|
||||||
|
import sys
|
||||||
from spacy_hook import get_embeddings, get_word_ids
|
|
||||||
from spacy_hook import create_similarity_pipeline
|
|
||||||
|
|
||||||
from keras_decomposable_attention import build_model
|
from keras_decomposable_attention import build_model
|
||||||
|
from spacy_hook import get_embeddings, KerasSimilarityShim
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
# workaround for keras/tensorflow bug
|
||||||
|
# see https://github.com/tensorflow/tensorflow/issues/3388
|
||||||
|
import os
|
||||||
|
import importlib
|
||||||
|
from keras import backend as K
|
||||||
|
|
||||||
|
def set_keras_backend(backend):
|
||||||
|
if K.backend() != backend:
|
||||||
|
os.environ['KERAS_BACKEND'] = backend
|
||||||
|
importlib.reload(K)
|
||||||
|
assert K.backend() == backend
|
||||||
|
if backend == "tensorflow":
|
||||||
|
K.get_session().close()
|
||||||
|
cfg = K.tf.ConfigProto()
|
||||||
|
cfg.gpu_options.allow_growth = True
|
||||||
|
K.set_session(K.tf.Session(config=cfg))
|
||||||
|
K.clear_session()
|
||||||
|
|
||||||
|
set_keras_backend("tensorflow")
|
||||||
|
|
||||||
|
|
||||||
def train(train_loc, dev_loc, shape, settings):
|
def train(train_loc, dev_loc, shape, settings):
|
||||||
train_texts1, train_texts2, train_labels = read_snli(train_loc)
|
train_texts1, train_texts2, train_labels = read_snli(train_loc)
|
||||||
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
||||||
|
|
||||||
print("Loading spaCy")
|
print("Loading spaCy")
|
||||||
nlp = spacy.load('en')
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
assert nlp.path is not None
|
assert nlp.path is not None
|
||||||
|
|
||||||
|
print("Processing texts...")
|
||||||
|
train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0])
|
||||||
|
dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0])
|
||||||
|
|
||||||
print("Compiling network")
|
print("Compiling network")
|
||||||
model = build_model(get_embeddings(nlp.vocab), shape, settings)
|
model = build_model(get_embeddings(nlp.vocab), shape, settings)
|
||||||
print("Processing texts...")
|
|
||||||
Xs = []
|
|
||||||
for texts in (train_texts1, train_texts2, dev_texts1, dev_texts2):
|
|
||||||
Xs.append(get_word_ids(list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
|
|
||||||
max_length=shape[0],
|
|
||||||
rnn_encode=settings['gru_encode'],
|
|
||||||
tree_truncate=settings['tree_truncate']))
|
|
||||||
train_X1, train_X2, dev_X1, dev_X2 = Xs
|
|
||||||
print(settings)
|
print(settings)
|
||||||
model.fit(
|
model.fit(
|
||||||
[train_X1, train_X2],
|
train_X,
|
||||||
train_labels,
|
train_labels,
|
||||||
validation_data=([dev_X1, dev_X2], dev_labels),
|
validation_data = (dev_X, dev_labels),
|
||||||
nb_epoch=settings['nr_epoch'],
|
epochs = settings['nr_epoch'],
|
||||||
batch_size=settings['batch_size'])
|
batch_size = settings['batch_size'])
|
||||||
|
|
||||||
if not (nlp.path / 'similarity').exists():
|
if not (nlp.path / 'similarity').exists():
|
||||||
(nlp.path / 'similarity').mkdir()
|
(nlp.path / 'similarity').mkdir()
|
||||||
print("Saving to", nlp.path / 'similarity')
|
print("Saving to", nlp.path / 'similarity')
|
||||||
weights = model.get_weights()
|
weights = model.get_weights()
|
||||||
|
# remove the embedding matrix. We can reconstruct it.
|
||||||
|
del weights[1]
|
||||||
with (nlp.path / 'similarity' / 'model').open('wb') as file_:
|
with (nlp.path / 'similarity' / 'model').open('wb') as file_:
|
||||||
pickle.dump(weights[1:], file_)
|
pickle.dump(weights, file_)
|
||||||
with (nlp.path / 'similarity' / 'config.json').open('wb') as file_:
|
with (nlp.path / 'similarity' / 'config.json').open('w') as file_:
|
||||||
file_.write(model.to_json())
|
file_.write(model.to_json())
|
||||||
|
|
||||||
|
|
||||||
def evaluate(dev_loc):
|
def evaluate(dev_loc, shape):
|
||||||
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
|
||||||
nlp = spacy.load('en',
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
create_pipeline=create_similarity_pipeline)
|
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
||||||
|
|
||||||
total = 0.
|
total = 0.
|
||||||
correct = 0.
|
correct = 0.
|
||||||
for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):
|
for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):
|
||||||
doc1 = nlp(text1)
|
doc1 = nlp(text1)
|
||||||
doc2 = nlp(text2)
|
doc2 = nlp(text2)
|
||||||
sim = doc1.similarity(doc2)
|
sim, _ = doc1.similarity(doc2)
|
||||||
if sim.argmax() == label.argmax():
|
if sim == KerasSimilarityShim.entailment_types[label.argmax()]:
|
||||||
correct += 1
|
correct += 1
|
||||||
total += 1
|
total += 1
|
||||||
return correct, total
|
return correct, total
|
||||||
|
|
||||||
|
|
||||||
def demo():
|
def demo(shape):
|
||||||
nlp = spacy.load('en',
|
nlp = spacy.load('en_vectors_web_lg')
|
||||||
create_pipeline=create_similarity_pipeline)
|
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
|
||||||
doc1 = nlp(u'What were the best crime fiction books in 2016?')
|
|
||||||
doc2 = nlp(
|
doc1 = nlp(u'The king of France is bald.')
|
||||||
u'What should I read that was published last year? I like crime stories.')
|
doc2 = nlp(u'France has no king.')
|
||||||
print(doc1)
|
|
||||||
print(doc2)
|
print("Sentence 1:", doc1)
|
||||||
print("Similarity", doc1.similarity(doc2))
|
print("Sentence 2:", doc2)
|
||||||
|
|
||||||
|
entailment_type, confidence = doc1.similarity(doc2)
|
||||||
|
print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
|
||||||
|
|
||||||
|
|
||||||
LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
|
LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
|
||||||
|
@ -84,56 +106,92 @@ def read_snli(path):
|
||||||
texts1 = []
|
texts1 = []
|
||||||
texts2 = []
|
texts2 = []
|
||||||
labels = []
|
labels = []
|
||||||
with path.open() as file_:
|
with open(path, 'r') as file_:
|
||||||
for line in file_:
|
for line in file_:
|
||||||
eg = json.loads(line)
|
eg = json.loads(line)
|
||||||
label = eg['gold_label']
|
label = eg['gold_label']
|
||||||
if label == '-':
|
if label == '-': # per Parikh, ignore - SNLI entries
|
||||||
continue
|
continue
|
||||||
texts1.append(eg['sentence1'])
|
texts1.append(eg['sentence1'])
|
||||||
texts2.append(eg['sentence2'])
|
texts2.append(eg['sentence2'])
|
||||||
labels.append(LABELS[label])
|
labels.append(LABELS[label])
|
||||||
return texts1, texts2, to_categorical(numpy.asarray(labels, dtype='int32'))
|
return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'))
|
||||||
|
|
||||||
|
def create_dataset(nlp, texts, hypotheses, num_unk, max_length):
|
||||||
|
sents = texts + hypotheses
|
||||||
|
|
||||||
|
sents_as_ids = []
|
||||||
|
for sent in sents:
|
||||||
|
doc = nlp(sent)
|
||||||
|
word_ids = []
|
||||||
|
|
||||||
|
for i, token in enumerate(doc):
|
||||||
|
# skip odd spaces from tokenizer
|
||||||
|
if token.has_vector and token.vector_norm == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if i > max_length:
|
||||||
|
break
|
||||||
|
|
||||||
|
if token.has_vector:
|
||||||
|
word_ids.append(token.rank + num_unk + 1)
|
||||||
|
else:
|
||||||
|
# if we don't have a vector, pick an OOV entry
|
||||||
|
word_ids.append(token.rank % num_unk + 1)
|
||||||
|
|
||||||
|
# there must be a simpler way of generating padded arrays from lists...
|
||||||
|
word_id_vec = np.zeros((max_length), dtype='int')
|
||||||
|
clipped_len = min(max_length, len(word_ids))
|
||||||
|
word_id_vec[:clipped_len] = word_ids[:clipped_len]
|
||||||
|
sents_as_ids.append(word_id_vec)
|
||||||
|
|
||||||
|
|
||||||
|
return [np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])]
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
|
mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
|
||||||
train_loc=("Path to training data", "positional", None, Path),
|
train_loc=("Path to training data", "option", "t", str),
|
||||||
dev_loc=("Path to development data", "positional", None, Path),
|
dev_loc=("Path to development or test data", "option", "s", str),
|
||||||
max_length=("Length to truncate sentences", "option", "L", int),
|
max_length=("Length to truncate sentences", "option", "L", int),
|
||||||
nr_hidden=("Number of hidden units", "option", "H", int),
|
nr_hidden=("Number of hidden units", "option", "H", int),
|
||||||
dropout=("Dropout level", "option", "d", float),
|
dropout=("Dropout level", "option", "d", float),
|
||||||
learn_rate=("Learning rate", "option", "e", float),
|
learn_rate=("Learning rate", "option", "r", float),
|
||||||
batch_size=("Batch size for neural network training", "option", "b", int),
|
batch_size=("Batch size for neural network training", "option", "b", int),
|
||||||
nr_epoch=("Number of training epochs", "option", "i", int),
|
nr_epoch=("Number of training epochs", "option", "e", int),
|
||||||
tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool),
|
entail_dir=("Direction of entailment", "option", "D", str, ["both", "left", "right"])
|
||||||
gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool),
|
|
||||||
)
|
)
|
||||||
def main(mode, train_loc, dev_loc,
|
def main(mode, train_loc, dev_loc,
|
||||||
tree_truncate=False,
|
max_length = 50,
|
||||||
gru_encode=False,
|
nr_hidden = 200,
|
||||||
max_length=100,
|
dropout = 0.2,
|
||||||
nr_hidden=100,
|
learn_rate = 0.001,
|
||||||
dropout=0.2,
|
batch_size = 1024,
|
||||||
learn_rate=0.001,
|
nr_epoch = 10,
|
||||||
batch_size=100,
|
entail_dir="both"):
|
||||||
nr_epoch=5):
|
|
||||||
shape = (max_length, nr_hidden, 3)
|
shape = (max_length, nr_hidden, 3)
|
||||||
settings = {
|
settings = {
|
||||||
'lr': learn_rate,
|
'lr': learn_rate,
|
||||||
'dropout': dropout,
|
'dropout': dropout,
|
||||||
'batch_size': batch_size,
|
'batch_size': batch_size,
|
||||||
'nr_epoch': nr_epoch,
|
'nr_epoch': nr_epoch,
|
||||||
'tree_truncate': tree_truncate,
|
'entail_dir': entail_dir
|
||||||
'gru_encode': gru_encode
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if mode == 'train':
|
if mode == 'train':
|
||||||
|
if train_loc == None or dev_loc == None:
|
||||||
|
print("Train mode requires paths to training and development data sets.")
|
||||||
|
sys.exit(1)
|
||||||
train(train_loc, dev_loc, shape, settings)
|
train(train_loc, dev_loc, shape, settings)
|
||||||
elif mode == 'evaluate':
|
elif mode == 'evaluate':
|
||||||
correct, total = evaluate(dev_loc)
|
if dev_loc == None:
|
||||||
|
print("Evaluate mode requires paths to test data set.")
|
||||||
|
sys.exit(1)
|
||||||
|
correct, total = evaluate(dev_loc, shape)
|
||||||
print(correct, '/', total, correct / total)
|
print(correct, '/', total, correct / total)
|
||||||
else:
|
else:
|
||||||
demo()
|
demo(shape)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
|
|
|
@ -1,259 +1,137 @@
|
||||||
# Semantic similarity with decomposable attention (using spaCy and Keras)
|
# Semantic entailment/similarity with decomposable attention (using spaCy and Keras)
|
||||||
# Practical state-of-the-art text similarity with spaCy and Keras
|
# Practical state-of-the-art textual entailment with spaCy and Keras
|
||||||
import numpy
|
|
||||||
|
|
||||||
from keras.layers import InputSpec, Layer, Input, Dense, merge
|
|
||||||
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
|
|
||||||
from keras.layers import Bidirectional, GRU, LSTM
|
|
||||||
from keras.layers.noise import GaussianNoise
|
|
||||||
from keras.layers.advanced_activations import ELU
|
|
||||||
import keras.backend as K
|
|
||||||
from keras.models import Sequential, Model, model_from_json
|
|
||||||
from keras.regularizers import l2
|
|
||||||
from keras.optimizers import Adam
|
|
||||||
from keras.layers.normalization import BatchNormalization
|
|
||||||
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
|
|
||||||
from keras.layers import Merge
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from keras import layers, Model, models, optimizers
|
||||||
|
from keras import backend as K
|
||||||
|
|
||||||
def build_model(vectors, shape, settings):
|
def build_model(vectors, shape, settings):
|
||||||
'''Compile the model.'''
|
|
||||||
max_length, nr_hidden, nr_class = shape
|
max_length, nr_hidden, nr_class = shape
|
||||||
# Declare inputs.
|
|
||||||
ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
|
|
||||||
ids2 = Input(shape=(max_length,), dtype='int32', name='words2')
|
|
||||||
|
|
||||||
# Construct operations, which we'll chain together.
|
input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')
|
||||||
embed = _StaticEmbedding(vectors, max_length, nr_hidden, dropout=0.2, nr_tune=5000)
|
input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')
|
||||||
if settings['gru_encode']:
|
|
||||||
encode = _BiRNNEncoding(max_length, nr_hidden, dropout=settings['dropout'])
|
# embeddings (projected)
|
||||||
attend = _Attention(max_length, nr_hidden, dropout=settings['dropout'])
|
embed = create_embedding(vectors, max_length, nr_hidden)
|
||||||
align = _SoftAlignment(max_length, nr_hidden)
|
|
||||||
compare = _Comparison(max_length, nr_hidden, dropout=settings['dropout'])
|
a = embed(input1)
|
||||||
entail = _Entailment(nr_hidden, nr_class, dropout=settings['dropout'])
|
b = embed(input2)
|
||||||
|
|
||||||
|
# step 1: attend
|
||||||
|
F = create_feedforward(nr_hidden)
|
||||||
|
att_weights = layers.dot([F(a), F(b)], axes=-1)
|
||||||
|
|
||||||
|
G = create_feedforward(nr_hidden)
|
||||||
|
|
||||||
|
if settings['entail_dir'] == 'both':
|
||||||
|
norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
|
||||||
|
norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
|
||||||
|
alpha = layers.dot([norm_weights_a, a], axes=1)
|
||||||
|
beta = layers.dot([norm_weights_b, b], axes=1)
|
||||||
|
|
||||||
# Declare the model as a computational graph.
|
# step 2: compare
|
||||||
sent1 = embed(ids1) # Shape: (i, n)
|
comp1 = layers.concatenate([a, beta])
|
||||||
sent2 = embed(ids2) # Shape: (j, n)
|
comp2 = layers.concatenate([b, alpha])
|
||||||
|
v1 = layers.TimeDistributed(G)(comp1)
|
||||||
|
v2 = layers.TimeDistributed(G)(comp2)
|
||||||
|
|
||||||
if settings['gru_encode']:
|
# step 3: aggregate
|
||||||
sent1 = encode(sent1)
|
v1_sum = layers.Lambda(sum_word)(v1)
|
||||||
sent2 = encode(sent2)
|
v2_sum = layers.Lambda(sum_word)(v2)
|
||||||
|
concat = layers.concatenate([v1_sum, v2_sum])
|
||||||
|
|
||||||
attention = attend(sent1, sent2) # Shape: (i, j)
|
elif settings['entail_dir'] == 'left':
|
||||||
|
norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
|
||||||
|
alpha = layers.dot([norm_weights_a, a], axes=1)
|
||||||
|
comp2 = layers.concatenate([b, alpha])
|
||||||
|
v2 = layers.TimeDistributed(G)(comp2)
|
||||||
|
v2_sum = layers.Lambda(sum_word)(v2)
|
||||||
|
concat = v2_sum
|
||||||
|
|
||||||
align1 = align(sent2, attention)
|
else:
|
||||||
align2 = align(sent1, attention, transpose=True)
|
norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
|
||||||
|
beta = layers.dot([norm_weights_b, b], axes=1)
|
||||||
feats1 = compare(sent1, align1)
|
comp1 = layers.concatenate([a, beta])
|
||||||
feats2 = compare(sent2, align2)
|
v1 = layers.TimeDistributed(G)(comp1)
|
||||||
|
v1_sum = layers.Lambda(sum_word)(v1)
|
||||||
scores = entail(feats1, feats2)
|
concat = v1_sum
|
||||||
|
|
||||||
# Now that we have the input/output, we can construct the Model object...
|
H = create_feedforward(nr_hidden)
|
||||||
model = Model(input=[ids1, ids2], output=[scores])
|
out = H(concat)
|
||||||
|
out = layers.Dense(nr_class, activation='softmax')(out)
|
||||||
# ...Compile it...
|
|
||||||
|
model = Model([input1, input2], out)
|
||||||
|
|
||||||
model.compile(
|
model.compile(
|
||||||
optimizer=Adam(lr=settings['lr']),
|
optimizer=optimizers.Adam(lr=settings['lr']),
|
||||||
loss='categorical_crossentropy',
|
loss='categorical_crossentropy',
|
||||||
metrics=['accuracy'])
|
metrics=['accuracy'])
|
||||||
# ...And return it for training.
|
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
class _StaticEmbedding(object):
|
def create_embedding(vectors, max_length, projected_dim):
|
||||||
def __init__(self, vectors, max_length, nr_out, nr_tune=1000, dropout=0.0):
|
return models.Sequential([
|
||||||
self.nr_out = nr_out
|
layers.Embedding(
|
||||||
self.max_length = max_length
|
vectors.shape[0],
|
||||||
self.embed = Embedding(
|
vectors.shape[1],
|
||||||
vectors.shape[0],
|
input_length=max_length,
|
||||||
vectors.shape[1],
|
weights=[vectors],
|
||||||
input_length=max_length,
|
trainable=False),
|
||||||
weights=[vectors],
|
|
||||||
name='embed',
|
layers.TimeDistributed(
|
||||||
trainable=False)
|
layers.Dense(projected_dim,
|
||||||
self.tune = Embedding(
|
activation=None,
|
||||||
nr_tune,
|
use_bias=False))
|
||||||
nr_out,
|
])
|
||||||
input_length=max_length,
|
|
||||||
weights=None,
|
|
||||||
name='tune',
|
|
||||||
trainable=True,
|
|
||||||
dropout=dropout)
|
|
||||||
self.mod_ids = Lambda(lambda sent: sent % (nr_tune-1)+1,
|
|
||||||
output_shape=(self.max_length,))
|
|
||||||
|
|
||||||
self.project = TimeDistributed(
|
def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):
|
||||||
Dense(
|
return models.Sequential([
|
||||||
nr_out,
|
layers.Dense(num_units, activation=activation),
|
||||||
activation=None,
|
layers.Dropout(dropout_rate),
|
||||||
bias=False,
|
layers.Dense(num_units, activation=activation),
|
||||||
name='project'))
|
layers.Dropout(dropout_rate)
|
||||||
|
])
|
||||||
def __call__(self, sentence):
|
|
||||||
def get_output_shape(shapes):
|
|
||||||
print(shapes)
|
|
||||||
return shapes[0]
|
|
||||||
mod_sent = self.mod_ids(sentence)
|
|
||||||
tuning = self.tune(mod_sent)
|
|
||||||
#tuning = merge([tuning, mod_sent],
|
|
||||||
# mode=lambda AB: AB[0] * (K.clip(K.cast(AB[1], 'float32'), 0, 1)),
|
|
||||||
# output_shape=(self.max_length, self.nr_out))
|
|
||||||
pretrained = self.project(self.embed(sentence))
|
|
||||||
vectors = merge([pretrained, tuning], mode='sum')
|
|
||||||
return vectors
|
|
||||||
|
|
||||||
|
|
||||||
class _BiRNNEncoding(object):
|
def normalizer(axis):
|
||||||
def __init__(self, max_length, nr_out, dropout=0.0):
|
def _normalize(att_weights):
|
||||||
self.model = Sequential()
|
exp_weights = K.exp(att_weights)
|
||||||
self.model.add(Bidirectional(LSTM(nr_out, return_sequences=True,
|
sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)
|
||||||
dropout_W=dropout, dropout_U=dropout),
|
return exp_weights/sum_weights
|
||||||
input_shape=(max_length, nr_out)))
|
return _normalize
|
||||||
self.model.add(TimeDistributed(Dense(nr_out, activation='relu', init='he_normal')))
|
|
||||||
self.model.add(TimeDistributed(Dropout(0.2)))
|
|
||||||
|
|
||||||
def __call__(self, sentence):
|
def sum_word(x):
|
||||||
return self.model(sentence)
|
return K.sum(x, axis=1)
|
||||||
|
|
||||||
|
|
||||||
class _Attention(object):
|
|
||||||
def __init__(self, max_length, nr_hidden, dropout=0.0, L2=0.0, activation='relu'):
|
|
||||||
self.max_length = max_length
|
|
||||||
self.model = Sequential()
|
|
||||||
self.model.add(Dropout(dropout, input_shape=(nr_hidden,)))
|
|
||||||
self.model.add(
|
|
||||||
Dense(nr_hidden, name='attend1',
|
|
||||||
init='he_normal', W_regularizer=l2(L2),
|
|
||||||
input_shape=(nr_hidden,), activation='relu'))
|
|
||||||
self.model.add(Dropout(dropout))
|
|
||||||
self.model.add(Dense(nr_hidden, name='attend2',
|
|
||||||
init='he_normal', W_regularizer=l2(L2), activation='relu'))
|
|
||||||
self.model = TimeDistributed(self.model)
|
|
||||||
|
|
||||||
def __call__(self, sent1, sent2):
|
|
||||||
def _outer(AB):
|
|
||||||
att_ji = K.batch_dot(AB[1], K.permute_dimensions(AB[0], (0, 2, 1)))
|
|
||||||
return K.permute_dimensions(att_ji,(0, 2, 1))
|
|
||||||
return merge(
|
|
||||||
[self.model(sent1), self.model(sent2)],
|
|
||||||
mode=_outer,
|
|
||||||
output_shape=(self.max_length, self.max_length))
|
|
||||||
|
|
||||||
|
|
||||||
class _SoftAlignment(object):
|
|
||||||
def __init__(self, max_length, nr_hidden):
|
|
||||||
self.max_length = max_length
|
|
||||||
self.nr_hidden = nr_hidden
|
|
||||||
|
|
||||||
def __call__(self, sentence, attention, transpose=False):
|
|
||||||
def _normalize_attention(attmat):
|
|
||||||
att = attmat[0]
|
|
||||||
mat = attmat[1]
|
|
||||||
if transpose:
|
|
||||||
att = K.permute_dimensions(att,(0, 2, 1))
|
|
||||||
# 3d softmax
|
|
||||||
e = K.exp(att - K.max(att, axis=-1, keepdims=True))
|
|
||||||
s = K.sum(e, axis=-1, keepdims=True)
|
|
||||||
sm_att = e / s
|
|
||||||
return K.batch_dot(sm_att, mat)
|
|
||||||
return merge([attention, sentence], mode=_normalize_attention,
|
|
||||||
output_shape=(self.max_length, self.nr_hidden)) # Shape: (i, n)
|
|
||||||
|
|
||||||
|
|
||||||
class _Comparison(object):
|
|
||||||
def __init__(self, words, nr_hidden, L2=0.0, dropout=0.0):
|
|
||||||
self.words = words
|
|
||||||
self.model = Sequential()
|
|
||||||
self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
|
|
||||||
self.model.add(Dense(nr_hidden, name='compare1',
|
|
||||||
init='he_normal', W_regularizer=l2(L2)))
|
|
||||||
self.model.add(Activation('relu'))
|
|
||||||
self.model.add(Dropout(dropout))
|
|
||||||
self.model.add(Dense(nr_hidden, name='compare2',
|
|
||||||
W_regularizer=l2(L2), init='he_normal'))
|
|
||||||
self.model.add(Activation('relu'))
|
|
||||||
self.model = TimeDistributed(self.model)
|
|
||||||
|
|
||||||
def __call__(self, sent, align, **kwargs):
|
|
||||||
result = self.model(merge([sent, align], mode='concat')) # Shape: (i, n)
|
|
||||||
avged = GlobalAveragePooling1D()(result, mask=self.words)
|
|
||||||
maxed = GlobalMaxPooling1D()(result, mask=self.words)
|
|
||||||
merged = merge([avged, maxed])
|
|
||||||
result = BatchNormalization()(merged)
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
class _Entailment(object):
|
|
||||||
def __init__(self, nr_hidden, nr_out, dropout=0.0, L2=0.0):
|
|
||||||
self.model = Sequential()
|
|
||||||
self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
|
|
||||||
self.model.add(Dense(nr_hidden, name='entail1',
|
|
||||||
init='he_normal', W_regularizer=l2(L2)))
|
|
||||||
self.model.add(Activation('relu'))
|
|
||||||
self.model.add(Dropout(dropout))
|
|
||||||
self.model.add(Dense(nr_hidden, name='entail2',
|
|
||||||
init='he_normal', W_regularizer=l2(L2)))
|
|
||||||
self.model.add(Activation('relu'))
|
|
||||||
self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
|
|
||||||
W_regularizer=l2(L2), init='zero'))
|
|
||||||
|
|
||||||
def __call__(self, feats1, feats2):
|
|
||||||
features = merge([feats1, feats2], mode='concat')
|
|
||||||
return self.model(features)
|
|
||||||
|
|
||||||
|
|
||||||
class _GlobalSumPooling1D(Layer):
|
|
||||||
'''Global sum pooling operation for temporal data.
|
|
||||||
|
|
||||||
# Input shape
|
|
||||||
3D tensor with shape: `(samples, steps, features)`.
|
|
||||||
|
|
||||||
# Output shape
|
|
||||||
2D tensor with shape: `(samples, features)`.
|
|
||||||
'''
|
|
||||||
def __init__(self, **kwargs):
|
|
||||||
super(_GlobalSumPooling1D, self).__init__(**kwargs)
|
|
||||||
self.input_spec = [InputSpec(ndim=3)]
|
|
||||||
|
|
||||||
def get_output_shape_for(self, input_shape):
|
|
||||||
return (input_shape[0], input_shape[2])
|
|
||||||
|
|
||||||
def call(self, x, mask=None):
|
|
||||||
if mask is not None:
|
|
||||||
return K.sum(x * K.clip(mask, 0, 1), axis=1)
|
|
||||||
else:
|
|
||||||
return K.sum(x, axis=1)
|
|
||||||
|
|
||||||
|
|
||||||
def test_build_model():
|
def test_build_model():
|
||||||
vectors = numpy.ndarray((100, 8), dtype='float32')
|
vectors = np.ndarray((100, 8), dtype='float32')
|
||||||
shape = (10, 16, 3)
|
shape = (10, 16, 3)
|
||||||
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True}
|
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True, 'entail_dir':'both'}
|
||||||
model = build_model(vectors, shape, settings)
|
model = build_model(vectors, shape, settings)
|
||||||
|
|
||||||
|
|
||||||
def test_fit_model():
|
def test_fit_model():
|
||||||
|
|
||||||
def _generate_X(nr_example, length, nr_vector):
|
def _generate_X(nr_example, length, nr_vector):
|
||||||
X1 = numpy.ndarray((nr_example, length), dtype='int32')
|
X1 = np.ndarray((nr_example, length), dtype='int32')
|
||||||
X1 *= X1 < nr_vector
|
X1 *= X1 < nr_vector
|
||||||
X1 *= 0 <= X1
|
X1 *= 0 <= X1
|
||||||
X2 = numpy.ndarray((nr_example, length), dtype='int32')
|
X2 = np.ndarray((nr_example, length), dtype='int32')
|
||||||
X2 *= X2 < nr_vector
|
X2 *= X2 < nr_vector
|
||||||
X2 *= 0 <= X2
|
X2 *= 0 <= X2
|
||||||
return [X1, X2]
|
return [X1, X2]
|
||||||
|
|
||||||
def _generate_Y(nr_example, nr_class):
|
def _generate_Y(nr_example, nr_class):
|
||||||
ys = numpy.zeros((nr_example, nr_class), dtype='int32')
|
ys = np.zeros((nr_example, nr_class), dtype='int32')
|
||||||
for i in range(nr_example):
|
for i in range(nr_example):
|
||||||
ys[i, i % nr_class] = 1
|
ys[i, i % nr_class] = 1
|
||||||
return ys
|
return ys
|
||||||
|
|
||||||
vectors = numpy.ndarray((100, 8), dtype='float32')
|
vectors = np.ndarray((100, 8), dtype='float32')
|
||||||
shape = (10, 16, 3)
|
shape = (10, 16, 3)
|
||||||
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True}
|
settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True, 'entail_dir':'both'}
|
||||||
model = build_model(vectors, shape, settings)
|
model = build_model(vectors, shape, settings)
|
||||||
|
|
||||||
train_X = _generate_X(20, shape[0], vectors.shape[0])
|
train_X = _generate_X(20, shape[0], vectors.shape[0])
|
||||||
|
@ -261,8 +139,7 @@ def test_fit_model():
|
||||||
dev_X = _generate_X(15, shape[0], vectors.shape[0])
|
dev_X = _generate_X(15, shape[0], vectors.shape[0])
|
||||||
dev_Y = _generate_Y(15, shape[2])
|
dev_Y = _generate_Y(15, shape[2])
|
||||||
|
|
||||||
model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
|
model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), epochs=5, batch_size=4)
|
||||||
batch_size=4)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [build_model]
|
__all__ = [build_model]
|
||||||
|
|
|
@ -1,8 +1,5 @@
|
||||||
|
import numpy as np
|
||||||
from keras.models import model_from_json
|
from keras.models import model_from_json
|
||||||
import numpy
|
|
||||||
import numpy.random
|
|
||||||
import json
|
|
||||||
from spacy.tokens.span import Span
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
|
@ -11,16 +8,23 @@ except ImportError:
|
||||||
|
|
||||||
|
|
||||||
class KerasSimilarityShim(object):
|
class KerasSimilarityShim(object):
|
||||||
|
entailment_types = ["entailment", "contradiction", "neutral"]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, nlp, get_features=None, max_length=100):
|
def load(cls, path, nlp, max_length=100, get_features=None):
|
||||||
|
|
||||||
if get_features is None:
|
if get_features is None:
|
||||||
get_features = get_word_ids
|
get_features = get_word_ids
|
||||||
|
|
||||||
with (path / 'config.json').open() as file_:
|
with (path / 'config.json').open() as file_:
|
||||||
model = model_from_json(file_.read())
|
model = model_from_json(file_.read())
|
||||||
with (path / 'model').open('rb') as file_:
|
with (path / 'model').open('rb') as file_:
|
||||||
weights = pickle.load(file_)
|
weights = pickle.load(file_)
|
||||||
|
|
||||||
embeddings = get_embeddings(nlp.vocab)
|
embeddings = get_embeddings(nlp.vocab)
|
||||||
model.set_weights([embeddings] + weights)
|
weights.insert(1, embeddings)
|
||||||
|
model.set_weights(weights)
|
||||||
|
|
||||||
return cls(model, get_features=get_features, max_length=max_length)
|
return cls(model, get_features=get_features, max_length=max_length)
|
||||||
|
|
||||||
def __init__(self, model, get_features=None, max_length=100):
|
def __init__(self, model, get_features=None, max_length=100):
|
||||||
|
@ -32,58 +36,42 @@ class KerasSimilarityShim(object):
|
||||||
doc.user_hooks['similarity'] = self.predict
|
doc.user_hooks['similarity'] = self.predict
|
||||||
doc.user_span_hooks['similarity'] = self.predict
|
doc.user_span_hooks['similarity'] = self.predict
|
||||||
|
|
||||||
|
return doc
|
||||||
|
|
||||||
def predict(self, doc1, doc2):
|
def predict(self, doc1, doc2):
|
||||||
x1 = self.get_features([doc1], max_length=self.max_length, tree_truncate=True)
|
x1 = self.get_features([doc1], max_length=self.max_length)
|
||||||
x2 = self.get_features([doc2], max_length=self.max_length, tree_truncate=True)
|
x2 = self.get_features([doc2], max_length=self.max_length)
|
||||||
scores = self.model.predict([x1, x2])
|
scores = self.model.predict([x1, x2])
|
||||||
return scores[0]
|
|
||||||
|
return self.entailment_types[scores.argmax()], scores.max()
|
||||||
|
|
||||||
|
|
||||||
def get_embeddings(vocab, nr_unk=100):
|
def get_embeddings(vocab, nr_unk=100):
|
||||||
nr_vector = max(lex.rank for lex in vocab) + 1
|
# the extra +1 is for a zero vector representing sentence-final padding
|
||||||
vectors = numpy.zeros((nr_vector+nr_unk+2, vocab.vectors_length), dtype='float32')
|
num_vectors = max(lex.rank for lex in vocab) + 2
|
||||||
|
|
||||||
|
# create random vectors for OOV tokens
|
||||||
|
oov = np.random.normal(size=(nr_unk, vocab.vectors_length))
|
||||||
|
oov = oov / oov.sum(axis=1, keepdims=True)
|
||||||
|
|
||||||
|
vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype='float32')
|
||||||
|
vectors[1:(nr_unk + 1), ] = oov
|
||||||
for lex in vocab:
|
for lex in vocab:
|
||||||
if lex.has_vector:
|
if lex.has_vector and lex.vector_norm > 0:
|
||||||
vectors[lex.rank+1] = lex.vector / lex.vector_norm
|
vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm
|
||||||
|
|
||||||
return vectors
|
return vectors
|
||||||
|
|
||||||
|
|
||||||
def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
|
def get_word_ids(docs, max_length=100, nr_unk=100):
|
||||||
Xs = numpy.zeros((len(docs), max_length), dtype='int32')
|
Xs = np.zeros((len(docs), max_length), dtype='int32')
|
||||||
|
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
if tree_truncate:
|
for j, token in enumerate(doc):
|
||||||
if isinstance(doc, Span):
|
if j == max_length:
|
||||||
queue = [doc.root]
|
|
||||||
else:
|
|
||||||
queue = [sent.root for sent in doc.sents]
|
|
||||||
else:
|
|
||||||
queue = list(doc)
|
|
||||||
words = []
|
|
||||||
while len(words) <= max_length and queue:
|
|
||||||
word = queue.pop(0)
|
|
||||||
if rnn_encode or (not word.is_punct and not word.is_space):
|
|
||||||
words.append(word)
|
|
||||||
if tree_truncate:
|
|
||||||
queue.extend(list(word.lefts))
|
|
||||||
queue.extend(list(word.rights))
|
|
||||||
words.sort()
|
|
||||||
for j, token in enumerate(words):
|
|
||||||
if token.has_vector:
|
|
||||||
Xs[i, j] = token.rank+1
|
|
||||||
else:
|
|
||||||
Xs[i, j] = (token.shape % (nr_unk-1))+2
|
|
||||||
j += 1
|
|
||||||
if j >= max_length:
|
|
||||||
break
|
break
|
||||||
else:
|
if token.has_vector:
|
||||||
Xs[i, len(words)] = 1
|
Xs[i, j] = token.rank + nr_unk + 1
|
||||||
|
else:
|
||||||
|
Xs[i, j] = token.rank % nr_unk + 1
|
||||||
return Xs
|
return Xs
|
||||||
|
|
||||||
|
|
||||||
def create_similarity_pipeline(nlp, max_length=100):
|
|
||||||
return [
|
|
||||||
nlp.tagger,
|
|
||||||
nlp.entity,
|
|
||||||
nlp.parser,
|
|
||||||
KerasSimilarityShim.load(nlp.path / 'similarity', nlp, max_length)
|
|
||||||
]
|
|
||||||
|
|
955
examples/notebooks/Decompositional Attention.ipynb
Normal file
955
examples/notebooks/Decompositional Attention.ipynb
Normal file
|
@ -0,0 +1,955 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Natural language inference using spaCy and Keras"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Introduction"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This notebook details an implementation of the natural language inference model presented in [(Parikh et al, 2016)](https://arxiv.org/abs/1606.01933). The model is notable for the small number of paramaters *and hyperparameters* it specifices, while still yielding good performance."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Constructing the dataset"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import spacy\n",
|
||||||
|
"import numpy as np"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We only need the GloVe vectors from spaCy, not a full NLP pipeline."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"nlp = spacy.load('en_vectors_web_lg')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Function to load the SNLI dataset. The categories are converted to one-shot representation. The function comes from an example in spaCy."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/jds/tensorflow-gpu/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
|
||||||
|
" from ._conv import register_converters as _register_converters\n",
|
||||||
|
"Using TensorFlow backend.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import ujson as json\n",
|
||||||
|
"from keras.utils import to_categorical\n",
|
||||||
|
"\n",
|
||||||
|
"LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",
|
||||||
|
"def read_snli(path):\n",
|
||||||
|
" texts1 = []\n",
|
||||||
|
" texts2 = []\n",
|
||||||
|
" labels = []\n",
|
||||||
|
" with open(path, 'r') as file_:\n",
|
||||||
|
" for line in file_:\n",
|
||||||
|
" eg = json.loads(line)\n",
|
||||||
|
" label = eg['gold_label']\n",
|
||||||
|
" if label == '-': # per Parikh, ignore - SNLI entries\n",
|
||||||
|
" continue\n",
|
||||||
|
" texts1.append(eg['sentence1'])\n",
|
||||||
|
" texts2.append(eg['sentence2'])\n",
|
||||||
|
" labels.append(LABELS[label])\n",
|
||||||
|
" return texts1, texts2, to_categorical(np.asarray(labels, dtype='int32'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Because Keras can do the train/test split for us, we'll load *all* SNLI triples from one file."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"texts,hypotheses,labels = read_snli('snli/snli_1.0_train.jsonl')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def create_dataset(nlp, texts, hypotheses, num_oov, max_length, norm_vectors = True):\n",
|
||||||
|
" sents = texts + hypotheses\n",
|
||||||
|
" \n",
|
||||||
|
" # the extra +1 is for a zero vector represting NULL for padding\n",
|
||||||
|
" num_vectors = max(lex.rank for lex in nlp.vocab) + 2 \n",
|
||||||
|
" \n",
|
||||||
|
" # create random vectors for OOV tokens\n",
|
||||||
|
" oov = np.random.normal(size=(num_oov, nlp.vocab.vectors_length))\n",
|
||||||
|
" oov = oov / oov.sum(axis=1, keepdims=True)\n",
|
||||||
|
" \n",
|
||||||
|
" vectors = np.zeros((num_vectors + num_oov, nlp.vocab.vectors_length), dtype='float32')\n",
|
||||||
|
" vectors[num_vectors:, ] = oov\n",
|
||||||
|
" for lex in nlp.vocab:\n",
|
||||||
|
" if lex.has_vector and lex.vector_norm > 0:\n",
|
||||||
|
" vectors[lex.rank + 1] = lex.vector / lex.vector_norm if norm_vectors == True else lex.vector\n",
|
||||||
|
" \n",
|
||||||
|
" sents_as_ids = []\n",
|
||||||
|
" for sent in sents:\n",
|
||||||
|
" doc = nlp(sent)\n",
|
||||||
|
" word_ids = []\n",
|
||||||
|
" \n",
|
||||||
|
" for i, token in enumerate(doc):\n",
|
||||||
|
" # skip odd spaces from tokenizer\n",
|
||||||
|
" if token.has_vector and token.vector_norm == 0:\n",
|
||||||
|
" continue\n",
|
||||||
|
" \n",
|
||||||
|
" if i > max_length:\n",
|
||||||
|
" break\n",
|
||||||
|
" \n",
|
||||||
|
" if token.has_vector:\n",
|
||||||
|
" word_ids.append(token.rank + 1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" # if we don't have a vector, pick an OOV entry\n",
|
||||||
|
" word_ids.append(token.rank % num_oov + num_vectors) \n",
|
||||||
|
" \n",
|
||||||
|
" # there must be a simpler way of generating padded arrays from lists...\n",
|
||||||
|
" word_id_vec = np.zeros((max_length), dtype='int')\n",
|
||||||
|
" clipped_len = min(max_length, len(word_ids))\n",
|
||||||
|
" word_id_vec[:clipped_len] = word_ids[:clipped_len]\n",
|
||||||
|
" sents_as_ids.append(word_id_vec)\n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" return vectors, np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"sem_vectors, text_vectors, hypothesis_vectors = create_dataset(nlp, texts, hypotheses, 100, 50, True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"texts_test,hypotheses_test,labels_test = read_snli('snli/snli_1.0_test.jsonl')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"_, text_vectors_test, hypothesis_vectors_test = create_dataset(nlp, texts_test, hypotheses_test, 100, 50, True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We use spaCy to tokenize the sentences and return, when available, a semantic vector for each token. \n",
|
||||||
|
"\n",
|
||||||
|
"OOV terms (tokens for which no semantic vector is available) are assigned to one of a set of randomly-generated OOV vectors, per (Parikh et al, 2016).\n"
|
||||||
|
]
|
||||||
|
},
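As a reading aid, here is a minimal sketch (ours, not part of the notebook; the helper name `token_index` is made up) of the index scheme that `create_dataset` above uses when mapping tokens to rows of the vector table:

```python
def token_index(rank, has_vector, num_vectors, num_oov):
    # Row 0 is reserved for the zero NULL/padding vector, so lexemes with a
    # vector map to rank + 1; tokens without a vector are hashed by rank into
    # the block of `num_oov` random vectors appended after the real ones.
    if has_vector:
        return rank + 1
    return rank % num_oov + num_vectors
```

A padded sentence is then just a length-`max_length` array of such indices, with trailing zeros pointing at the NULL vector.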
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Note that we will clip sentences to 50 words maximum."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from keras import layers, Model, models\n",
|
||||||
|
"from keras import backend as K"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Building the model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The embedding layer copies the 300-dimensional GloVe vectors into GPU memory. Per (Parikh et al, 2016), the vectors, which are not adapted during training, are projected down to lower-dimensional vectors using a trained projection matrix."
|
||||||
|
]
|
||||||
|
},
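In symbols (our notation, not the paper's), with $d = \texttt{projected\_dim}$ (200 in this notebook) and $e(w_i)$ the frozen 300-dimensional GloVe vector of word $w_i$, the cell below computes

$$\bar{a}_i = W_p^{\top} e(w_i), \qquad W_p \in \mathbb{R}^{300 \times d},$$

so only the $300 \times d$ projection is trained, while the embedding table itself stays fixed.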
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def create_embedding(vectors, max_length, projected_dim):\n",
|
||||||
|
" return models.Sequential([\n",
|
||||||
|
" layers.Embedding(\n",
|
||||||
|
" vectors.shape[0],\n",
|
||||||
|
" vectors.shape[1],\n",
|
||||||
|
" input_length=max_length,\n",
|
||||||
|
" weights=[vectors],\n",
|
||||||
|
" trainable=False),\n",
|
||||||
|
" \n",
|
||||||
|
" layers.TimeDistributed(\n",
|
||||||
|
" layers.Dense(projected_dim,\n",
|
||||||
|
" activation=None,\n",
|
||||||
|
" use_bias=False))\n",
|
||||||
|
" ])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The Parikh model makes use of three feedforward blocks that construct nonlinear combinations of their input. Each block contains two ReLU layers and two dropout layers."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def create_feedforward(num_units=200, activation='relu', dropout_rate=0.2):\n",
|
||||||
|
" return models.Sequential([\n",
|
||||||
|
" layers.Dense(num_units, activation=activation),\n",
|
||||||
|
" layers.Dropout(dropout_rate),\n",
|
||||||
|
" layers.Dense(num_units, activation=activation),\n",
|
||||||
|
" layers.Dropout(dropout_rate)\n",
|
||||||
|
" ])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The basic idea of the (Parikh et al, 2016) model is to:\n",
|
||||||
|
"\n",
|
||||||
|
"1. *Align*: Construct an alignment of subphrases in the text and hypothesis using an attention-like mechanism, called \"decompositional\" because the layer is applied to each of the two sentences individually rather than to their product. The dot product of the nonlinear transformations of the inputs is then normalized vertically and horizontally to yield a pair of \"soft\" alignment structures, from text->hypothesis and hypothesis->text. Concretely, for each word in one sentence, a multinomial distribution is computed over the words of the other sentence, by learning a multinomial logistic with softmax target.\n",
|
||||||
|
"2. *Compare*: Each word is now compared to its aligned phrase using a function modeled as a two-layer feedforward ReLU network. The output is a high-dimensional representation of the strength of association between word and aligned phrase.\n",
|
||||||
|
"3. *Aggregate*: The comparison vectors are summed, separately, for the text and the hypothesis. The result is two vectors: one that describes the degree of association of the text to the hypothesis, and the second, of the hypothesis to the text.\n",
|
||||||
|
"4. Finally, these two vectors are processed by a dense layer followed by a softmax classifier, as usual.\n",
|
||||||
|
"\n",
|
||||||
|
"Note that because in entailment the truth conditions of the consequent must be a subset of those of the antecedent, it is not obvious that we need both vectors in step (3). Entailment is not symmetric. It may be enough to just use the hypothesis->text vector. We will explore this possibility later."
|
||||||
|
]
|
||||||
|
},
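For reference, the steps above can be written compactly (our notation, following Parikh et al's equations; $\bar a_i$ and $\bar b_j$ are the projected embeddings of the text and hypothesis, and $[\cdot;\cdot]$ denotes concatenation):

$$
e_{ij} = F(\bar a_i)^{\top} F(\bar b_j), \qquad
\beta_i = \sum_j \frac{\exp(e_{ij})}{\sum_k \exp(e_{ik})}\,\bar b_j, \qquad
\alpha_j = \sum_i \frac{\exp(e_{ij})}{\sum_k \exp(e_{kj})}\,\bar a_i
$$

$$
v_1 = \sum_i G([\bar a_i;\, \beta_i]), \qquad
v_2 = \sum_j G([\bar b_j;\, \alpha_j]), \qquad
\hat y = \operatorname{softmax}\!\big(\mathrm{Dense}(H([v_1;\, v_2]))\big).
$$

In the code below, the two `normalizer` Lambda layers implement the row-wise and column-wise softmaxes, and `sum_word` implements the aggregation sums.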
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We need a couple of little functions for Lambda layers to normalize and aggregate weights:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def normalizer(axis):\n",
|
||||||
|
" def _normalize(att_weights):\n",
|
||||||
|
" exp_weights = K.exp(att_weights)\n",
|
||||||
|
" sum_weights = K.sum(exp_weights, axis=axis, keepdims=True)\n",
|
||||||
|
" return exp_weights/sum_weights\n",
|
||||||
|
" return _normalize\n",
|
||||||
|
"\n",
|
||||||
|
"def sum_word(x):\n",
|
||||||
|
" return K.sum(x, axis=1)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def build_model(vectors, max_length, num_hidden, num_classes, projected_dim, entail_dir='both'):\n",
|
||||||
|
" input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')\n",
|
||||||
|
" input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')\n",
|
||||||
|
" \n",
|
||||||
|
" # embeddings (projected)\n",
|
||||||
|
" embed = create_embedding(vectors, max_length, projected_dim)\n",
|
||||||
|
" \n",
|
||||||
|
" a = embed(input1)\n",
|
||||||
|
" b = embed(input2)\n",
|
||||||
|
" \n",
|
||||||
|
" # step 1: attend\n",
|
||||||
|
" F = create_feedforward(num_hidden)\n",
|
||||||
|
" att_weights = layers.dot([F(a), F(b)], axes=-1)\n",
|
||||||
|
" \n",
|
||||||
|
" G = create_feedforward(num_hidden)\n",
|
||||||
|
" \n",
|
||||||
|
" if entail_dir == 'both':\n",
|
||||||
|
" norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n",
|
||||||
|
" norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n",
|
||||||
|
" alpha = layers.dot([norm_weights_a, a], axes=1)\n",
|
||||||
|
" beta = layers.dot([norm_weights_b, b], axes=1)\n",
|
||||||
|
"\n",
|
||||||
|
" # step 2: compare\n",
|
||||||
|
" comp1 = layers.concatenate([a, beta])\n",
|
||||||
|
" comp2 = layers.concatenate([b, alpha])\n",
|
||||||
|
" v1 = layers.TimeDistributed(G)(comp1)\n",
|
||||||
|
" v2 = layers.TimeDistributed(G)(comp2)\n",
|
||||||
|
"\n",
|
||||||
|
" # step 3: aggregate\n",
|
||||||
|
" v1_sum = layers.Lambda(sum_word)(v1)\n",
|
||||||
|
" v2_sum = layers.Lambda(sum_word)(v2)\n",
|
||||||
|
" concat = layers.concatenate([v1_sum, v2_sum])\n",
|
||||||
|
" elif entail_dir == 'left':\n",
|
||||||
|
" norm_weights_a = layers.Lambda(normalizer(1))(att_weights)\n",
|
||||||
|
" alpha = layers.dot([norm_weights_a, a], axes=1)\n",
|
||||||
|
" comp2 = layers.concatenate([b, alpha])\n",
|
||||||
|
" v2 = layers.TimeDistributed(G)(comp2)\n",
|
||||||
|
" v2_sum = layers.Lambda(sum_word)(v2)\n",
|
||||||
|
" concat = v2_sum\n",
|
||||||
|
" else:\n",
|
||||||
|
" norm_weights_b = layers.Lambda(normalizer(2))(att_weights)\n",
|
||||||
|
" beta = layers.dot([norm_weights_b, b], axes=1)\n",
|
||||||
|
" comp1 = layers.concatenate([a, beta])\n",
|
||||||
|
" v1 = layers.TimeDistributed(G)(comp1)\n",
|
||||||
|
" v1_sum = layers.Lambda(sum_word)(v1)\n",
|
||||||
|
" concat = v1_sum\n",
|
||||||
|
" \n",
|
||||||
|
" H = create_feedforward(num_hidden)\n",
|
||||||
|
" out = H(concat)\n",
|
||||||
|
" out = layers.Dense(num_classes, activation='softmax')(out)\n",
|
||||||
|
" \n",
|
||||||
|
" model = Model([input1, input2], out)\n",
|
||||||
|
" \n",
|
||||||
|
" model.compile(optimizer='adam',\n",
|
||||||
|
" loss='categorical_crossentropy',\n",
|
||||||
|
" metrics=['accuracy'])\n",
|
||||||
|
" return model\n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"Layer (type) Output Shape Param # Connected to \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"words1 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"words2 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_1 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
||||||
|
" words2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_2 (Sequential) (None, 50, 200) 80400 sequential_1[1][0] \n",
|
||||||
|
" sequential_1[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_1 (Dot) (None, 50, 50) 0 sequential_2[1][0] \n",
|
||||||
|
" sequential_2[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_2 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_1 (Lambda) (None, 50, 50) 0 dot_1[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_3 (Dot) (None, 50, 200) 0 lambda_2[0][0] \n",
|
||||||
|
" sequential_1[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_2 (Dot) (None, 50, 200) 0 lambda_1[0][0] \n",
|
||||||
|
" sequential_1[1][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_1 (Concatenate) (None, 50, 400) 0 sequential_1[1][0] \n",
|
||||||
|
" dot_3[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_2 (Concatenate) (None, 50, 400) 0 sequential_1[2][0] \n",
|
||||||
|
" dot_2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"time_distributed_2 (TimeDistrib (None, 50, 200) 120400 concatenate_1[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"time_distributed_3 (TimeDistrib (None, 50, 200) 120400 concatenate_2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_3 (Lambda) (None, 200) 0 time_distributed_2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_4 (Lambda) (None, 200) 0 time_distributed_3[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_3 (Concatenate) (None, 400) 0 lambda_3[0][0] \n",
|
||||||
|
" lambda_4[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_4 (Sequential) (None, 200) 120400 concatenate_3[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dense_8 (Dense) (None, 3) 603 sequential_4[1][0] \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"Total params: 321,703,403\n",
|
||||||
|
"Trainable params: 381,803\n",
|
||||||
|
"Non-trainable params: 321,321,600\n",
|
||||||
|
"__________________________________________________________________________________________________\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"K.clear_session()\n",
|
||||||
|
"m = build_model(sem_vectors, 50, 200, 3, 200)\n",
|
||||||
|
"m.summary()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The number of trainable parameters, ~381k, is the number given by Parikh et al, so we're on the right track."
|
||||||
|
]
|
||||||
|
},
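The trainable count can be checked by hand from the summary above. The two `TimeDistributed` entries wrap the same `G` block, so its weights are only counted once in the trainable total (our arithmetic):

$$
\underbrace{300 \cdot 200}_{\text{projection}}
+ \underbrace{2\,(200 \cdot 200 + 200)}_{F}
+ \underbrace{(400 \cdot 200 + 200) + (200 \cdot 200 + 200)}_{G\ (\text{shared})}
+ \underbrace{(400 \cdot 200 + 200) + (200 \cdot 200 + 200)}_{H}
+ \underbrace{200 \cdot 3 + 3}_{\text{classifier}}
$$

$$
= 60{,}000 + 80{,}400 + 120{,}400 + 120{,}400 + 603 = 381{,}803.
$$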
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Training the model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Parikh et al use tiny batches of 4, training for 50MM batches, which amounts to around 500 epochs. Here we'll use large batches to better use the GPU, and train for fewer epochs -- for purposes of this experiment."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Train on 549367 samples, validate on 9824 samples\n",
|
||||||
|
"Epoch 1/50\n",
|
||||||
|
"549367/549367 [==============================] - 34s 62us/step - loss: 0.7599 - acc: 0.6617 - val_loss: 0.5396 - val_acc: 0.7861\n",
|
||||||
|
"Epoch 2/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.5611 - acc: 0.7763 - val_loss: 0.4892 - val_acc: 0.8085\n",
|
||||||
|
"Epoch 3/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.5212 - acc: 0.7948 - val_loss: 0.4574 - val_acc: 0.8261\n",
|
||||||
|
"Epoch 4/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4986 - acc: 0.8045 - val_loss: 0.4410 - val_acc: 0.8274\n",
|
||||||
|
"Epoch 5/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4819 - acc: 0.8114 - val_loss: 0.4224 - val_acc: 0.8383\n",
|
||||||
|
"Epoch 6/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4714 - acc: 0.8166 - val_loss: 0.4200 - val_acc: 0.8379\n",
|
||||||
|
"Epoch 7/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4633 - acc: 0.8203 - val_loss: 0.4098 - val_acc: 0.8457\n",
|
||||||
|
"Epoch 8/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4558 - acc: 0.8232 - val_loss: 0.4114 - val_acc: 0.8415\n",
|
||||||
|
"Epoch 9/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4508 - acc: 0.8250 - val_loss: 0.4062 - val_acc: 0.8477\n",
|
||||||
|
"Epoch 10/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4433 - acc: 0.8286 - val_loss: 0.3982 - val_acc: 0.8486\n",
|
||||||
|
"Epoch 11/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4388 - acc: 0.8307 - val_loss: 0.3953 - val_acc: 0.8497\n",
|
||||||
|
"Epoch 12/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4351 - acc: 0.8321 - val_loss: 0.3973 - val_acc: 0.8522\n",
|
||||||
|
"Epoch 13/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4309 - acc: 0.8342 - val_loss: 0.3939 - val_acc: 0.8539\n",
|
||||||
|
"Epoch 14/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4269 - acc: 0.8355 - val_loss: 0.3932 - val_acc: 0.8517\n",
|
||||||
|
"Epoch 15/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4247 - acc: 0.8369 - val_loss: 0.3938 - val_acc: 0.8515\n",
|
||||||
|
"Epoch 16/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4208 - acc: 0.8379 - val_loss: 0.3936 - val_acc: 0.8504\n",
|
||||||
|
"Epoch 17/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4194 - acc: 0.8390 - val_loss: 0.3885 - val_acc: 0.8560\n",
|
||||||
|
"Epoch 18/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4162 - acc: 0.8402 - val_loss: 0.3874 - val_acc: 0.8561\n",
|
||||||
|
"Epoch 19/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4140 - acc: 0.8409 - val_loss: 0.3889 - val_acc: 0.8545\n",
|
||||||
|
"Epoch 20/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4114 - acc: 0.8426 - val_loss: 0.3864 - val_acc: 0.8583\n",
|
||||||
|
"Epoch 21/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4092 - acc: 0.8430 - val_loss: 0.3870 - val_acc: 0.8561\n",
|
||||||
|
"Epoch 22/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4062 - acc: 0.8442 - val_loss: 0.3852 - val_acc: 0.8577\n",
|
||||||
|
"Epoch 23/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4050 - acc: 0.8450 - val_loss: 0.3850 - val_acc: 0.8578\n",
|
||||||
|
"Epoch 24/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4035 - acc: 0.8455 - val_loss: 0.3825 - val_acc: 0.8555\n",
|
||||||
|
"Epoch 25/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.4018 - acc: 0.8460 - val_loss: 0.3837 - val_acc: 0.8573\n",
|
||||||
|
"Epoch 26/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3989 - acc: 0.8476 - val_loss: 0.3843 - val_acc: 0.8599\n",
|
||||||
|
"Epoch 27/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3979 - acc: 0.8481 - val_loss: 0.3841 - val_acc: 0.8589\n",
|
||||||
|
"Epoch 28/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3967 - acc: 0.8484 - val_loss: 0.3811 - val_acc: 0.8575\n",
|
||||||
|
"Epoch 29/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3956 - acc: 0.8492 - val_loss: 0.3829 - val_acc: 0.8589\n",
|
||||||
|
"Epoch 30/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3938 - acc: 0.8499 - val_loss: 0.3859 - val_acc: 0.8562\n",
|
||||||
|
"Epoch 31/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3925 - acc: 0.8500 - val_loss: 0.3798 - val_acc: 0.8587\n",
|
||||||
|
"Epoch 32/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3906 - acc: 0.8509 - val_loss: 0.3834 - val_acc: 0.8569\n",
|
||||||
|
"Epoch 33/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3893 - acc: 0.8511 - val_loss: 0.3806 - val_acc: 0.8588\n",
|
||||||
|
"Epoch 34/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3885 - acc: 0.8515 - val_loss: 0.3828 - val_acc: 0.8603\n",
|
||||||
|
"Epoch 35/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3879 - acc: 0.8520 - val_loss: 0.3800 - val_acc: 0.8594\n",
|
||||||
|
"Epoch 36/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3860 - acc: 0.8530 - val_loss: 0.3796 - val_acc: 0.8577\n",
|
||||||
|
"Epoch 37/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3856 - acc: 0.8532 - val_loss: 0.3857 - val_acc: 0.8591\n",
|
||||||
|
"Epoch 38/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3838 - acc: 0.8535 - val_loss: 0.3835 - val_acc: 0.8603\n",
|
||||||
|
"Epoch 39/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3830 - acc: 0.8543 - val_loss: 0.3830 - val_acc: 0.8599\n",
|
||||||
|
"Epoch 40/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3818 - acc: 0.8548 - val_loss: 0.3832 - val_acc: 0.8559\n",
|
||||||
|
"Epoch 41/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3806 - acc: 0.8551 - val_loss: 0.3845 - val_acc: 0.8553\n",
|
||||||
|
"Epoch 42/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3803 - acc: 0.8550 - val_loss: 0.3789 - val_acc: 0.8617\n",
|
||||||
|
"Epoch 43/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3791 - acc: 0.8556 - val_loss: 0.3835 - val_acc: 0.8580\n",
|
||||||
|
"Epoch 44/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3778 - acc: 0.8565 - val_loss: 0.3799 - val_acc: 0.8580\n",
|
||||||
|
"Epoch 45/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3766 - acc: 0.8571 - val_loss: 0.3790 - val_acc: 0.8625\n",
|
||||||
|
"Epoch 46/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3770 - acc: 0.8569 - val_loss: 0.3820 - val_acc: 0.8590\n",
|
||||||
|
"Epoch 47/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3761 - acc: 0.8573 - val_loss: 0.3831 - val_acc: 0.8581\n",
|
||||||
|
"Epoch 48/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3739 - acc: 0.8579 - val_loss: 0.3828 - val_acc: 0.8599\n",
|
||||||
|
"Epoch 49/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3738 - acc: 0.8577 - val_loss: 0.3785 - val_acc: 0.8590\n",
|
||||||
|
"Epoch 50/50\n",
|
||||||
|
"549367/549367 [==============================] - 33s 60us/step - loss: 0.3726 - acc: 0.8580 - val_loss: 0.3820 - val_acc: 0.8585\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<keras.callbacks.History at 0x7f5c9f49c438>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The result is broadly in the region reported by Parikh et al: ~86 vs 86.3%. The small difference might be accounted by differences in `max_length` (here set at 50), in the training regime, and that here we use Keras' built-in validation splitting rather than the SNLI test set."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Experiment: the asymmetric model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"It was suggested earlier that, based on the semantics of entailment, the vector representing the strength of association between the hypothesis to the text is all that is needed for classifying the entailment.\n",
|
||||||
|
"\n",
|
||||||
|
"The following model removes consideration of the complementary vector (text to hypothesis) from the computation. This will decrease the paramater count slightly, because the final dense layers will be smaller, and speed up the forward pass when predicting, because fewer calculations will be needed."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"Layer (type) Output Shape Param # Connected to \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"words2 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"words1 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_5 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
||||||
|
" words2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_6 (Sequential) (None, 50, 200) 80400 sequential_5[1][0] \n",
|
||||||
|
" sequential_5[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_4 (Dot) (None, 50, 50) 0 sequential_6[1][0] \n",
|
||||||
|
" sequential_6[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_5 (Lambda) (None, 50, 50) 0 dot_4[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_5 (Dot) (None, 50, 200) 0 lambda_5[0][0] \n",
|
||||||
|
" sequential_5[1][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_4 (Concatenate) (None, 50, 400) 0 sequential_5[2][0] \n",
|
||||||
|
" dot_5[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"time_distributed_5 (TimeDistrib (None, 50, 200) 120400 concatenate_4[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_6 (Lambda) (None, 200) 0 time_distributed_5[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_8 (Sequential) (None, 200) 80400 lambda_6[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dense_16 (Dense) (None, 3) 603 sequential_8[1][0] \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"Total params: 321,663,403\n",
|
||||||
|
"Trainable params: 341,803\n",
|
||||||
|
"Non-trainable params: 321,321,600\n",
|
||||||
|
"__________________________________________________________________________________________________\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m1 = build_model(sem_vectors, 50, 200, 3, 200, 'left')\n",
|
||||||
|
"m1.summary()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The parameter count has indeed decreased by 40,000, corresponding to the 200x200 smaller H function."
|
||||||
|
]
|
||||||
|
},
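Concretely (our arithmetic, from the two summaries): with a single aggregate vector, the first dense layer of H sees a 200-dimensional input instead of the 400-dimensional concatenation, so it needs $200 \cdot 200$ fewer weights:

$$
(400 - 200) \cdot 200 = 40{,}000 = 381{,}803 - 341{,}803.
$$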
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Train on 549367 samples, validate on 9824 samples\n",
|
||||||
|
"Epoch 1/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 46us/step - loss: 0.7331 - acc: 0.6770 - val_loss: 0.5257 - val_acc: 0.7936\n",
|
||||||
|
"Epoch 2/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.5518 - acc: 0.7799 - val_loss: 0.4717 - val_acc: 0.8159\n",
|
||||||
|
"Epoch 3/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.5147 - acc: 0.7967 - val_loss: 0.4449 - val_acc: 0.8278\n",
|
||||||
|
"Epoch 4/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4948 - acc: 0.8060 - val_loss: 0.4326 - val_acc: 0.8344\n",
|
||||||
|
"Epoch 5/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4814 - acc: 0.8122 - val_loss: 0.4247 - val_acc: 0.8359\n",
|
||||||
|
"Epoch 6/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4712 - acc: 0.8162 - val_loss: 0.4143 - val_acc: 0.8430\n",
|
||||||
|
"Epoch 7/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4635 - acc: 0.8205 - val_loss: 0.4172 - val_acc: 0.8401\n",
|
||||||
|
"Epoch 8/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4570 - acc: 0.8223 - val_loss: 0.4106 - val_acc: 0.8422\n",
|
||||||
|
"Epoch 9/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4505 - acc: 0.8259 - val_loss: 0.4043 - val_acc: 0.8451\n",
|
||||||
|
"Epoch 10/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4459 - acc: 0.8280 - val_loss: 0.4050 - val_acc: 0.8467\n",
|
||||||
|
"Epoch 11/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4405 - acc: 0.8300 - val_loss: 0.3975 - val_acc: 0.8481\n",
|
||||||
|
"Epoch 12/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4360 - acc: 0.8324 - val_loss: 0.4026 - val_acc: 0.8496\n",
|
||||||
|
"Epoch 13/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4327 - acc: 0.8334 - val_loss: 0.4024 - val_acc: 0.8471\n",
|
||||||
|
"Epoch 14/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4293 - acc: 0.8350 - val_loss: 0.3955 - val_acc: 0.8496\n",
|
||||||
|
"Epoch 15/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4263 - acc: 0.8369 - val_loss: 0.3980 - val_acc: 0.8490\n",
|
||||||
|
"Epoch 16/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4236 - acc: 0.8377 - val_loss: 0.3958 - val_acc: 0.8496\n",
|
||||||
|
"Epoch 17/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4213 - acc: 0.8384 - val_loss: 0.3954 - val_acc: 0.8496\n",
|
||||||
|
"Epoch 18/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4187 - acc: 0.8394 - val_loss: 0.3929 - val_acc: 0.8514\n",
|
||||||
|
"Epoch 19/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4157 - acc: 0.8409 - val_loss: 0.3939 - val_acc: 0.8507\n",
|
||||||
|
"Epoch 20/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4135 - acc: 0.8417 - val_loss: 0.3953 - val_acc: 0.8522\n",
|
||||||
|
"Epoch 21/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4122 - acc: 0.8424 - val_loss: 0.3974 - val_acc: 0.8506\n",
|
||||||
|
"Epoch 22/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4099 - acc: 0.8435 - val_loss: 0.3918 - val_acc: 0.8522\n",
|
||||||
|
"Epoch 23/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4075 - acc: 0.8443 - val_loss: 0.3901 - val_acc: 0.8513\n",
|
||||||
|
"Epoch 24/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 44us/step - loss: 0.4067 - acc: 0.8447 - val_loss: 0.3885 - val_acc: 0.8543\n",
|
||||||
|
"Epoch 25/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4047 - acc: 0.8454 - val_loss: 0.3846 - val_acc: 0.8531\n",
|
||||||
|
"Epoch 26/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.4031 - acc: 0.8461 - val_loss: 0.3864 - val_acc: 0.8562\n",
|
||||||
|
"Epoch 27/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4020 - acc: 0.8467 - val_loss: 0.3874 - val_acc: 0.8546\n",
|
||||||
|
"Epoch 28/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.4001 - acc: 0.8473 - val_loss: 0.3848 - val_acc: 0.8534\n",
|
||||||
|
"Epoch 29/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3991 - acc: 0.8479 - val_loss: 0.3865 - val_acc: 0.8562\n",
|
||||||
|
"Epoch 30/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3976 - acc: 0.8484 - val_loss: 0.3833 - val_acc: 0.8574\n",
|
||||||
|
"Epoch 31/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3961 - acc: 0.8487 - val_loss: 0.3846 - val_acc: 0.8585\n",
|
||||||
|
"Epoch 32/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3942 - acc: 0.8498 - val_loss: 0.3805 - val_acc: 0.8573\n",
|
||||||
|
"Epoch 33/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 44us/step - loss: 0.3935 - acc: 0.8503 - val_loss: 0.3856 - val_acc: 0.8579\n",
|
||||||
|
"Epoch 34/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3923 - acc: 0.8507 - val_loss: 0.3829 - val_acc: 0.8560\n",
|
||||||
|
"Epoch 35/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3920 - acc: 0.8508 - val_loss: 0.3864 - val_acc: 0.8575\n",
|
||||||
|
"Epoch 36/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3907 - acc: 0.8516 - val_loss: 0.3873 - val_acc: 0.8563\n",
|
||||||
|
"Epoch 37/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3891 - acc: 0.8519 - val_loss: 0.3850 - val_acc: 0.8570\n",
|
||||||
|
"Epoch 38/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3872 - acc: 0.8522 - val_loss: 0.3815 - val_acc: 0.8591\n",
|
||||||
|
"Epoch 39/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3887 - acc: 0.8520 - val_loss: 0.3829 - val_acc: 0.8590\n",
|
||||||
|
"Epoch 40/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3868 - acc: 0.8531 - val_loss: 0.3807 - val_acc: 0.8600\n",
|
||||||
|
"Epoch 41/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3859 - acc: 0.8537 - val_loss: 0.3832 - val_acc: 0.8574\n",
|
||||||
|
"Epoch 42/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3849 - acc: 0.8537 - val_loss: 0.3850 - val_acc: 0.8576\n",
|
||||||
|
"Epoch 43/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3834 - acc: 0.8541 - val_loss: 0.3825 - val_acc: 0.8563\n",
|
||||||
|
"Epoch 44/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3829 - acc: 0.8548 - val_loss: 0.3844 - val_acc: 0.8540\n",
|
||||||
|
"Epoch 45/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8552 - val_loss: 0.3841 - val_acc: 0.8559\n",
|
||||||
|
"Epoch 46/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3816 - acc: 0.8549 - val_loss: 0.3880 - val_acc: 0.8567\n",
|
||||||
|
"Epoch 47/50\n",
|
||||||
|
"549367/549367 [==============================] - 24s 45us/step - loss: 0.3799 - acc: 0.8559 - val_loss: 0.3767 - val_acc: 0.8635\n",
|
||||||
|
"Epoch 48/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3800 - acc: 0.8560 - val_loss: 0.3786 - val_acc: 0.8563\n",
|
||||||
|
"Epoch 49/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3781 - acc: 0.8563 - val_loss: 0.3812 - val_acc: 0.8596\n",
|
||||||
|
"Epoch 50/50\n",
|
||||||
|
"549367/549367 [==============================] - 25s 45us/step - loss: 0.3788 - acc: 0.8560 - val_loss: 0.3782 - val_acc: 0.8601\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<keras.callbacks.History at 0x7f5ca1bf3e48>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m1.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=50,validation_data=([text_vectors_test, hypothesis_vectors_test], labels_test))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This model performs the same as the slightly more complex model that evaluates alignments in both directions. Note also that processing time is improved, from 64 down to 48 microseconds per step. \n",
|
||||||
|
"\n",
|
||||||
|
"Let's now look at an asymmetric model that evaluates text to hypothesis comparisons. The prediction is that such a model will correctly classify a decent proportion of the exemplars, but not as accurately as the previous two.\n",
|
||||||
|
"\n",
|
||||||
|
"We'll just use 10 epochs for expediency."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 96,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"Layer (type) Output Shape Param # Connected to \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"words1 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"words2 (InputLayer) (None, 50) 0 \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_13 (Sequential) (None, 50, 200) 321381600 words1[0][0] \n",
|
||||||
|
" words2[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_14 (Sequential) (None, 50, 200) 80400 sequential_13[1][0] \n",
|
||||||
|
" sequential_13[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_8 (Dot) (None, 50, 50) 0 sequential_14[1][0] \n",
|
||||||
|
" sequential_14[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_9 (Lambda) (None, 50, 50) 0 dot_8[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dot_9 (Dot) (None, 50, 200) 0 lambda_9[0][0] \n",
|
||||||
|
" sequential_13[2][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"concatenate_6 (Concatenate) (None, 50, 400) 0 sequential_13[1][0] \n",
|
||||||
|
" dot_9[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"time_distributed_9 (TimeDistrib (None, 50, 200) 120400 concatenate_6[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"lambda_10 (Lambda) (None, 200) 0 time_distributed_9[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"sequential_16 (Sequential) (None, 200) 80400 lambda_10[0][0] \n",
|
||||||
|
"__________________________________________________________________________________________________\n",
|
||||||
|
"dense_32 (Dense) (None, 3) 603 sequential_16[1][0] \n",
|
||||||
|
"==================================================================================================\n",
|
||||||
|
"Total params: 321,663,403\n",
|
||||||
|
"Trainable params: 341,803\n",
|
||||||
|
"Non-trainable params: 321,321,600\n",
|
||||||
|
"__________________________________________________________________________________________________\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m2 = build_model(sem_vectors, 50, 200, 3, 200, 'right')\n",
|
||||||
|
"m2.summary()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 97,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Train on 455226 samples, validate on 113807 samples\n",
|
||||||
|
"Epoch 1/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 49us/step - loss: 0.8920 - acc: 0.5771 - val_loss: 0.8001 - val_acc: 0.6435\n",
|
||||||
|
"Epoch 2/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7808 - acc: 0.6553 - val_loss: 0.7267 - val_acc: 0.6855\n",
|
||||||
|
"Epoch 3/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7329 - acc: 0.6825 - val_loss: 0.6966 - val_acc: 0.7006\n",
|
||||||
|
"Epoch 4/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.7055 - acc: 0.6978 - val_loss: 0.6713 - val_acc: 0.7150\n",
|
||||||
|
"Epoch 5/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6862 - acc: 0.7081 - val_loss: 0.6533 - val_acc: 0.7253\n",
|
||||||
|
"Epoch 6/10\n",
|
||||||
|
"455226/455226 [==============================] - 21s 47us/step - loss: 0.6694 - acc: 0.7179 - val_loss: 0.6472 - val_acc: 0.7277\n",
|
||||||
|
"Epoch 7/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6555 - acc: 0.7252 - val_loss: 0.6338 - val_acc: 0.7347\n",
|
||||||
|
"Epoch 8/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 48us/step - loss: 0.6434 - acc: 0.7310 - val_loss: 0.6246 - val_acc: 0.7385\n",
|
||||||
|
"Epoch 9/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6325 - acc: 0.7367 - val_loss: 0.6164 - val_acc: 0.7424\n",
|
||||||
|
"Epoch 10/10\n",
|
||||||
|
"455226/455226 [==============================] - 22s 47us/step - loss: 0.6216 - acc: 0.7426 - val_loss: 0.6082 - val_acc: 0.7478\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<keras.callbacks.History at 0x7fa6850cf080>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 97,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"m2.fit([text_vectors, hypothesis_vectors], labels, batch_size=1024, epochs=10,validation_split=.2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Comparing this fit to the validation accuracy of the previous two models after 10 epochs, we observe that its accuracy is roughly 10% lower.\n",
|
||||||
|
"\n",
|
||||||
|
"It is reassuring that the neural modeling here reproduces what we know from the semantics of natural language!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.5.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
27
examples/pipeline/fix_space_entities.py
Normal file
27
examples/pipeline/fix_space_entities.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
'''Demonstrate adding a rule-based component that forces some tokens to not
|
||||||
|
be entities, before the NER tagger is applied. This is used to hotfix the issue
|
||||||
|
in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16.
|
||||||
|
'''
|
||||||
|
import spacy
|
||||||
|
from spacy.attrs import ENT_IOB
|
||||||
|
|
||||||
|
def fix_space_tags(doc):
|
||||||
|
ent_iobs = doc.to_array([ENT_IOB])
|
||||||
|
for i, token in enumerate(doc):
|
||||||
|
if token.is_space:
|
||||||
|
# Set the 'O' tag on the ENT_IOB attribute (0 = unset, 1 = 'I', 2 = 'O', 3 = 'B')
|
||||||
|
ent_iobs[i] = 2
|
||||||
|
doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1)))
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def main():
|
||||||
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
text = u'''This is some crazy test where I dont need an Apple Watch to make things bug'''
|
||||||
|
doc = nlp(text)
|
||||||
|
print('Before', doc.ents)
|
||||||
|
nlp.add_pipe(fix_space_tags, name='fix-ner', before='ner')
|
||||||
|
doc = nlp(text)
|
||||||
|
print('After', doc.ents)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
|
@ -21,8 +21,9 @@ from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
import spacy
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# training data: texts, heads and dependency labels
|
# training data: texts, heads and dependency labels
|
||||||
|
@ -63,7 +64,7 @@ TRAIN_DATA = [
|
||||||
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int))
|
n_iter=("Number of training iterations", "option", "n", int))
|
||||||
def main(model=None, output_dir=None, n_iter=5):
|
def main(model=None, output_dir=None, n_iter=15):
|
||||||
"""Load the model, set up the pipeline and train the parser."""
|
"""Load the model, set up the pipeline and train the parser."""
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
|
@ -89,9 +90,12 @@ def main(model=None, output_dir=None, n_iter=5):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
print(losses)
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_model(nlp)
|
test_model(nlp)
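Since several example scripts now switch to this batching pattern, here is a self-contained sketch of how it behaves. This is illustrative only: the toy `TRAIN_DATA` below is made up, and it relies on `spacy.util.minibatch` accepting a generator of batch sizes, as in the diffs above.

```python
from spacy.util import minibatch, compounding

# compounding(4., 32., 1.001) yields an endless stream of batch sizes that
# starts at 4 and grows by a factor of 1.001 per draw, capped at 32 -- so
# early updates use small, noisy batches and later ones larger, more stable
# batches.
TRAIN_DATA = [("sentence %d" % i, {"entities": []}) for i in range(256)]

for batch in minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)):
    texts, annotations = zip(*batch)
    # In the real scripts this is where the update happens:
    # nlp.update(texts, annotations, sgd=optimizer, losses=losses)
    print(len(batch), texts[0])
```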
|
||||||
|
@ -135,7 +139,8 @@ if __name__ == '__main__':
|
||||||
# [
|
# [
|
||||||
# ('find', 'ROOT', 'find'),
|
# ('find', 'ROOT', 'find'),
|
||||||
# ('cheapest', 'QUALITY', 'gym'),
|
# ('cheapest', 'QUALITY', 'gym'),
|
||||||
# ('gym', 'PLACE', 'find')
|
# ('gym', 'PLACE', 'find'),
|
||||||
|
# ('near', 'ATTRIBUTE', 'gym'),
|
||||||
# ('work', 'LOCATION', 'near')
|
# ('work', 'LOCATION', 'near')
|
||||||
# ]
|
# ]
|
||||||
# show me the best hotel in berlin
|
# show me the best hotel in berlin
|
||||||
|
|
|
@ -15,6 +15,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# training data
|
# training data
|
||||||
|
@ -62,14 +63,17 @@ def main(model=None, output_dir=None, n_iter=100):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
nlp.update(
|
nlp.update(
|
||||||
[text], # batch of texts
|
texts, # batch of texts
|
||||||
[annotations], # batch of annotations
|
annotations, # batch of annotations
|
||||||
drop=0.5, # dropout - make it harder to memorise data
|
drop=0.5, # dropout - make it harder to memorise data
|
||||||
sgd=optimizer, # callable to update weights
|
sgd=optimizer, # callable to update weights
|
||||||
losses=losses)
|
losses=losses)
|
||||||
print(losses)
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
for text, _ in TRAIN_DATA:
|
for text, _ in TRAIN_DATA:
|
||||||
|
|
|
@ -31,6 +31,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# new entity label
|
# new entity label
|
||||||
|
@ -73,7 +74,7 @@ TRAIN_DATA = [
|
||||||
new_model_name=("New model name for model meta.", "option", "nm", str),
|
new_model_name=("New model name for model meta.", "option", "nm", str),
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int))
|
n_iter=("Number of training iterations", "option", "n", int))
|
||||||
def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
|
def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
|
||||||
"""Set up the pipeline and entity recognizer, and train the new entity."""
|
"""Set up the pipeline and entity recognizer, and train the new entity."""
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
|
@ -104,10 +105,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
|
||||||
losses=losses)
|
losses=losses)
|
||||||
print(losses)
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = 'Do you like horses?'
|
test_text = 'Do you like horses?'
|
||||||
|
|
|
@ -13,6 +13,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# training data
|
# training data
|
||||||
|
@ -62,9 +63,12 @@ def main(model=None, output_dir=None, n_iter=10):
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
print(losses)
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like securities."
|
test_text = "I like securities."
|
||||||
|
|
|
@ -16,6 +16,7 @@ import plac
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# You need to define a mapping from your data's part-of-speech tag names to the
|
# You need to define a mapping from your data's part-of-speech tag names to the
|
||||||
|
@ -63,9 +64,12 @@ def main(lang='en', output_dir=None, n_iter=25):
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
for text, annotations in TRAIN_DATA:
|
# batch up the examples using spaCy's minibatch
|
||||||
nlp.update([text], [annotations], sgd=optimizer, losses=losses)
|
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
||||||
print(losses)
|
for batch in batches:
|
||||||
|
texts, annotations = zip(*batch)
|
||||||
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
|
print('Losses', losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like blue eggs"
|
test_text = "I like blue eggs"
|
||||||
|
|
|
@@ -2,7 +2,7 @@ cython>=0.25
 numpy>=1.15.0
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc==7.0.0.dev1
+thinc==7.0.0.dev2
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 cytoolz>=0.9.0,<0.10.0

@@ -11,7 +11,11 @@ ujson>=1.35
 dill>=0.2,<0.3
 regex==2018.01.10
 requests>=2.13.0,<3.0.0
+jsonschema>=2.6.0,<3.0.0
+wasabi>=0.0.8,<1.1.0
+pathlib==1.0.1; python_version < "3.4"
+# Development dependencies
 pytest>=4.0.0,<5.0.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-pathlib==1.0.1; python_version < "3.4"
+flake8>=3.5.0,<3.6.0
setup.py | 4

@@ -200,13 +200,15 @@ def setup_package():
             "murmurhash>=0.28.0,<1.1.0",
             "cymem>=2.0.2,<2.1.0",
             "preshed>=2.0.1,<2.1.0",
-            "thinc==7.0.0.dev1",
+            "thinc==7.0.0.dev2",
             "blis>=0.2.2,<0.3.0",
             "plac<1.0.0,>=0.9.6",
             "ujson>=1.35",
             "regex==2018.01.10",
             "dill>=0.2,<0.3",
             "requests>=2.13.0,<3.0.0",
+            "jsonschema>=2.6.0,<3.0.0",
+            "wasabi>=0.0.8,<1.1.0",
            'pathlib==1.0.1; python_version < "3.4"',
         ],
         setup_requires=["wheel"],
@@ -1,9 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 import warnings
+
 warnings.filterwarnings("ignore", message="numpy.dtype size changed")
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

+# These are imported as part of the API
+from thinc.neural.util import prefer_gpu, require_gpu
+
 from .cli.info import info as cli_info
 from .glossary import explain
 from .about import __version__

@@ -12,7 +16,7 @@ from . import util


 def load(name, **overrides):
-    depr_path = overrides.get('path')
+    depr_path = overrides.get("path")
     if depr_path not in (True, False, None):
         deprecation_warning(Warnings.W001.format(path=depr_path))
     return util.load_model(name, **overrides)
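The new top-level imports above mean `prefer_gpu` and `require_gpu` become available directly on the `spacy` package. A minimal usage sketch (the model name is only an illustration):

    import spacy

    # prefer_gpu() returns True if a GPU was activated, False if spaCy falls
    # back to CPU; require_gpu() would raise an error instead of falling back.
    is_using_gpu = spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_sm")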
@@ -1,40 +1,41 @@
 # coding: utf8
 from __future__ import print_function

 # NB! This breaks in plac on Python 2!!
 # from __future__ import unicode_literals

-if __name__ == '__main__':
+if __name__ == "__main__":
     import plac
     import sys
+    from wasabi import Printer
     from spacy.cli import download, link, info, package, train, pretrain, convert
-    from spacy.cli import vocab, init_model, profile, evaluate, validate
-    from spacy.cli import ud_train, ud_evaluate
-    from spacy.util import prints
+    from spacy.cli import init_model, profile, evaluate, validate
+    from spacy.cli import ud_train, ud_evaluate, debug_data
+
+    msg = Printer()

     commands = {
-        'download': download,
-        'link': link,
-        'info': info,
-        'train': train,
-        'pretrain': pretrain,
-        'ud-train': ud_train,
-        'evaluate': evaluate,
-        'ud-evaluate': ud_evaluate,
-        'convert': convert,
-        'package': package,
-        'vocab': vocab,
-        'init-model': init_model,
-        'profile': profile,
-        'validate': validate
+        "download": download,
+        "link": link,
+        "info": info,
+        "train": train,
+        "pretrain": pretrain,
+        "debug-data": debug_data,
+        "ud-train": ud_train,
+        "evaluate": evaluate,
+        "ud-evaluate": ud_evaluate,
+        "convert": convert,
+        "package": package,
+        "init-model": init_model,
+        "profile": profile,
+        "validate": validate,
     }
     if len(sys.argv) == 1:
-        prints(', '.join(commands), title="Available commands", exits=1)
+        msg.info("Available commands", ", ".join(commands), exits=1)
     command = sys.argv.pop(1)
-    sys.argv[0] = 'spacy %s' % command
+    sys.argv[0] = "spacy %s" % command
     if command in commands:
         plac.call(commands[command], sys.argv[1:])
     else:
-        prints(
-            "Available: %s" % ', '.join(commands),
-            title="Unknown command: %s" % command,
-            exits=1)
+        available = "Available: {}".format(", ".join(commands))
+        msg.fail("Unknown command: {}".format(command), available, exits=1)
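The CLI output above now goes through wasabi's Printer instead of the old `util.prints` helper. A short sketch of the Printer calls used here, as suggested by the diff (`exits=1` makes the printer exit the process after printing):

    from wasabi import Printer

    msg = Printer()
    msg.info("Available commands", "download, link, info, ...")
    msg.good("Everything looks fine")
    # msg.fail("Unknown command: foo", "Available: download, link, ...", exits=1)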

spacy/_ml.py | 303
@ -14,8 +14,7 @@ from thinc.api import uniqued, wrap, noop
|
||||||
from thinc.api import with_square_sequences
|
from thinc.api import with_square_sequences
|
||||||
from thinc.linear.linear import LinearModel
|
from thinc.linear.linear import LinearModel
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module, copy_array
|
from thinc.neural.util import get_array_module
|
||||||
from thinc.neural._lsuv import svd_orthonormal
|
|
||||||
from thinc.neural.optimizers import Adam
|
from thinc.neural.optimizers import Adam
|
||||||
|
|
||||||
from thinc import describe
|
from thinc import describe
|
||||||
|
@ -30,39 +29,39 @@ from . import util
|
||||||
try:
|
try:
|
||||||
import torch.nn
|
import torch.nn
|
||||||
from thinc.extra.wrappers import PyTorchWrapperRNN
|
from thinc.extra.wrappers import PyTorchWrapperRNN
|
||||||
except:
|
except ImportError:
|
||||||
torch = None
|
torch = None
|
||||||
|
|
||||||
VECTORS_KEY = 'spacy_pretrained_vectors'
|
VECTORS_KEY = "spacy_pretrained_vectors"
|
||||||
|
|
||||||
|
|
||||||
def cosine(vec1, vec2):
|
def cosine(vec1, vec2):
|
||||||
xp = get_array_module(vec1)
|
xp = get_array_module(vec1)
|
||||||
norm1 = xp.linalg.norm(vec1)
|
norm1 = xp.linalg.norm(vec1)
|
||||||
norm2 = xp.linalg.norm(vec2)
|
norm2 = xp.linalg.norm(vec2)
|
||||||
if norm1 == 0. or norm2 == 0.:
|
if norm1 == 0.0 or norm2 == 0.0:
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return vec1.dot(vec2) / (norm1 * norm2)
|
return vec1.dot(vec2) / (norm1 * norm2)
|
||||||
|
|
||||||
|
|
||||||
def create_default_optimizer(ops, **cfg):
|
def create_default_optimizer(ops, **cfg):
|
||||||
learn_rate = util.env_opt('learn_rate', 0.001)
|
learn_rate = util.env_opt("learn_rate", 0.001)
|
||||||
beta1 = util.env_opt('optimizer_B1', 0.8)
|
beta1 = util.env_opt("optimizer_B1", 0.8)
|
||||||
beta2 = util.env_opt('optimizer_B2', 0.8)
|
beta2 = util.env_opt("optimizer_B2", 0.8)
|
||||||
eps = util.env_opt('optimizer_eps', 0.00001)
|
eps = util.env_opt("optimizer_eps", 0.00001)
|
||||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
L2 = util.env_opt("L2_penalty", 1e-6)
|
||||||
max_grad_norm = util.env_opt('grad_norm_clip', 5.)
|
max_grad_norm = util.env_opt("grad_norm_clip", 5.0)
|
||||||
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
|
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
|
||||||
beta2=beta2, eps=eps)
|
|
||||||
optimizer.max_grad_norm = max_grad_norm
|
optimizer.max_grad_norm = max_grad_norm
|
||||||
optimizer.device = ops.device
|
optimizer.device = ops.device
|
||||||
return optimizer
|
return optimizer
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
def _flatten_add_lengths(seqs, pad=0, drop=0.0):
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
|
||||||
|
|
||||||
def finish_update(d_X, sgd=None):
|
def finish_update(d_X, sgd=None):
|
||||||
return ops.unflatten(d_X, lengths, pad=pad)
|
return ops.unflatten(d_X, lengths, pad=pad)
|
||||||
|
@ -74,14 +73,15 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||||
def _zero_init(model):
|
def _zero_init(model):
|
||||||
def _zero_init_impl(self, X, y):
|
def _zero_init_impl(self, X, y):
|
||||||
self.W.fill(0)
|
self.W.fill(0)
|
||||||
|
|
||||||
model.on_data_hooks.append(_zero_init_impl)
|
model.on_data_hooks.append(_zero_init_impl)
|
||||||
if model.W is not None:
|
if model.W is not None:
|
||||||
model.W.fill(0.)
|
model.W.fill(0.0)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _preprocess_doc(docs, drop=0.):
|
def _preprocess_doc(docs, drop=0.0):
|
||||||
keys = [doc.to_array(LOWER) for doc in docs]
|
keys = [doc.to_array(LOWER) for doc in docs]
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
# The dtype here matches what thinc is expecting -- which differs per
|
# The dtype here matches what thinc is expecting -- which differs per
|
||||||
|
@ -89,11 +89,12 @@ def _preprocess_doc(docs, drop=0.):
|
||||||
# is fixed on Thinc's side.
|
# is fixed on Thinc's side.
|
||||||
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
||||||
keys = ops.xp.concatenate(keys)
|
keys = ops.xp.concatenate(keys)
|
||||||
vals = ops.allocate(keys.shape) + 1.
|
vals = ops.allocate(keys.shape) + 1.0
|
||||||
return (keys, vals, lengths), None
|
return (keys, vals, lengths), None
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _preprocess_doc_bigrams(docs, drop=0.):
|
def _preprocess_doc_bigrams(docs, drop=0.0):
|
||||||
unigrams = [doc.to_array(LOWER) for doc in docs]
|
unigrams = [doc.to_array(LOWER) for doc in docs]
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
|
bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
|
||||||
|
@ -104,27 +105,29 @@ def _preprocess_doc_bigrams(docs, drop=0.):
|
||||||
# is fixed on Thinc's side.
|
# is fixed on Thinc's side.
|
||||||
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
||||||
keys = ops.xp.concatenate(keys)
|
keys = ops.xp.concatenate(keys)
|
||||||
vals = ops.asarray(ops.xp.concatenate(vals), dtype='f')
|
vals = ops.asarray(ops.xp.concatenate(vals), dtype="f")
|
||||||
return (keys, vals, lengths), None
|
return (keys, vals, lengths), None
|
||||||
|
|
||||||
|
|
||||||
@describe.on_data(_set_dimensions_if_needed,
|
@describe.on_data(
|
||||||
lambda model, X, y: model.init_weights(model))
|
_set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
|
||||||
|
)
|
||||||
@describe.attributes(
|
@describe.attributes(
|
||||||
nI=Dimension("Input size"),
|
nI=Dimension("Input size"),
|
||||||
nF=Dimension("Number of features"),
|
nF=Dimension("Number of features"),
|
||||||
nO=Dimension("Output size"),
|
nO=Dimension("Output size"),
|
||||||
nP=Dimension("Maxout pieces"),
|
nP=Dimension("Maxout pieces"),
|
||||||
W=Synapses("Weights matrix",
|
W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
||||||
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
|
||||||
b=Biases("Bias vector",
|
pad=Synapses(
|
||||||
lambda obj: (obj.nO, obj.nP)),
|
"Pad",
|
||||||
pad=Synapses("Pad",
|
|
||||||
lambda obj: (1, obj.nF, obj.nO, obj.nP),
|
lambda obj: (1, obj.nF, obj.nO, obj.nP),
|
||||||
lambda M, ops: ops.normal_init(M, 1.)),
|
lambda M, ops: ops.normal_init(M, 1.0),
|
||||||
|
),
|
||||||
d_W=Gradient("W"),
|
d_W=Gradient("W"),
|
||||||
d_pad=Gradient("pad"),
|
d_pad=Gradient("pad"),
|
||||||
d_b=Gradient("b"))
|
d_b=Gradient("b"),
|
||||||
|
)
|
||||||
class PrecomputableAffine(Model):
|
class PrecomputableAffine(Model):
|
||||||
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
|
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
|
||||||
Model.__init__(self, **kwargs)
|
Model.__init__(self, **kwargs)
|
||||||
|
@ -133,9 +136,10 @@ class PrecomputableAffine(Model):
|
||||||
self.nI = nI
|
self.nI = nI
|
||||||
self.nF = nF
|
self.nF = nF
|
||||||
|
|
||||||
def begin_update(self, X, drop=0.):
|
def begin_update(self, X, drop=0.0):
|
||||||
Yf = self.ops.gemm(X,
|
Yf = self.ops.gemm(
|
||||||
self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
|
X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True
|
||||||
|
)
|
||||||
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
||||||
Yf = self._add_padding(Yf)
|
Yf = self._add_padding(Yf)
|
||||||
|
|
||||||
|
@ -146,15 +150,16 @@ class PrecomputableAffine(Model):
|
||||||
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
|
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
|
||||||
|
|
||||||
self.d_b += dY.sum(axis=0)
|
self.d_b += dY.sum(axis=0)
|
||||||
dY = dY.reshape((dY.shape[0], self.nO*self.nP))
|
dY = dY.reshape((dY.shape[0], self.nO * self.nP))
|
||||||
|
|
||||||
Wopfi = self.W.transpose((1, 2, 0, 3))
|
Wopfi = self.W.transpose((1, 2, 0, 3))
|
||||||
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
||||||
Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
|
Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI))
|
||||||
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
|
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi)
|
||||||
|
|
||||||
# Reuse the buffer
|
# Reuse the buffer
|
||||||
dWopfi = Wopfi; dWopfi.fill(0.)
|
dWopfi = Wopfi
|
||||||
|
dWopfi.fill(0.0)
|
||||||
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
||||||
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
||||||
# (o, p, f, i) --> (f, o, p, i)
|
# (o, p, f, i) --> (f, o, p, i)
|
||||||
|
@ -163,6 +168,7 @@ class PrecomputableAffine(Model):
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||||
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
|
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
|
||||||
|
|
||||||
return Yf, backward
|
return Yf, backward
|
||||||
|
|
||||||
def _add_padding(self, Yf):
|
def _add_padding(self, Yf):
|
||||||
|
@ -171,7 +177,7 @@ class PrecomputableAffine(Model):
|
||||||
|
|
||||||
def _backprop_padding(self, dY, ids):
|
def _backprop_padding(self, dY, ids):
|
||||||
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
|
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
|
||||||
mask = ids < 0.
|
mask = ids < 0.0
|
||||||
mask = mask.sum(axis=1)
|
mask = mask.sum(axis=1)
|
||||||
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
|
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
|
||||||
self.d_pad += d_pad.sum(axis=0)
|
self.d_pad += d_pad.sum(axis=0)
|
||||||
|
@ -179,33 +185,36 @@ class PrecomputableAffine(Model):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def init_weights(model):
|
def init_weights(model):
|
||||||
'''This is like the 'layer sequential unit variance', but instead
|
"""This is like the 'layer sequential unit variance', but instead
|
||||||
of taking the actual inputs, we randomly generate whitened data.
|
of taking the actual inputs, we randomly generate whitened data.
|
||||||
|
|
||||||
Why's this all so complicated? We have a huge number of inputs,
|
Why's this all so complicated? We have a huge number of inputs,
|
||||||
and the maxout unit makes guessing the dynamics tricky. Instead
|
and the maxout unit makes guessing the dynamics tricky. Instead
|
||||||
we set the maxout weights to values that empirically result in
|
we set the maxout weights to values that empirically result in
|
||||||
whitened outputs given whitened inputs.
|
whitened outputs given whitened inputs.
|
||||||
'''
|
"""
|
||||||
if (model.W**2).sum() != 0.:
|
if (model.W ** 2).sum() != 0.0:
|
||||||
return
|
return
|
||||||
ops = model.ops
|
ops = model.ops
|
||||||
xp = ops.xp
|
xp = ops.xp
|
||||||
ops.normal_init(model.W, model.nF * model.nI, inplace=True)
|
ops.normal_init(model.W, model.nF * model.nI, inplace=True)
|
||||||
|
|
||||||
ids = ops.allocate((5000, model.nF), dtype='f')
|
ids = ops.allocate((5000, model.nF), dtype="f")
|
||||||
ids += xp.random.uniform(0, 1000, ids.shape)
|
ids += xp.random.uniform(0, 1000, ids.shape)
|
||||||
ids = ops.asarray(ids, dtype='i')
|
ids = ops.asarray(ids, dtype="i")
|
||||||
tokvecs = ops.allocate((5000, model.nI), dtype='f')
|
tokvecs = ops.allocate((5000, model.nI), dtype="f")
|
||||||
tokvecs += xp.random.normal(loc=0., scale=1.,
|
tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
|
||||||
size=tokvecs.size).reshape(tokvecs.shape)
|
tokvecs.shape
|
||||||
|
)
|
||||||
|
|
||||||
def predict(ids, tokvecs):
|
def predict(ids, tokvecs):
|
||||||
# nS ids. nW tokvecs. Exclude the padding array.
|
# nS ids. nW tokvecs. Exclude the padding array.
|
||||||
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
|
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
|
||||||
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype='f')
|
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f")
|
||||||
# need nS vectors
|
# need nS vectors
|
||||||
hiddens = hiddens.reshape((hiddens.shape[0] * model.nF, model.nO * model.nP))
|
hiddens = hiddens.reshape(
|
||||||
|
(hiddens.shape[0] * model.nF, model.nO * model.nP)
|
||||||
|
)
|
||||||
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
||||||
vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
|
vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
|
||||||
vectors += model.b
|
vectors += model.b
|
||||||
|
@ -238,7 +247,8 @@ def link_vectors_to_models(vocab):
|
||||||
if vectors.data.size != 0:
|
if vectors.data.size != 0:
|
||||||
print(
|
print(
|
||||||
"Warning: Unnamed vectors -- this won't allow multiple vectors "
|
"Warning: Unnamed vectors -- this won't allow multiple vectors "
|
||||||
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape)
|
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape
|
||||||
|
)
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
for word in vocab:
|
for word in vocab:
|
||||||
if word.orth in vectors.key2row:
|
if word.orth in vectors.key2row:
|
||||||
|
@ -254,28 +264,31 @@ def link_vectors_to_models(vocab):
|
||||||
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
|
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
|
||||||
if depth == 0:
|
if depth == 0:
|
||||||
return layerize(noop())
|
return layerize(noop())
|
||||||
model = torch.nn.LSTM(nI, nO//2, depth, bidirectional=True, dropout=dropout)
|
model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
|
||||||
return with_square_sequences(PyTorchWrapperRNN(model))
|
return with_square_sequences(PyTorchWrapperRNN(model))
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, **kwargs):
|
def Tok2Vec(width, embed_size, **kwargs):
|
||||||
pretrained_vectors = kwargs.get('pretrained_vectors', None)
|
pretrained_vectors = kwargs.get("pretrained_vectors", None)
|
||||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
|
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 2)
|
||||||
subword_features = kwargs.get('subword_features', True)
|
subword_features = kwargs.get("subword_features", True)
|
||||||
conv_depth = kwargs.get('conv_depth', 4)
|
conv_depth = kwargs.get("conv_depth", 4)
|
||||||
bilstm_depth = kwargs.get('bilstm_depth', 0)
|
bilstm_depth = kwargs.get("bilstm_depth", 0)
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
|
with Model.define_operators(
|
||||||
'+': add, '*': reapply}):
|
{">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
|
||||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM),
|
):
|
||||||
name='embed_norm')
|
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
|
||||||
if subword_features:
|
if subword_features:
|
||||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
|
prefix = HashEmbed(
|
||||||
name='embed_prefix')
|
width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
|
||||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
|
)
|
||||||
name='embed_suffix')
|
suffix = HashEmbed(
|
||||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
|
width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
|
||||||
name='embed_shape')
|
)
|
||||||
|
shape = HashEmbed(
|
||||||
|
width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
prefix, suffix, shape = (None, None, None)
|
prefix, suffix, shape = (None, None, None)
|
||||||
if pretrained_vectors is not None:
|
if pretrained_vectors is not None:
|
||||||
|
@ -284,28 +297,29 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
if subword_features:
|
if subword_features:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(glove | norm | prefix | suffix | shape)
|
(glove | norm | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
|
>> LN(Maxout(width, width * 5, pieces=3)),
|
||||||
|
column=cols.index(ORTH),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(glove | norm)
|
(glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
|
||||||
>> LN(Maxout(width, width*2, pieces=3)), column=cols.index(ORTH))
|
column=cols.index(ORTH),
|
||||||
|
)
|
||||||
elif subword_features:
|
elif subword_features:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(norm | prefix | suffix | shape)
|
(norm | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))
|
>> LN(Maxout(width, width * 4, pieces=3)),
|
||||||
|
column=cols.index(ORTH),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
embed = norm
|
embed = norm
|
||||||
|
|
||||||
convolution = Residual(
|
convolution = Residual(
|
||||||
ExtractWindow(nW=1)
|
ExtractWindow(nW=1)
|
||||||
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
|
||||||
)
|
)
|
||||||
tok2vec = (
|
tok2vec = FeatureExtracter(cols) >> with_flatten(
|
||||||
FeatureExtracter(cols)
|
embed >> convolution ** conv_depth, pad=conv_depth
|
||||||
>> with_flatten(
|
|
||||||
embed
|
|
||||||
>> convolution ** conv_depth, pad=conv_depth
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
if bilstm_depth >= 1:
|
if bilstm_depth >= 1:
|
||||||
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
|
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
|
||||||
|
@ -316,7 +330,7 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
|
|
||||||
|
|
||||||
def reapply(layer, n_times):
|
def reapply(layer, n_times):
|
||||||
def reapply_fwd(X, drop=0.):
|
def reapply_fwd(X, drop=0.0):
|
||||||
backprops = []
|
backprops = []
|
||||||
for i in range(n_times):
|
for i in range(n_times):
|
||||||
Y, backprop = layer.begin_update(X, drop=drop)
|
Y, backprop = layer.begin_update(X, drop=drop)
|
||||||
|
@ -334,12 +348,14 @@ def reapply(layer, n_times):
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return Y, reapply_bwd
|
return Y, reapply_bwd
|
||||||
|
|
||||||
return wrap(reapply_fwd, layer)
|
return wrap(reapply_fwd, layer)
|
||||||
|
|
||||||
|
|
||||||
def asarray(ops, dtype):
|
def asarray(ops, dtype):
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.0):
|
||||||
return ops.asarray(X, dtype=dtype), None
|
return ops.asarray(X, dtype=dtype), None
|
||||||
|
|
||||||
return layerize(forward)
|
return layerize(forward)
|
||||||
|
|
||||||
|
|
||||||
|
@ -347,7 +363,7 @@ def _divide_array(X, size):
|
||||||
parts = []
|
parts = []
|
||||||
index = 0
|
index = 0
|
||||||
while index < len(X):
|
while index < len(X):
|
||||||
parts.append(X[index:index + size])
|
parts.append(X[index : index + size])
|
||||||
index += size
|
index += size
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
|
@ -356,7 +372,7 @@ def get_col(idx):
|
||||||
if idx < 0:
|
if idx < 0:
|
||||||
raise IndexError(Errors.E066.format(value=idx))
|
raise IndexError(Errors.E066.format(value=idx))
|
||||||
|
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.0):
|
||||||
if isinstance(X, numpy.ndarray):
|
if isinstance(X, numpy.ndarray):
|
||||||
ops = NumpyOps()
|
ops = NumpyOps()
|
||||||
else:
|
else:
|
||||||
|
@ -377,7 +393,7 @@ def doc2feats(cols=None):
|
||||||
if cols is None:
|
if cols is None:
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
|
|
||||||
def forward(docs, drop=0.):
|
def forward(docs, drop=0.0):
|
||||||
feats = []
|
feats = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
feats.append(doc.to_array(cols))
|
feats.append(doc.to_array(cols))
|
||||||
|
@ -389,13 +405,14 @@ def doc2feats(cols=None):
|
||||||
|
|
||||||
|
|
||||||
def print_shape(prefix):
|
def print_shape(prefix):
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.0):
|
||||||
return X, lambda dX, **kwargs: dX
|
return X, lambda dX, **kwargs: dX
|
||||||
|
|
||||||
return layerize(forward)
|
return layerize(forward)
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
def get_token_vectors(tokens_attrs_vectors, drop=0.0):
|
||||||
tokens, attrs, vectors = tokens_attrs_vectors
|
tokens, attrs, vectors = tokens_attrs_vectors
|
||||||
|
|
||||||
def backward(d_output, sgd=None):
|
def backward(d_output, sgd=None):
|
||||||
|
@ -405,17 +422,17 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def logistic(X, drop=0.):
|
def logistic(X, drop=0.0):
|
||||||
xp = get_array_module(X)
|
xp = get_array_module(X)
|
||||||
if not isinstance(X, xp.ndarray):
|
if not isinstance(X, xp.ndarray):
|
||||||
X = xp.asarray(X)
|
X = xp.asarray(X)
|
||||||
# Clip to range (-10, 10)
|
# Clip to range (-10, 10)
|
||||||
X = xp.minimum(X, 10., X)
|
X = xp.minimum(X, 10.0, X)
|
||||||
X = xp.maximum(X, -10., X)
|
X = xp.maximum(X, -10.0, X)
|
||||||
Y = 1. / (1. + xp.exp(-X))
|
Y = 1.0 / (1.0 + xp.exp(-X))
|
||||||
|
|
||||||
def logistic_bwd(dY, sgd=None):
|
def logistic_bwd(dY, sgd=None):
|
||||||
dX = dY * (Y * (1-Y))
|
dX = dY * (Y * (1 - Y))
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return Y, logistic_bwd
|
return Y, logistic_bwd
|
||||||
|
@ -424,12 +441,13 @@ def logistic(X, drop=0.):
|
||||||
def zero_init(model):
|
def zero_init(model):
|
||||||
def _zero_init_impl(self, X, y):
|
def _zero_init_impl(self, X, y):
|
||||||
self.W.fill(0)
|
self.W.fill(0)
|
||||||
|
|
||||||
model.on_data_hooks.append(_zero_init_impl)
|
model.on_data_hooks.append(_zero_init_impl)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def preprocess_doc(docs, drop=0.):
|
def preprocess_doc(docs, drop=0.0):
|
||||||
keys = [doc.to_array([LOWER]) for doc in docs]
|
keys = [doc.to_array([LOWER]) for doc in docs]
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([arr.shape[0] for arr in keys])
|
lengths = ops.asarray([arr.shape[0] for arr in keys])
|
||||||
|
@ -439,31 +457,32 @@ def preprocess_doc(docs, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
def getitem(i):
|
def getitem(i):
|
||||||
def getitem_fwd(X, drop=0.):
|
def getitem_fwd(X, drop=0.0):
|
||||||
return X[i], None
|
return X[i], None
|
||||||
|
|
||||||
return layerize(getitem_fwd)
|
return layerize(getitem_fwd)
|
||||||
|
|
||||||
|
|
||||||
def build_tagger_model(nr_class, **cfg):
|
def build_tagger_model(nr_class, **cfg):
|
||||||
embed_size = util.env_opt('embed_size', 2000)
|
embed_size = util.env_opt("embed_size", 2000)
|
||||||
if 'token_vector_width' in cfg:
|
if "token_vector_width" in cfg:
|
||||||
token_vector_width = cfg['token_vector_width']
|
token_vector_width = cfg["token_vector_width"]
|
||||||
else:
|
else:
|
||||||
token_vector_width = util.env_opt('token_vector_width', 96)
|
token_vector_width = util.env_opt("token_vector_width", 96)
|
||||||
pretrained_vectors = cfg.get('pretrained_vectors')
|
pretrained_vectors = cfg.get("pretrained_vectors")
|
||||||
subword_features = cfg.get('subword_features', True)
|
subword_features = cfg.get("subword_features", True)
|
||||||
with Model.define_operators({'>>': chain, '+': add}):
|
with Model.define_operators({">>": chain, "+": add}):
|
||||||
if 'tok2vec' in cfg:
|
if "tok2vec" in cfg:
|
||||||
tok2vec = cfg['tok2vec']
|
tok2vec = cfg["tok2vec"]
|
||||||
else:
|
else:
|
||||||
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
tok2vec = Tok2Vec(
|
||||||
subword_features=subword_features,
|
token_vector_width,
|
||||||
pretrained_vectors=pretrained_vectors)
|
embed_size,
|
||||||
|
subword_features=subword_features,
|
||||||
|
pretrained_vectors=pretrained_vectors,
|
||||||
|
)
|
||||||
softmax = with_flatten(Softmax(nr_class, token_vector_width))
|
softmax = with_flatten(Softmax(nr_class, token_vector_width))
|
||||||
model = (
|
model = tok2vec >> softmax
|
||||||
tok2vec
|
|
||||||
>> softmax
|
|
||||||
)
|
|
||||||
model.nI = None
|
model.nI = None
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
model.softmax = softmax
|
model.softmax = softmax
|
||||||
|
@ -471,10 +490,10 @@ def build_tagger_model(nr_class, **cfg):
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def SpacyVectors(docs, drop=0.):
|
def SpacyVectors(docs, drop=0.0):
|
||||||
batch = []
|
batch = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
indices = numpy.zeros((len(doc),), dtype='i')
|
indices = numpy.zeros((len(doc),), dtype="i")
|
||||||
for i, word in enumerate(doc):
|
for i, word in enumerate(doc):
|
||||||
if word.orth in doc.vocab.vectors.key2row:
|
if word.orth in doc.vocab.vectors.key2row:
|
||||||
indices[i] = doc.vocab.vectors.key2row[word.orth]
|
indices[i] = doc.vocab.vectors.key2row[word.orth]
|
||||||
|
@ -486,12 +505,11 @@ def SpacyVectors(docs, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
def build_text_classifier(nr_class, width=64, **cfg):
|
def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
depth = cfg.get('depth', 2)
|
depth = cfg.get("depth", 2)
|
||||||
nr_vector = cfg.get('nr_vector', 5000)
|
nr_vector = cfg.get("nr_vector", 5000)
|
||||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
pretrained_dims = cfg.get("pretrained_dims", 0)
|
||||||
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
|
with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
|
||||||
'**': clone}):
|
if cfg.get("low_data") and pretrained_dims:
|
||||||
if cfg.get('low_data') and pretrained_dims:
|
|
||||||
model = (
|
model = (
|
||||||
SpacyVectors
|
SpacyVectors
|
||||||
>> flatten_add_lengths
|
>> flatten_add_lengths
|
||||||
|
@ -505,41 +523,35 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
return model
|
return model
|
||||||
|
|
||||||
lower = HashEmbed(width, nr_vector, column=1)
|
lower = HashEmbed(width, nr_vector, column=1)
|
||||||
prefix = HashEmbed(width//2, nr_vector, column=2)
|
prefix = HashEmbed(width // 2, nr_vector, column=2)
|
||||||
suffix = HashEmbed(width//2, nr_vector, column=3)
|
suffix = HashEmbed(width // 2, nr_vector, column=3)
|
||||||
shape = HashEmbed(width//2, nr_vector, column=4)
|
shape = HashEmbed(width // 2, nr_vector, column=4)
|
||||||
|
|
||||||
trained_vectors = (
|
trained_vectors = FeatureExtracter(
|
||||||
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
|
[ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
||||||
>> with_flatten(
|
) >> with_flatten(
|
||||||
uniqued(
|
uniqued(
|
||||||
(lower | prefix | suffix | shape)
|
(lower | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width+(width//2)*3)),
|
>> LN(Maxout(width, width + (width // 2) * 3)),
|
||||||
column=0
|
column=0,
|
||||||
)
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if pretrained_dims:
|
if pretrained_dims:
|
||||||
static_vectors = (
|
static_vectors = SpacyVectors >> with_flatten(
|
||||||
SpacyVectors
|
Affine(width, pretrained_dims)
|
||||||
>> with_flatten(Affine(width, pretrained_dims))
|
|
||||||
)
|
)
|
||||||
# TODO Make concatenate support lists
|
# TODO Make concatenate support lists
|
||||||
vectors = concatenate_lists(trained_vectors, static_vectors)
|
vectors = concatenate_lists(trained_vectors, static_vectors)
|
||||||
vectors_width = width*2
|
vectors_width = width * 2
|
||||||
else:
|
else:
|
||||||
vectors = trained_vectors
|
vectors = trained_vectors
|
||||||
vectors_width = width
|
vectors_width = width
|
||||||
static_vectors = None
|
static_vectors = None
|
||||||
tok2vec = (
|
tok2vec = vectors >> with_flatten(
|
||||||
vectors
|
LN(Maxout(width, vectors_width))
|
||||||
>> with_flatten(
|
>> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
|
||||||
LN(Maxout(width, vectors_width))
|
pad=depth,
|
||||||
>> Residual(
|
|
||||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
|
||||||
) ** depth, pad=depth
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
cnn_model = (
|
cnn_model = (
|
||||||
tok2vec
|
tok2vec
|
||||||
|
@ -550,13 +562,10 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
|
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
|
||||||
)
|
)
|
||||||
|
|
||||||
linear_model = (
|
linear_model = _preprocess_doc >> LinearModel(nr_class)
|
||||||
_preprocess_doc
|
|
||||||
>> LinearModel(nr_class)
|
|
||||||
)
|
|
||||||
model = (
|
model = (
|
||||||
(linear_model | cnn_model)
|
(linear_model | cnn_model)
|
||||||
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
>> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
|
||||||
>> logistic
|
>> logistic
|
||||||
)
|
)
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
|
@ -566,9 +575,9 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def flatten(seqs, drop=0.):
|
def flatten(seqs, drop=0.0):
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
|
||||||
|
|
||||||
def finish_update(d_X, sgd=None):
|
def finish_update(d_X, sgd=None):
|
||||||
return ops.unflatten(d_X, lengths, pad=0)
|
return ops.unflatten(d_X, lengths, pad=0)
|
||||||
|
@ -583,14 +592,14 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
|
||||||
"""
|
"""
|
||||||
if not layers:
|
if not layers:
|
||||||
return noop()
|
return noop()
|
||||||
drop_factor = kwargs.get('drop_factor', 1.0)
|
drop_factor = kwargs.get("drop_factor", 1.0)
|
||||||
ops = layers[0].ops
|
ops = layers[0].ops
|
||||||
layers = [chain(layer, flatten) for layer in layers]
|
layers = [chain(layer, flatten) for layer in layers]
|
||||||
concat = concatenate(*layers)
|
concat = concatenate(*layers)
|
||||||
|
|
||||||
def concatenate_lists_fwd(Xs, drop=0.):
|
def concatenate_lists_fwd(Xs, drop=0.0):
|
||||||
drop *= drop_factor
|
drop *= drop_factor
|
||||||
lengths = ops.asarray([len(X) for X in Xs], dtype='i')
|
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
|
||||||
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
||||||
ys = ops.unflatten(flat_y, lengths)
|
ys = ops.unflatten(flat_y, lengths)
|
||||||
|
|
||||||
|
|
|
@@ -1,16 +1,17 @@
 # inspired from:
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
+# fmt: off

-__title__ = 'spacy-nightly'
-__version__ = '2.1.0a3'
-__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
-__uri__ = 'https://spacy.io'
-__author__ = 'Explosion AI'
-__email__ = 'contact@explosion.ai'
-__license__ = 'MIT'
+__title__ = "spacy-nightly"
+__version__ = "2.1.0a3"
+__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
+__uri__ = "https://spacy.io"
+__author__ = "Explosion AI"
+__email__ = "contact@explosion.ai"
+__license__ = "MIT"
 __release__ = False

-__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
-__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
-__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json'
+__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
+__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
+__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
@@ -1,14 +1,13 @@
-from .download import download
-from .info import info
-from .link import link
-from .package import package
-from .profile import profile
-from .train import train
-from .pretrain import pretrain
-from .evaluate import evaluate
-from .convert import convert
-from .vocab import make_vocab as vocab
-from .init_model import init_model
-from .validate import validate
-from .ud_train import main as ud_train
-from .conll17_ud_eval import main as ud_evaluate
+from .download import download  # noqa: F401
+from .info import info  # noqa: F401
+from .link import link  # noqa: F401
+from .package import package  # noqa: F401
+from .profile import profile  # noqa: F401
+from .train import train  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
+from .convert import convert  # noqa: F401
+from .init_model import init_model  # noqa: F401
+from .validate import validate  # noqa: F401
+from .ud import ud_train, ud_evaluate  # noqa: F401
@@ -2,6 +2,8 @@
 from __future__ import unicode_literals


+# fmt: off
+
 class Messages(object):
     M001 = ("Download successful but linking failed")
     M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "

@@ -73,3 +75,31 @@ class Messages(object):
     M052 = ("Not a valid meta.json format")
     M053 = ("Expected dict but got: {meta_type}")
     M054 = ("No --lang specified, but tokenization required.")
+    M055 = ("Training pipeline: {pipeline}")
+    M056 = ("Starting with base model '{model}'")
+    M057 = ("Starting with blank model '{model}'")
+    M058 = ("Loading vector from model '{model}'")
+    M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
+    M060 = ("Counting training words (limit={limit})")
+    M061 = ("\nSaving model...")
+    M062 = ("Output directory is not empty.")
+    M063 = ("Incompatible arguments")
+    M064 = ("The -f and -c arguments are deprecated, and not compatible with "
+            "the -j argument, which should specify the same information. "
+            "Either merge the frequencies and clusters data into the "
+            "JSONL-formatted file (recommended), or use only the -f and -c "
+            "files, without the other lexical attributes.")
+    M065 = ("This can lead to unintended side effects when saving the model. "
+            "Please use an empty directory or a different path instead. If "
+            "the specified output path doesn't exist, the directory will be "
+            "created for you.")
+    M066 = ("Saved model to output directory")
+    M067 = ("Can't find lexical data")
+    M068 = ("Sucessfully compiled vocab and vectors, and saved model")
+    M069 = ("Unknown file type: '{name}'")
+    M070 = ("Supported file types: '{options}'")
+    M071 = ("Loaded pretrained tok2vec for: {components}")
+    M072 = ("Model language ('{model_lang}') doesn't match language specified "
+            "as `lang` argument ('{lang}') ")
+
+# fmt: on
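The new M055-M072 entries are plain `str.format` templates that the CLI fills in at runtime, for example (values here are made up):

    from spacy.cli._messages import Messages

    print(Messages.M069.format(name="txt"))
    print(Messages.M070.format(options="json, jsonl"))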
@@ -3,49 +3,91 @@ from __future__ import unicode_literals

 import plac
 from pathlib import Path
+from wasabi import Printer
+
+from ..util import write_jsonl, write_json
+from ..compat import json_dumps, path2str
 from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
 from .converters import ner_jsonl2json
 from ._messages import Messages
-from ..util import prints

 # Converters are matched by file extension. To add a converter, add a new
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.
 CONVERTERS = {
-    'conllubio': conllubio2json,
-    'conllu': conllu2json,
-    'conll': conllu2json,
-    'ner': conll_ner2json,
-    'iob': iob2json,
-    'jsonl': ner_jsonl2json
+    "conllubio": conllubio2json,
+    "conllu": conllu2json,
+    "conll": conllu2json,
+    "ner": conll_ner2json,
+    "iob": iob2json,
+    "jsonl": ner_jsonl2json,
 }

+# File types
+FILE_TYPES = ("json", "jsonl")
+

 @plac.annotations(
-    input_file=("input file", "positional", None, str),
-    output_dir=("output directory for converted file", "positional", None, str),
+    input_file=("Input file", "positional", None, str),
+    output_dir=("Output directory for converted file", "positional", None, str),
+    file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
     n_sents=("Number of sentences per doc", "option", "n", int),
     converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
     lang=("Language (if tokenizer required)", "option", "l", str),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool))
-def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
-            lang=None):
+    morphology=("Enable appending morphology to tags", "flag", "m", bool),
+)
+def convert(
+    input_file,
+    output_dir="-",
+    file_type="jsonl",
+    n_sents=1,
+    morphology=False,
+    converter="auto",
+    lang=None,
+):
     """
     Convert files into JSON format for use with train command and other
-    experiment management functions.
+    experiment management functions. If no output_dir is specified, the data
+    is written to stdout, so you can pipe them forward to a JSONL file:
+    $ spacy convert some_file.conllu > some_file.jsonl
     """
+    msg = Printer()
     input_path = Path(input_file)
-    output_path = Path(output_dir)
+    if file_type not in FILE_TYPES:
+        msg.fail(
+            Messages.M069.format(name=file_type),
+            Messages.M070.format(options=", ".join(FILE_TYPES)),
+            exits=1,
+        )
     if not input_path.exists():
-        prints(input_path, title=Messages.M028, exits=1)
-    if not output_path.exists():
-        prints(output_path, title=Messages.M029, exits=1)
-    if converter == 'auto':
+        msg.fail(Messages.M028, input_path, exits=1)
+    if output_dir != "-" and not Path(output_dir).exists():
+        msg.fail(Messages.M029, output_dir, exits=1)
+    if converter == "auto":
         converter = input_path.suffix[1:]
     if converter not in CONVERTERS:
-        prints(Messages.M031.format(converter=converter),
-               title=Messages.M030, exits=1)
+        msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
+    # Use converter function to convert data
     func = CONVERTERS[converter]
-    func(input_path, output_path,
-         n_sents=n_sents, use_morphology=morphology, lang=lang)
+    input_data = input_path.open("r", encoding="utf-8").read()
+    data = func(input_data, nsents=n_sents, use_morphology=morphology, lang=lang)
+    if output_dir != "-":
+        # Export data to a file
+        suffix = ".{}".format(file_type)
+        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
+        if file_type == "json":
+            write_json(output_file, data)
+        elif file_type == "jsonl":
+            write_jsonl(output_file, data)
+        msg.good(
+            Messages.M032.format(name=path2str(output_file)),
+            Messages.M033.format(n_docs=len(data)),
+        )
+    else:
+        # Print to stdout
+        if file_type == "json":
+            print(json_dumps(data))
+        elif file_type == "jsonl":
+            for line in data:
+                print(json_dumps(line))
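With the reworked signature, the converter can also be driven from Python rather than the command line; the stdout mode mentioned in the docstring corresponds to the default `output_dir="-"`. A hedged sketch (paths are hypothetical):

    from spacy.cli import convert

    # Write train.json into /tmp/converted, with the conllu converter picked
    # automatically from the file extension.
    convert("train.conllu", "/tmp/converted", file_type="json", n_sents=10)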
@@ -1,5 +1,5 @@
-from .conllu2json import conllu2json
-from .conllubio2json import conllubio2json
-from .iob2json import iob2json
-from .conll_ner2json import conll_ner2json
-from .jsonl2json import ner_jsonl2json
+from .conllu2json import conllu2json  # noqa: F401
+from .conllubio2json import conllubio2json  # noqa: F401
+from .iob2json import iob2json  # noqa: F401
+from .conll_ner2json import conll_ner2json  # noqa: F401
+from .jsonl2json import ner_jsonl2json  # noqa: F401
@@ -1,52 +1,38 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints
 from ...gold import iob_to_biluo


-def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
+def conll_ner2json(input_data, **kwargs):
     """
     Convert files in the CoNLL-2003 NER format into JSON format for use with
     train cli.
     """
-    docs = read_conll_ner(input_path)
-
-    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
-    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
-    output_file = output_path / output_filename
-    with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(docs))
-    prints(Messages.M033.format(n_docs=len(docs)),
-           title=Messages.M032.format(name=path2str(output_file)))
-
-
-def read_conll_ner(input_path):
-    text = input_path.open('r', encoding='utf-8').read()
-    i = 0
-    delimit_docs = '-DOCSTART- -X- O O'
+    delimit_docs = "-DOCSTART- -X- O O"
     output_docs = []
-    for doc in text.strip().split(delimit_docs):
+    for doc in input_data.strip().split(delimit_docs):
         doc = doc.strip()
         if not doc:
             continue
         output_doc = []
-        for sent in doc.split('\n\n'):
+        for sent in doc.split("\n\n"):
             sent = sent.strip()
             if not sent:
                 continue
-            lines = [line.strip() for line in sent.split('\n') if line.strip()]
+            lines = [line.strip() for line in sent.split("\n") if line.strip()]
             words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
             biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append({'tokens': [
-                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
-                zip(words, tags, biluo_ents)
-            ]})
-        output_docs.append({
-            'id': len(output_docs),
-            'paragraphs': [{'sentences': output_doc}]
-        })
+            output_doc.append(
+                {
+                    "tokens": [
+                        {"orth": w, "tag": tag, "ner": ent}
+                        for (w, tag, ent) in zip(words, tags, biluo_ents)
+                    ]
+                }
+            )
+        output_docs.append(
+            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
+        )
        output_doc = []
     return output_docs
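The converter above leans on `iob_to_biluo` to turn per-token IOB tags into the BILUO scheme used by spaCy's training format, e.g.:

    from spacy.gold import iob_to_biluo

    # A two-token PER entity becomes B-PER/L-PER, a single-token LOC becomes U-LOC.
    print(iob_to_biluo(["O", "B-PER", "I-PER", "O", "B-LOC"]))
    # ['O', 'B-PER', 'L-PER', 'O', 'U-LOC']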
@@ -1,34 +1,27 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints
-from ...gold import iob_to_biluo
 import re

+from ...gold import iob_to_biluo

-def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
+
+def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None):
     """
     Convert conllu files into JSON format for use with train cli.
     use_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.
-    """
-    # by @dvsrepo, via #11 explosion/spacy-dev-resources
-
-    """
     Extract NER tags if available and convert them so that they follow
     BILUO and the Wikipedia scheme
     """
+    # by @dvsrepo, via #11 explosion/spacy-dev-resources
     # by @katarkor

     docs = []
     sentences = []
-    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
+    conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
     checked_for_ner = False
     has_ner_tags = False

     for i, (raw_text, tokens) in enumerate(conll_tuples):
         sentence, brackets = tokens[0]
         if not checked_for_ner:

@@ -37,29 +30,19 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=
         sentences.append(generate_sentence(sentence, has_ner_tags))
         # Real-sized documents could be extracted using the comments on the
         # conluu document
-        if(len(sentences) % n_sents == 0):
+        if len(sentences) % n_sents == 0:
             doc = create_doc(sentences, i)
             docs.append(doc)
             sentences = []
-    output_filename = input_path.parts[-1].replace(".conll", ".json")
-    output_filename = input_path.parts[-1].replace(".conllu", ".json")
-    output_file = output_path / output_filename
-    with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(docs))
-    prints(Messages.M033.format(n_docs=len(docs)),
-           title=Messages.M032.format(name=path2str(output_file)))
+    return docs


 def is_ner(tag):
-
-    """
-    Check the 10th column of the first token to determine if the file contains
-    NER tags
-    """
-    tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
+    """
+    Check the 10th column of the first token to determine if the file contains
+    NER tags
+    """
+    tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
     if tag_match:
         return True
     elif tag == "O":

@@ -67,29 +50,29 @@ def is_ner(tag):
     else:
         return False

-def read_conllx(input_path, use_morphology=False, n=0):
-    text = input_path.open('r', encoding='utf-8').read()
+
+def read_conllx(input_data, use_morphology=False, n=0):
     i = 0
-    for sent in text.strip().split('\n\n'):
-        lines = sent.strip().split('\n')
+    for sent in input_data.strip().split("\n\n"):
+        lines = sent.strip().split("\n")
         if lines:
-            while lines[0].startswith('#'):
+            while lines[0].startswith("#"):
                 lines.pop(0)
             tokens = []
             for line in lines:

-                parts = line.split('\t')
+                parts = line.split("\t")
                 id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
-                if '-' in id_ or '.' in id_:
+                if "-" in id_ or "." in id_:
                     continue
                 try:
                     id_ = int(id_) - 1
-                    head = (int(head) - 1) if head != '0' else id_
-                    dep = 'ROOT' if dep == 'root' else dep
-                    tag = pos if tag == '_' else tag
-                    tag = tag+'__'+morph if use_morphology else tag
+                    head = (int(head) - 1) if head != "0" else id_
+                    dep = "ROOT" if dep == "root" else dep
+                    tag = pos if tag == "_" else tag
+                    tag = tag + "__" + morph if use_morphology else tag
                     tokens.append((id_, word, tag, head, dep, iob))
-                except:
+                except:  # noqa: E722
                     print(line)
                     raise
             tuples = [list(t) for t in zip(*tokens)]

@@ -98,31 +81,31 @@ def read_conllx(input_path, use_morphology=False, n=0):
             if n >= 1 and i >= n:
                 break


 def simplify_tags(iob):

     """
     Simplify tags obtained from the dataset in order to follow Wikipedia
     scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
     'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
     'MISC'.
     """

     new_iob = []
     for tag in iob:
-        tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
+        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
         if tag_match:
             prefix = tag_match.group(1)
             suffix = tag_match.group(2)
-            if suffix == 'GPE_LOC':
-                suffix = 'LOC'
-            elif suffix == 'GPE_ORG':
-                suffix = 'ORG'
-            elif suffix != 'PER' and suffix != 'LOC' and suffix != 'ORG':
-                suffix = 'MISC'
-            tag = prefix + '-' + suffix
+            if suffix == "GPE_LOC":
+                suffix = "LOC"
+            elif suffix == "GPE_ORG":
+                suffix = "ORG"
+            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
+                suffix = "MISC"
+            tag = prefix + "-" + suffix
         new_iob.append(tag)
     return new_iob


 def generate_sentence(sent, has_ner_tags):
     (id_, word, tag, head, dep, iob) = sent
     sentence = {}

@@ -144,7 +127,7 @@ def generate_sentence(sent, has_ner_tags):
     return sentence


-def create_doc(sentences,id):
+def create_doc(sentences, id):
     doc = {}
     paragraph = {}
     doc["id"] = id
@@ -1,65 +1,54 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...compat import json_dumps, path2str
-from ...util import prints
 from ...gold import iob_to_biluo

-def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
+def conllubio2json(input_data, n_sents=10, use_morphology=False, lang=None):
     """
     Convert conllu files into JSON format for use with train cli.
     use_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.
     """
     # by @dvsrepo, via #11 explosion/spacy-dev-resources

     docs = []
     sentences = []
-    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
+    conll_tuples = read_conllx(input_data, use_morphology=use_morphology)

     for i, (raw_text, tokens) in enumerate(conll_tuples):
         sentence, brackets = tokens[0]
         sentences.append(generate_sentence(sentence))
         # Real-sized documents could be extracted using the comments on the
         # conluu document
-        if(len(sentences) % n_sents == 0):
+        if len(sentences) % n_sents == 0:
             doc = create_doc(sentences, i)
             docs.append(doc)
             sentences = []
-    output_filename = input_path.parts[-1].replace(".conll", ".json")
-    output_filename = input_path.parts[-1].replace(".conllu", ".json")
-    output_file = output_path / output_filename
-    with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(docs))
-    prints("Created %d documents" % len(docs),
-           title="Generated output file %s" % path2str(output_file))
+    return docs


-def read_conllx(input_path, use_morphology=False, n=0):
-    text = input_path.open('r', encoding='utf-8').read()
+def read_conllx(input_data, use_morphology=False, n=0):
     i = 0
-    for sent in text.strip().split('\n\n'):
-        lines = sent.strip().split('\n')
+    for sent in input_data.strip().split("\n\n"):
+        lines = sent.strip().split("\n")
         if lines:
-            while lines[0].startswith('#'):
+            while lines[0].startswith("#"):
                 lines.pop(0)
             tokens = []
             for line in lines:
-                parts = line.split('\t')
+                parts = line.split("\t")
                 id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
-                if '-' in id_ or '.' in id_:
+                if "-" in id_ or "." in id_:
                     continue
                 try:
                     id_ = int(id_) - 1
-                    head = (int(head) - 1) if head != '0' else id_
-                    dep = 'ROOT' if dep == 'root' else dep
-                    tag = pos if tag == '_' else tag
-                    tag = tag+'__'+morph if use_morphology else tag
-                    ner = ner if ner else 'O'
+                    head = (int(head) - 1) if head != "0" else id_
+                    dep = "ROOT" if dep == "root" else dep
+                    tag = pos if tag == "_" else tag
+                    tag = tag + "__" + morph if use_morphology else tag
+                    ner = ner if ner else "O"
                     tokens.append((id_, word, tag, head, dep, ner))
-                except:
+                except:  # noqa: E722
                     print(line)
                     raise
             tuples = [list(t) for t in zip(*tokens)]
@@ -68,6 +57,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
             if n >= 1 and i >= n:
                 break


 def generate_sentence(sent):
     (id_, word, tag, head, dep, ner) = sent
     sentence = {}
@@ -85,7 +75,7 @@ def generate_sentence(sent):
     return sentence


-def create_doc(sentences,id):
+def create_doc(sentences, id):
     doc = {}
     paragraph = {}
     doc["id"] = id
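Usage sketch (not part of the diff): after this refactor the converters take the raw file contents as `input_data` and return a list of JSON-style doc dicts, leaving file handling to the calling CLI code. The file names below are made-up examples.

import json
from pathlib import Path

# Read the raw CoNLL-U BIO text and convert it; writing the output is now the caller's job.
input_path = Path("train.conllu")          # hypothetical example path
docs = conllubio2json(input_path.open("r", encoding="utf-8").read(), n_sents=10)
Path("train.json").write_text(json.dumps(docs, indent=2), encoding="utf-8")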
@@ -1,26 +1,24 @@
 # coding: utf8
 from __future__ import unicode_literals
-from cytoolz import partition_all, concat

-from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints
+from cytoolz import partition_all
 from ...gold import iob_to_biluo


-def iob2json(input_path, output_path, n_sents=10, *a, **k):
+def iob2json(input_data, n_sents=10, *args, **kwargs):
     """
     Convert IOB files into JSON format for use with train cli.
     """
-    with input_path.open('r', encoding='utf8') as file_:
-        sentences = read_iob(file_)
-    docs = merge_sentences(sentences, n_sents)
-    output_filename = input_path.parts[-1].replace(".iob", ".json")
-    output_file = output_path / output_filename
-    with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(docs))
-    prints(Messages.M033.format(n_docs=len(docs)),
-           title=Messages.M032.format(name=path2str(output_file)))
+    docs = []
+    for group in partition_all(n_sents, docs):
+        group = list(group)
+        first = group.pop(0)
+        to_extend = first["paragraphs"][0]["sentences"]
+        for sent in group[1:]:
+            to_extend.extend(sent["paragraphs"][0]["sentences"])
+        docs.append(first)
+    return docs


 def read_iob(raw_sents):
@@ -28,30 +26,20 @@ def read_iob(raw_sents):
     for line in raw_sents:
         if not line.strip():
             continue
-        tokens = [t.split('|') for t in line.split()]
+        tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
             words, pos, iob = zip(*tokens)
         else:
             words, iob = zip(*tokens)
-            pos = ['-'] * len(words)
+            pos = ["-"] * len(words)
         biluo = iob_to_biluo(iob)
-        sentences.append([
-            {'orth': w, 'tag': p, 'ner': ent}
-            for (w, p, ent) in zip(words, pos, biluo)
-        ])
-    sentences = [{'tokens': sent} for sent in sentences]
-    paragraphs = [{'sentences': [sent]} for sent in sentences]
-    docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
+        sentences.append(
+            [
+                {"orth": w, "tag": p, "ner": ent}
+                for (w, p, ent) in zip(words, pos, biluo)
+            ]
+        )
+    sentences = [{"tokens": sent} for sent in sentences]
+    paragraphs = [{"sentences": [sent]} for sent in sentences]
+    docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
     return docs

-
-def merge_sentences(docs, n_sents):
-    counter = 0
-    merged = []
-    for group in partition_all(n_sents, docs):
-        group = list(group)
-        first = group.pop(0)
-        to_extend = first['paragraphs'][0]['sentences']
-        for sent in group[1:]:
-            to_extend.extend(sent['paragraphs'][0]['sentences'])
-        merged.append(first)
-    return merged
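Illustration (not part of the diff): read_iob expects one sentence per line, with whitespace-separated tokens of the form word|tag|iob or word|iob, and converts the IOB tags to BILUO. The sentence below is a made-up example.

# Hypothetical input for read_iob; each token is "word|tag|iob" or "word|iob".
sample = ["Alex|NNP|B-PER works|VBZ|O at|IN|O Acme|NNP|B-ORG .|.|O"]
docs = read_iob(sample)   # one doc dict with tokenized sentences and BILUO "ner" values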
@@ -1,33 +1,21 @@
 # coding: utf8
 from __future__ import unicode_literals
-import ujson as json

+import ujson
+
+from ...util import get_lang_class
 from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints, get_lang_class
-from ...gold import docs_to_json


-def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
+def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
     if lang is None:
-        prints(Messages.M054, exits=True)
+        raise ValueError(Messages.M054)
     json_docs = []
-    input_tuples = list(read_jsonl(input_path))
+    input_tuples = [ujson.loads(line) for line in input_data]
     nlp = get_lang_class(lang)()
     for i, (raw_text, ents) in enumerate(input_tuples):
         doc = nlp.make_doc(raw_text)
         doc[0].is_sent_start = True
-        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents['entities']]
-        json_docs.append(docs_to_json(i, [doc]))
-    output_filename = input_path.parts[-1].replace(".jsonl", ".json")
-    output_loc = output_path / output_filename
-    with (output_loc).open('w', encoding='utf8') as file_:
-        file_.write(json_dumps(json_docs))
-    prints(Messages.M033.format(n_docs=len(json_docs)),
-           title=Messages.M032.format(name=path2str(output_loc)))
-
-
-def read_jsonl(input_path):
-    with input_path.open('r', encoding='utf8') as file_:
-        for line in file_:
-            yield json.loads(line)
+        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
+        json_docs.append(doc.to_json())
+    return json_docs
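Illustration (not part of the diff): each line of the JSONL input is parsed with ujson.loads into a (raw_text, ents) pair, where ents["entities"] holds (start, end, label) character offsets. The text and offsets below are made-up examples.

# Hypothetical JSONL line: [raw_text, {"entities": [[start, end, label], ...]}]
line = '["Uber is a company", {"entities": [[0, 4, "ORG"]]}]'
docs = ner_jsonl2json([line], lang="en")   # lang is required; None raises ValueError(M054)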
398  spacy/cli/debug_data.py  (new file)
@@ -0,0 +1,398 @@
+# coding: utf8
+from __future__ import unicode_literals, print_function
+
+from pathlib import Path
+from collections import Counter
+import plac
+import sys
+from wasabi import Printer, MESSAGES
+
+from ..gold import GoldCorpus, read_json_object
+from ..util import load_model, get_lang_class, read_json, read_jsonl
+
+# from .schemas import get_schema, validate_json
+from ._messages import Messages
+
+
+# Minimum number of expected occurences of label in data to train new label
+NEW_LABEL_THRESHOLD = 50
+# Minimum number of expected examples to train a blank model
+BLANK_MODEL_MIN_THRESHOLD = 100
+BLANK_MODEL_THRESHOLD = 2000
+
+
+@plac.annotations(
+    lang=("model language", "positional", None, str),
+    train_path=("location of JSON-formatted training data", "positional", None, Path),
+    dev_path=("location of JSON-formatted development data", "positional", None, Path),
+    base_model=("name of model to update (optional)", "option", "b", str),
+    pipeline=(
+        "Comma-separated names of pipeline components to train",
+        "option",
+        "p",
+        str,
+    ),
+    ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
+    ignore_validation=(
+        "Don't exit if JSON format validation fails",
+        "flag",
+        "IV",
+        bool,
+    ),
+    verbose=("Print additional information and explanations", "flag", "V", bool),
+    no_format=("Don't pretty-print the results", "flag", "NF", bool),
+)
+def debug_data(
+    lang,
+    train_path,
+    dev_path,
+    base_model=None,
+    pipeline="tagger,parser,ner",
+    ignore_warnings=False,
+    ignore_validation=False,
+    verbose=False,
+    no_format=False,
+):
+    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
+
+    # Make sure all files and paths exists if they are needed
+    if not train_path.exists():
+        msg.fail(Messages.M050, train_path, exits=1)
+    if not dev_path.exists():
+        msg.fail(Messages.M051, dev_path, exits=1)
+
+    # Initialize the model and pipeline
+    pipeline = [p.strip() for p in pipeline.split(",")]
+    if base_model:
+        nlp = load_model(base_model)
+    else:
+        lang_cls = get_lang_class(lang)
+        nlp = lang_cls()
+
+    msg.divider("Data format validation")
+    # Load the data in one – might take a while but okay in this case
+    with msg.loading("Loading {}...".format(train_path.parts[-1])):
+        train_data = _load_file(train_path, msg)
+    with msg.loading("Loading {}...".format(dev_path.parts[-1])):
+        dev_data = _load_file(dev_path, msg)
+
+    # Validate data format using the JSON schema
+    # TODO: update once the new format is ready
+    # schema = get_schema("training")
+    train_data_errors = []  # TODO: validate_json(train_data, schema)
+    dev_data_errors = []  # TODO: validate_json(dev_data, schema)
+    if not train_data_errors:
+        msg.good("Training data JSON format is valid")
+    if not dev_data_errors:
+        msg.good("Development data JSON format is valid")
+    for error in train_data_errors:
+        msg.fail("Training data: {}".format(error))
+    for error in dev_data_errors:
+        msg.fail("Develoment data: {}".format(error))
+    if (train_data_errors or dev_data_errors) and not ignore_validation:
+        sys.exit(1)
+
+    # Create the gold corpus to be able to better analyze data
+    with msg.loading("Analyzing corpus..."):
+        train_data = read_json_object(train_data)
+        dev_data = read_json_object(dev_data)
+        corpus = GoldCorpus(train_data, dev_data)
+        train_docs = list(corpus.train_docs(nlp))
+        dev_docs = list(corpus.dev_docs(nlp))
+    msg.good("Corpus is loadable")
+
+    # Create all gold data here to avoid iterating over the train_docs constantly
+    gold_data = _compile_gold(train_docs, pipeline)
+    train_texts = gold_data["texts"]
+    dev_texts = set([doc.text for doc, gold in dev_docs])
+
+    msg.divider("Training stats")
+    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
+    for pipe in [p for p in pipeline if p not in nlp.factories]:
+        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
+    if base_model:
+        msg.text("Starting with base model '{}'".format(base_model))
+    else:
+        msg.text("Starting with blank model '{}'".format(lang))
+    msg.text("{} training docs".format(len(train_docs)))
+    msg.text("{} evaluation docs".format(len(dev_docs)))
+
+    overlap = len(train_texts.intersection(dev_texts))
+    if overlap:
+        msg.warn("{} training examples also in evaluation data".format(overlap))
+    else:
+        msg.good("No overlap between training and evaluation data")
+    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
+        text = "Low number of examples to train from a blank model ({})".format(
+            len(train_docs)
+        )
+        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
+            msg.fail(text)
+        else:
+            msg.warn(text)
+        msg.text(
+            "It's recommended to use at least {} examples (minimum {})".format(
+                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
+            ),
+            show=verbose,
+        )
+
+    msg.divider("Vocab & Vectors")
+    n_words = gold_data["n_words"]
+    msg.info(
+        "{} total {} in the data ({} unique)".format(
+            n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
+        )
+    )
+    most_common_words = gold_data["words"].most_common(10)
+    msg.text(
+        "10 most common words: {}".format(
+            _format_labels(most_common_words, counts=True)
+        ),
+        show=verbose,
+    )
+    if len(nlp.vocab.vectors):
+        msg.info(
+            "{} vectors ({} unique keys, {} dimensions)".format(
+                len(nlp.vocab.vectors),
+                nlp.vocab.vectors.n_keys,
+                nlp.vocab.vectors_length,
+            )
+        )
+    else:
+        msg.info("No word vectors present in the model")
+
+    if "ner" in pipeline:
+        # Get all unique NER labels present in the data
+        labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
+        label_counts = gold_data["ner"]
+        model_labels = _get_labels_from_model(nlp, "ner")
+        new_labels = [l for l in labels if l not in model_labels]
+        existing_labels = [l for l in labels if l in model_labels]
+        has_low_data_warning = False
+        has_no_neg_warning = False
+
+        msg.divider("Named Entity Recognition")
+        msg.info(
+            "{} new {}, {} existing {}".format(
+                len(new_labels),
+                "label" if len(new_labels) == 1 else "labels",
+                len(existing_labels),
+                "label" if len(existing_labels) == 1 else "labels",
+            )
+        )
+        missing_values = label_counts["-"]
+        msg.text(
+            "{} missing {} (tokens with '-' label)".format(
+                missing_values, "value" if missing_values == 1 else "values"
+            )
+        )
+        if new_labels:
+            labels_with_counts = [
+                (label, count)
+                for label, count in label_counts.most_common()
+                if label != "-"
+            ]
+            labels_with_counts = _format_labels(labels_with_counts, counts=True)
+            msg.text("New: {}".format(labels_with_counts), show=verbose)
+        if existing_labels:
+            msg.text(
+                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
+            )
+
+        for label in new_labels:
+            if label_counts[label] <= NEW_LABEL_THRESHOLD:
+                msg.warn(
+                    "Low number of examples for new label '{}' ({})".format(
+                        label, label_counts[label]
+                    )
+                )
+                has_low_data_warning = True
+
+                with msg.loading("Analyzing label distribution..."):
+                    neg_docs = _get_examples_without_label(train_docs, label)
+                if neg_docs == 0:
+                    msg.warn(
+                        "No examples for texts WITHOUT new label '{}'".format(label)
+                    )
+                    has_no_neg_warning = True
+
+        if not has_low_data_warning:
+            msg.good("Good amount of examples for all labels")
+        if not has_no_neg_warning:
+            msg.good("Examples without occurences available for all labels")
+
+        if has_low_data_warning:
+            msg.text(
+                "To train a new entity type, your data should include at "
+                "least {} insteances of the new label".format(NEW_LABEL_THRESHOLD),
+                show=verbose,
+            )
+        if has_no_neg_warning:
+            msg.text(
+                "Training data should always include examples of entities "
+                "in context, as well as examples without a given entity "
+                "type.",
+                show=verbose,
+            )
+
+    if "textcat" in pipeline:
+        msg.divider("Text Classification")
+        labels = [label for label in gold_data["textcat"]]
+        model_labels = _get_labels_from_model(nlp, "textcat")
+        new_labels = [l for l in labels if l not in model_labels]
+        existing_labels = [l for l in labels if l in model_labels]
+        msg.info(
+            "Text Classification: {} new label(s), {} existing label(s)".format(
+                len(new_labels), len(existing_labels)
+            )
+        )
+        if new_labels:
+            labels_with_counts = _format_labels(
+                gold_data["textcat"].most_common(), counts=True
+            )
+            msg.text("New: {}".format(labels_with_counts), show=verbose)
+        if existing_labels:
+            msg.text(
+                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
+            )
+
+    if "tagger" in pipeline:
+        msg.divider("Part-of-speech Tagging")
+        labels = [label for label in gold_data["tags"]]
+        tag_map = nlp.Defaults.tag_map
+        msg.info(
+            "{} {} in data ({} {} in tag map)".format(
+                len(labels),
+                "label" if len(labels) == 1 else "labels",
+                len(tag_map),
+                "label" if len(tag_map) == 1 else "labels",
+            )
+        )
+        labels_with_counts = _format_labels(
+            gold_data["tags"].most_common(), counts=True
+        )
+        msg.text(labels_with_counts, show=verbose)
+        non_tagmap = [l for l in labels if l not in tag_map]
+        if not non_tagmap:
+            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
+        for label in non_tagmap:
+            msg.fail(
+                "Label '{}' not found in tag map for language '{}'".format(
+                    label, nlp.lang
+                )
+            )
+
+    if "parser" in pipeline:
+        msg.divider("Dependency Parsing")
+        labels = [label for label in gold_data["deps"]]
+        msg.info(
+            "{} {} in data".format(
+                len(labels), "label" if len(labels) == 1 else "labels"
+            )
+        )
+        labels_with_counts = _format_labels(
+            gold_data["deps"].most_common(), counts=True
+        )
+        msg.text(labels_with_counts, show=verbose)
+
+    msg.divider("Summary")
+    good_counts = msg.counts[MESSAGES.GOOD]
+    warn_counts = msg.counts[MESSAGES.WARN]
+    fail_counts = msg.counts[MESSAGES.FAIL]
+    if good_counts:
+        msg.good(
+            "{} {} passed".format(
+                good_counts, "check" if good_counts == 1 else "checks"
+            )
+        )
+    if warn_counts:
+        msg.warn(
+            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
+        )
+    if fail_counts:
+        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
+
+    if fail_counts:
+        sys.exit(1)
+
+
+def _load_file(file_path, msg):
+    file_name = file_path.parts[-1]
+    if file_path.suffix == ".json":
+        data = read_json(file_path)
+        msg.good("Loaded {}".format(file_name))
+        return data
+    elif file_path.suffix == ".jsonl":
+        data = read_jsonl(file_path)
+        msg.good("Loaded {}".format(file_name))
+        return data
+    msg.fail(
+        "Can't load file extension {}".format(file_path.suffix),
+        "Expected .json or .jsonl",
+        exits=1,
+    )
+
+
+def _compile_gold(train_docs, pipeline):
+    data = {
+        "ner": Counter(),
+        "cats": Counter(),
+        "tags": Counter(),
+        "deps": Counter(),
+        "words": Counter(),
+        "n_words": 0,
+        "texts": set(),
+    }
+    for doc, gold in train_docs:
+        data["words"].update(gold.words)
+        data["n_words"] += len(gold.words)
+        data["texts"].add(doc.text)
+        if "ner" in pipeline:
+            for label in gold.ner:
+                if label.startswith(("B-", "U-")):
+                    combined_label = label.split("-")[1]
+                    data["ner"][combined_label] += 1
+                elif label == "-":
+                    data["ner"]["-"] += 1
+        if "textcat" in pipeline:
+            data["cats"].update(gold.cats)
+        if "tagger" in pipeline:
+            data["tags"].update(gold.tags)
+        if "parser" in pipeline:
+            data["deps"].update(gold.labels)
+    return data
+
+
+def _format_labels(labels, counts=False):
+    if counts:
+        return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
+    return ", ".join(["'{}'".format(l) for l in labels])
+
+
+def _get_ner_counts(data):
+    counter = Counter()
+    for doc, gold in data:
+        for label in gold.ner:
+            if label.startswith(("B-", "U-")):
+                combined_label = label.split("-")[1]
+                counter[combined_label] += 1
+            elif label == "-":
+                counter["-"] += 1
+    return counter
+
+
+def _get_examples_without_label(data, label):
+    count = 0
+    for doc, gold in data:
+        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        if label not in labels:
+            count += 1
+    return count
+
+
+def _get_labels_from_model(nlp, pipe_name):
+    if pipe_name not in nlp.pipe_names:
+        return set()
+    pipe = nlp.get_pipe(pipe_name)
+    return pipe.labels
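Usage sketch (not part of the diff): debug_data() is a plain function behind the new command, so it can also be called directly with the signature added above. The paths and pipeline below are made-up examples; the CLI registration itself is not shown in this hunk.

from pathlib import Path

# Hypothetical direct call; train.json/dev.json are example file names.
debug_data(
    "en",
    Path("train.json"),
    Path("dev.json"),
    pipeline="ner",
    verbose=True,
)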
@@ -6,34 +6,37 @@ import requests
 import os
 import subprocess
 import sys
+from wasabi import Printer

 from ._messages import Messages
 from .link import link
-from ..util import prints, get_package_path
+from ..util import get_package_path
 from .. import about


+msg = Printer()
+
+
 @plac.annotations(
-    model=("model to download, shortcut or name)", "positional", None, str),
-    direct=("force direct download. Needs model name with version and won't "
-            "perform compatibility check", "flag", "d", bool),
-    pip_args=("additional arguments to be passed to `pip install` when "
-              "installing the model"))
+    model=("Model to download (shortcut or name)", "positional", None, str),
+    direct=("Force direct download of name + version", "flag", "d", bool),
+    pip_args=("additional arguments to be passed to `pip install` on model install"),
+)
 def download(model, direct=False, *pip_args):
     """
     Download compatible model from default download path using pip. Model
     can be shortcut, model name or, if --direct flag is set, full model name
-    with version.
+    with version. For direct downloads, the compatibility check will be skipped.
     """
     if direct:
-        dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args)
+        dl = download_model("{m}/{m}.tar.gz#egg={m}".format(m=model), pip_args)
     else:
         shortcuts = get_json(about.__shortcuts__, "available shortcuts")
         model_name = shortcuts.get(model, model)
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
-        dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}'
-                            .format(m=model_name, v=version), pip_args)
+        dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
+        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
         if dl != 0:  # if download subprocess doesn't return 0, exit
             sys.exit(dl)
         try:
@@ -43,44 +46,49 @@ def download(model, direct=False, *pip_args):
             # subprocess
             package_path = get_package_path(model_name)
             link(model_name, model, force=True, model_path=package_path)
-        except:
+        except:  # noqa: E722
             # Dirty, but since spacy.download and the auto-linking is
             # mostly a convenience wrapper, it's best to show a success
             # message and loading instructions, even if linking fails.
-            prints(Messages.M001, title=Messages.M002.format(name=model_name))
+            msg.warn(Messages.M002.format(name=model_name), Messages.M001)


 def get_json(url, desc):
     r = requests.get(url)
     if r.status_code != 200:
-        prints(Messages.M004.format(desc=desc, version=about.__version__),
-               title=Messages.M003.format(code=r.status_code), exits=1)
+        msg.fail(
+            Messages.M003.format(code=r.status_code),
+            Messages.M004.format(desc=desc, version=about.__version__),
+            exits=1,
+        )
     return r.json()


 def get_compatibility():
     version = about.__version__
-    version = version.rsplit('.dev', 1)[0]
+    version = version.rsplit(".dev", 1)[0]
     comp_table = get_json(about.__compatibility__, "compatibility table")
-    comp = comp_table['spacy']
+    comp = comp_table["spacy"]
     if version not in comp:
-        prints(Messages.M006.format(version=version), title=Messages.M005,
-               exits=1)
+        msg.fail(Messages.M005, Messages.M006.format(version=version), exits=1)
     return comp[version]


 def get_version(model, comp):
-    model = model.rsplit('.dev', 1)[0]
+    model = model.rsplit(".dev", 1)[0]
     if model not in comp:
-        prints(Messages.M007.format(name=model, version=about.__version__),
-               title=Messages.M005, exits=1)
+        msg.fail(
+            Messages.M005,
+            Messages.M007.format(name=model, version=about.__version__),
+            exits=1,
+        )
     return comp[model][0]


 def download_model(filename, user_pip_args=None):
-    download_url = about.__download_url__ + '/' + filename
-    pip_args = ['--no-cache-dir', '--no-deps']
+    download_url = about.__download_url__ + "/" + filename
+    pip_args = ["--no-cache-dir", "--no-deps"]
     if user_pip_args:
         pip_args.extend(user_pip_args)
-    cmd = [sys.executable, '-m', 'pip', 'install'] + pip_args + [download_url]
+    cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
     return subprocess.call(cmd, env=os.environ.copy())
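Illustration (not part of the diff): the refactor keeps the same egg-fragment download scheme but builds it from the new dl_tpl string. The model name and version below are examples only.

dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
print(dl_tpl.format(m="en_core_web_sm", v="2.0.0"))
# -> en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0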
@@ -3,30 +3,35 @@ from __future__ import unicode_literals, division, print_function

 import plac
 from timeit import default_timer as timer
+from wasabi import Printer

 from ._messages import Messages
 from ..gold import GoldCorpus
-from ..util import prints
 from .. import util
 from .. import displacy


 @plac.annotations(
-    model=("model name or path", "positional", None, str),
-    data_path=("location of JSON-formatted evaluation data", "positional",
-               None, str),
-    gold_preproc=("use gold preprocessing", "flag", "G", bool),
-    gpu_id=("use GPU", "option", "g", int),
-    displacy_path=("directory to output rendered parses as HTML", "option",
-                   "dp", str),
-    displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
-def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None,
-             displacy_limit=25):
+    model=("Model name or path", "positional", None, str),
+    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
+    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    gpu_id=("Use GPU", "option", "g", int),
+    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
+    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
+)
+def evaluate(
+    model,
+    data_path,
+    gpu_id=-1,
+    gold_preproc=False,
+    displacy_path=None,
+    displacy_limit=25,
+):
     """
     Evaluate a model. To render a sample of parses in a HTML file, set an
     output directory as the displacy_path argument.
     """
+    msg = Printer()
     util.fix_random_seed()
     if gpu_id >= 0:
         util.use_gpu(gpu_id)
@@ -34,9 +39,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
     data_path = util.ensure_path(data_path)
     displacy_path = util.ensure_path(displacy_path)
     if not data_path.exists():
-        prints(data_path, title=Messages.M034, exits=1)
+        msg.fail(Messages.M034, data_path, exits=1)
     if displacy_path and not displacy_path.exists():
-        prints(displacy_path, title=Messages.M035, exits=1)
+        msg.fail(Messages.M035, displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
     nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@@ -44,65 +49,80 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
     scorer = nlp.evaluate(dev_docs, verbose=False)
     end = timer()
     nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
-    print_results(scorer, time=end - begin, words=nwords,
-                  wps=nwords / (end - begin))
+    results = {
+        "Time": "%.2f s" % end - begin,
+        "Words": nwords,
+        "Words/s": "%.0f" % nwords / (end - begin),
+        "TOK": "%.2f" % scorer.token_acc,
+        "POS": "%.2f" % scorer.tags_acc,
+        "UAS": "%.2f" % scorer.uas,
+        "LAS": "%.2f" % scorer.las,
+        "NER P": "%.2f" % scorer.ents_p,
+        "NER R": "%.2f" % scorer.ents_r,
+        "NER F": "%.2f" % scorer.ents_f,
+    }
+    msg.table(results, title="Results")
+
     if displacy_path:
         docs, golds = zip(*dev_docs)
-        render_deps = 'parser' in nlp.meta.get('pipeline', [])
-        render_ents = 'ner' in nlp.meta.get('pipeline', [])
-        render_parses(docs, displacy_path, model_name=model,
-                      limit=displacy_limit, deps=render_deps, ents=render_ents)
-        prints(displacy_path, title=Messages.M036.format(n=displacy_limit))
+        render_deps = "parser" in nlp.meta.get("pipeline", [])
+        render_ents = "ner" in nlp.meta.get("pipeline", [])
+        render_parses(
+            docs,
+            displacy_path,
+            model_name=model,
+            limit=displacy_limit,
+            deps=render_deps,
+            ents=render_ents,
+        )
+        msg.good(Messages.M036.format(n=displacy_limit), displacy_path)


-def render_parses(docs, output_path, model_name='', limit=250, deps=True,
-                  ents=True):
-    docs[0].user_data['title'] = model_name
+def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
+    docs[0].user_data["title"] = model_name
     if ents:
-        with (output_path / 'entities.html').open('w') as file_:
-            html = displacy.render(docs[:limit], style='ent', page=True)
+        with (output_path / "entities.html").open("w") as file_:
+            html = displacy.render(docs[:limit], style="ent", page=True)
             file_.write(html)
     if deps:
-        with (output_path / 'parses.html').open('w') as file_:
-            html = displacy.render(docs[:limit], style='dep', page=True,
-                                   options={'compact': True})
+        with (output_path / "parses.html").open("w") as file_:
+            html = displacy.render(
+                docs[:limit], style="dep", page=True, options={"compact": True}
+            )
             file_.write(html)


 def print_progress(itn, losses, dev_scores, wps=0.0):
     scores = {}
-    for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
-                'ents_p', 'ents_r', 'ents_f', 'wps']:
+    for col in [
+        "dep_loss",
+        "tag_loss",
+        "uas",
+        "tags_acc",
+        "token_acc",
+        "ents_p",
+        "ents_r",
+        "ents_f",
+        "wps",
+    ]:
         scores[col] = 0.0
-    scores['dep_loss'] = losses.get('parser', 0.0)
-    scores['ner_loss'] = losses.get('ner', 0.0)
-    scores['tag_loss'] = losses.get('tagger', 0.0)
+    scores["dep_loss"] = losses.get("parser", 0.0)
+    scores["ner_loss"] = losses.get("ner", 0.0)
+    scores["tag_loss"] = losses.get("tagger", 0.0)
     scores.update(dev_scores)
-    scores['wps'] = wps
-    tpl = '\t'.join((
-        '{:d}',
-        '{dep_loss:.3f}',
-        '{ner_loss:.3f}',
-        '{uas:.3f}',
-        '{ents_p:.3f}',
-        '{ents_r:.3f}',
-        '{ents_f:.3f}',
-        '{tags_acc:.3f}',
-        '{token_acc:.3f}',
-        '{wps:.1f}'))
+    scores["wps"] = wps
+    tpl = "\t".join(
+        (
+            "{:d}",
+            "{dep_loss:.3f}",
+            "{ner_loss:.3f}",
+            "{uas:.3f}",
+            "{ents_p:.3f}",
+            "{ents_r:.3f}",
+            "{ents_f:.3f}",
+            "{tags_acc:.3f}",
+            "{token_acc:.3f}",
+            "{wps:.1f}",
+        )
+    )
     print(tpl.format(itn, **scores))
-
-
-def print_results(scorer, time, words, wps):
-    results = {
-        'Time': '%.2f s' % time,
-        'Words': words,
-        'Words/s': '%.0f' % wps,
-        'TOK': '%.2f' % scorer.token_acc,
-        'POS': '%.2f' % scorer.tags_acc,
-        'UAS': '%.2f' % scorer.uas,
-        'LAS': '%.2f' % scorer.las,
-        'NER P': '%.2f' % scorer.ents_p,
-        'NER R': '%.2f' % scorer.ents_r,
-        'NER F': '%.2f' % scorer.ents_f}
-    util.print_table(results, title="Results")
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import plac
 import platform
 from pathlib import Path
+from wasabi import Printer

 from ._messages import Messages
 from ..compat import path2str
@@ -12,56 +13,65 @@ from .. import about


 @plac.annotations(
-    model=("optional: shortcut link of model", "positional", None, str),
-    markdown=("generate Markdown for GitHub issues", "flag", "md", str),
-    silent=("don't print anything (just return)", "flag", "s"))
+    model=("Optional shortcut link of model", "positional", None, str),
+    markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
+    silent=("Don't print anything (just return)", "flag", "s"),
+)
 def info(model=None, markdown=False, silent=False):
-    """Print info about spaCy installation. If a model shortcut link is
+    """
+    Print info about spaCy installation. If a model shortcut link is
     speficied as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.
     """
+    msg = Printer()
     if model:
         if util.is_package(model):
             model_path = util.get_package_path(model)
         else:
             model_path = util.get_data_path() / model
-        meta_path = model_path / 'meta.json'
+        meta_path = model_path / "meta.json"
         if not meta_path.is_file():
-            util.prints(meta_path, title=Messages.M020, exits=1)
+            msg.fail(Messages.M020, meta_path, exits=1)
         meta = util.read_json(meta_path)
         if model_path.resolve() != model_path:
-            meta['link'] = path2str(model_path)
-            meta['source'] = path2str(model_path.resolve())
+            meta["link"] = path2str(model_path)
+            meta["source"] = path2str(model_path.resolve())
         else:
-            meta['source'] = path2str(model_path)
+            meta["source"] = path2str(model_path)
         if not silent:
-            print_info(meta, 'model %s' % model, markdown)
+            title = "Info about model '{}'".format(model)
+            model_meta = {
+                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
+            }
+            if markdown:
+                util.print_markdown(model_meta, title=title)
+            else:
+                msg.table(model_meta, title=title)
         return meta
-    data = {'spaCy version': about.__version__,
-            'Location': path2str(Path(__file__).parent.parent),
-            'Platform': platform.platform(),
-            'Python version': platform.python_version(),
-            'Models': list_models()}
+    data = {
+        "spaCy version": about.__version__,
+        "Location": path2str(Path(__file__).parent.parent),
+        "Platform": platform.platform(),
+        "Python version": platform.python_version(),
+        "Models": list_models(),
+    }
     if not silent:
-        print_info(data, 'spaCy', markdown)
+        title = "Info about spaCy"
+        if markdown:
+            util.print_markdown(data, title=title)
+        else:
+            msg.table(data, title=title)
     return data


-def print_info(data, title, markdown):
-    title = 'Info about %s' % title
-    if markdown:
-        util.print_markdown(data, title=title)
-    else:
-        util.print_table(data, title=title)
-
-
 def list_models():
     def exclude_dir(dir_name):
         # exclude common cache directories and hidden directories
-        exclude = ['cache', 'pycache', '__pycache__']
-        return dir_name in exclude or dir_name.startswith('.')
+        exclude = ("cache", "pycache", "__pycache__")
+        return dir_name in exclude or dir_name.startswith(".")

     data_path = util.get_data_path()
     if data_path:
         models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
-        return ', '.join([m for m in models if not exclude_dir(m)])
-    return '-'
+        return ", ".join([m for m in models if not exclude_dir(m)])
+    return "-"
@@ -11,13 +11,12 @@ from preshed.counter import PreshCounter
 import tarfile
 import gzip
 import zipfile
-import ujson as json
-from spacy.lexeme import intify_attrs
+from wasabi import Printer

 from ._messages import Messages
 from ..vectors import Vectors
 from ..errors import Errors, Warnings, user_warning
-from ..util import prints, ensure_path, get_lang_class
+from ..util import ensure_path, get_lang_class, read_jsonl

 try:
     import ftfy
@@ -25,121 +24,133 @@ except ImportError:
     ftfy = None


+msg = Printer()
+
+
 @plac.annotations(
-    lang=("model language", "positional", None, str),
-    output_dir=("model output directory", "positional", None, Path),
-    freqs_loc=("location of words frequencies file", "option", "f", Path),
-    jsonl_loc=("location of JSONL-formatted attributes file", "option", "j", Path),
-    clusters_loc=("optional: location of brown clusters data",
-                  "option", "c", str),
-    vectors_loc=("optional: location of vectors file in Word2Vec format "
-                 "(either as .txt or zipped as .zip or .tar.gz)", "option",
-                 "v", str),
-    prune_vectors=("optional: number of vectors to prune to",
-                   "option", "V", int)
+    lang=("Model language", "positional", None, str),
+    output_dir=("Model output directory", "positional", None, Path),
+    freqs_loc=("Location of words frequencies file", "option", "f", Path),
+    jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
+    clusters_loc=("Optional location of brown clusters data", "option", "c", str),
+    vectors_loc=("Optional vectors file in Word2Vec format" "option", "v", str),
+    prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
 )
-def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, jsonl_loc=None,
-               vectors_loc=None, prune_vectors=-1):
+def init_model(
+    lang,
+    output_dir,
+    freqs_loc=None,
+    clusters_loc=None,
+    jsonl_loc=None,
+    vectors_loc=None,
+    prune_vectors=-1,
+):
     """
     Create a new model from raw data, like word frequencies, Brown clusters
-    and word vectors.
+    and word vectors. If vectors are provided in Word2Vec format, they can
+    be either a .txt or zipped as a .zip or .tar.gz.
     """
     if jsonl_loc is not None:
         if freqs_loc is not None or clusters_loc is not None:
-            settings = ['-j']
+            settings = ["-j"]
             if freqs_loc:
-                settings.append('-f')
+                settings.append("-f")
             if clusters_loc:
-                settings.append('-c')
-            prints(' '.join(settings),
-                   title=(
-                       "The -f and -c arguments are deprecated, and not compatible "
-                       "with the -j argument, which should specify the same information. "
-                       "Either merge the frequencies and clusters data into the "
-                       "jsonl-formatted file (recommended), or use only the -f and "
-                       "-c files, without the other lexical attributes."))
+                settings.append("-c")
+            msg.warn(Messages.M063, Messages.M064)
         jsonl_loc = ensure_path(jsonl_loc)
-        lex_attrs = (json.loads(line) for line in jsonl_loc.open())
+        lex_attrs = read_jsonl(jsonl_loc)
     else:
         clusters_loc = ensure_path(clusters_loc)
         freqs_loc = ensure_path(freqs_loc)
         if freqs_loc is not None and not freqs_loc.exists():
-            prints(freqs_loc, title=Messages.M037, exits=1)
+            msg.fail(Messages.M037, freqs_loc, exits=1)
         lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

-    nlp = create_model(lang, lex_attrs)
+    with msg.loading("Creating model..."):
+        nlp = create_model(lang, lex_attrs)
+    msg.good("Successfully created model")
     if vectors_loc is not None:
         add_vectors(nlp, vectors_loc, prune_vectors)
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
-    prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
-           title=Messages.M038)
+    msg.good(Messages.M038, Messages.M039.format(entries=lex_added, vectors=vec_added))
     if not output_dir.exists():
         output_dir.mkdir()
     nlp.to_disk(output_dir)
     return nlp


 def open_file(loc):
-    '''Handle .gz, .tar.gz or unzipped files'''
+    """Handle .gz, .tar.gz or unzipped files"""
     loc = ensure_path(loc)
-    print("Open loc")
     if tarfile.is_tarfile(str(loc)):
-        return tarfile.open(str(loc), 'r:gz')
-    elif loc.parts[-1].endswith('gz'):
-        return (line.decode('utf8') for line in gzip.open(str(loc), 'r'))
-    elif loc.parts[-1].endswith('zip'):
+        return tarfile.open(str(loc), "r:gz")
+    elif loc.parts[-1].endswith("gz"):
+        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
+    elif loc.parts[-1].endswith("zip"):
         zip_file = zipfile.ZipFile(str(loc))
         names = zip_file.namelist()
         file_ = zip_file.open(names[0])
-        return (line.decode('utf8') for line in file_)
+        return (line.decode("utf8") for line in file_)
     else:
-        return loc.open('r', encoding='utf8')
+        return loc.open("r", encoding="utf8")


 def read_attrs_from_deprecated(freqs_loc, clusters_loc):
-    probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
-    clusters = read_clusters(clusters_loc) if clusters_loc else {}
+    with msg.loading("Counting frequencies..."):
+        probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
+    msg.good("Counted frequencies")
+    with msg.loading("Reading clusters..."):
+        clusters = read_clusters(clusters_loc) if clusters_loc else {}
+    msg.good("Read clusters")
     lex_attrs = []
     sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
     for i, (word, prob) in tqdm(enumerate(sorted_probs)):
-        attrs = {'orth': word, 'id': i, 'prob': prob}
+        attrs = {"orth": word, "id": i, "prob": prob}
         # Decode as a little-endian string, so that we can do & 15 to get
         # the first 4 bits. See _parse_features.pyx
         if word in clusters:
-            attrs['cluster'] = int(clusters[word][::-1], 2)
+            attrs["cluster"] = int(clusters[word][::-1], 2)
         else:
-            attrs['cluster'] = 0
+            attrs["cluster"] = 0
         lex_attrs.append(attrs)
     return lex_attrs


 def create_model(lang, lex_attrs):
-    print("Creating model...")
     lang_class = get_lang_class(lang)
     nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = 0
     lex_added = 0
     for attrs in lex_attrs:
-        if 'settings' in attrs:
+        if "settings" in attrs:
             continue
-        lexeme = nlp.vocab[attrs['orth']]
+        lexeme = nlp.vocab[attrs["orth"]]
         lexeme.set_attrs(**attrs)
         lexeme.is_oov = False
         lex_added += 1
         lex_added += 1
     oov_prob = min(lex.prob for lex in nlp.vocab)
-    nlp.vocab.cfg.update({'oov_prob': oov_prob-1})
+    nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
     return nlp


 def add_vectors(nlp, vectors_loc, prune_vectors):
     vectors_loc = ensure_path(vectors_loc)
-    if vectors_loc and vectors_loc.parts[-1].endswith('.npz'):
-        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open('rb')))
+    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
+        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
         for lex in nlp.vocab:
             if lex.rank:
                 nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
-        vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
+        if vectors_loc:
+            with msg.loading("Reading vectors from {}".format(vectors_loc)):
+                vectors_data, vector_keys = read_vectors(vectors_loc)
+            msg.good("Loaded vectors from {}".format(vectors_loc))
+        else:
+            vectors_data, vector_keys = (None, None)
     if vector_keys is not None:
         for word in vector_keys:
             if word not in nlp.vocab:
@@ -147,35 +158,34 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
                 lexeme.is_oov = False
     if vectors_data is not None:
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
-        nlp.vocab.vectors.name = '%s_model.vectors' % nlp.meta['lang']
-        nlp.meta['vectors']['name'] = nlp.vocab.vectors.name
+        nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
+        nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)


 def read_vectors(vectors_loc):
-    print("Reading vectors from %s" % vectors_loc)
     f = open_file(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
-    vectors_data = numpy.zeros(shape=shape, dtype='f')
+    vectors_data = numpy.zeros(shape=shape, dtype="f")
     vectors_keys = []
     for i, line in enumerate(tqdm(f)):
         line = line.rstrip()
-        pieces = line.rsplit(' ', vectors_data.shape[1]+1)
+        pieces = line.rsplit(" ", vectors_data.shape[1] + 1)
         word = pieces.pop(0)
         if len(pieces) != vectors_data.shape[1]:
             raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
|
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
||||||
vectors_data[i] = numpy.asarray(pieces, dtype='f')
|
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
||||||
vectors_keys.append(word)
|
vectors_keys.append(word)
|
||||||
return vectors_data, vectors_keys
|
return vectors_data, vectors_keys
|
||||||
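For context, a sketch of the plain-text layout read_vectors() expects (the words and numbers here are invented): a header line "<rows> <dims>", then one whitespace-separated row per word.

sample = """2 3
apple 0.1 0.2 0.3
banana 0.4 0.5 0.6
"""
lines = sample.splitlines()
shape = tuple(int(size) for size in lines[0].split())          # (2, 3)
rows = [line.rsplit(" ", shape[1] + 1) for line in lines[1:]]
words = [pieces[0] for pieces in rows]                         # ["apple", "banana"]
vectors = [[float(p) for p in pieces[1:]] for pieces in rows]  # 2 rows of 3 floats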
|
|
||||||
|
|
||||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
print("Counting frequencies...")
|
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
total = 0
|
total = 0
|
||||||
with freqs_loc.open() as f:
|
with freqs_loc.open() as f:
|
||||||
for i, line in enumerate(f):
|
for i, line in enumerate(f):
|
||||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||||
freq = int(freq)
|
freq = int(freq)
|
||||||
counts.inc(i + 1, freq)
|
counts.inc(i + 1, freq)
|
||||||
total += freq
|
total += freq
|
||||||
|
@ -184,7 +194,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
probs = {}
|
probs = {}
|
||||||
with freqs_loc.open() as f:
|
with freqs_loc.open() as f:
|
||||||
for line in tqdm(f):
|
for line in tqdm(f):
|
||||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||||
doc_freq = int(doc_freq)
|
doc_freq = int(doc_freq)
|
||||||
freq = int(freq)
|
freq = int(freq)
|
||||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
||||||
|
@ -196,7 +206,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
|
|
||||||
|
|
||||||
def read_clusters(clusters_loc):
|
def read_clusters(clusters_loc):
|
||||||
print("Reading clusters...")
|
|
||||||
clusters = {}
|
clusters = {}
|
||||||
if ftfy is None:
|
if ftfy is None:
|
||||||
user_warning(Warnings.W004)
|
user_warning(Warnings.W004)
|
||||||
|
@ -213,7 +222,7 @@ def read_clusters(clusters_loc):
|
||||||
if int(freq) >= 3:
|
if int(freq) >= 3:
|
||||||
clusters[word] = cluster
|
clusters[word] = cluster
|
||||||
else:
|
else:
|
||||||
clusters[word] = '0'
|
clusters[word] = "0"
|
||||||
# Expand clusters with re-casing
|
# Expand clusters with re-casing
|
||||||
for word, cluster in list(clusters.items()):
|
for word, cluster in list(clusters.items()):
|
||||||
if word.lower() not in clusters:
|
if word.lower() not in clusters:
|
||||||
|
|
|
@ -3,51 +3,54 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import symlink_to, path2str
|
from ..compat import symlink_to, path2str
|
||||||
from ..util import prints
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
origin=("package name or local path to model", "positional", None, str),
|
origin=("package name or local path to model", "positional", None, str),
|
||||||
link_name=("name of shortcut link to create", "positional", None, str),
|
link_name=("name of shortcut link to create", "positional", None, str),
|
||||||
force=("force overwriting of existing link", "flag", "f", bool))
|
force=("force overwriting of existing link", "flag", "f", bool),
|
||||||
|
)
|
||||||
def link(origin, link_name, force=False, model_path=None):
|
def link(origin, link_name, force=False, model_path=None):
|
||||||
"""
|
"""
|
||||||
Create a symlink for models within the spacy/data directory. Accepts
|
Create a symlink for models within the spacy/data directory. Accepts
|
||||||
either the name of a pip package, or the local path to the model data
|
either the name of a pip package, or the local path to the model data
|
||||||
directory. Linking models allows loading them via spacy.load(link_name).
|
directory. Linking models allows loading them via spacy.load(link_name).
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
if util.is_package(origin):
|
if util.is_package(origin):
|
||||||
model_path = util.get_package_path(origin)
|
model_path = util.get_package_path(origin)
|
||||||
else:
|
else:
|
||||||
model_path = Path(origin) if model_path is None else Path(model_path)
|
model_path = Path(origin) if model_path is None else Path(model_path)
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
prints(Messages.M009.format(path=path2str(model_path)),
|
msg.fail(
|
||||||
title=Messages.M008, exits=1)
|
Messages.M008, Messages.M009.format(path=path2str(model_path)), exits=1
|
||||||
|
)
|
||||||
data_path = util.get_data_path()
|
data_path = util.get_data_path()
|
||||||
if not data_path or not data_path.exists():
|
if not data_path or not data_path.exists():
|
||||||
spacy_loc = Path(__file__).parent.parent
|
spacy_loc = Path(__file__).parent.parent
|
||||||
prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
|
msg.fail(Messages.M010, Messages.M011.format(path=spacy_loc), exits=1)
|
||||||
link_path = util.get_data_path() / link_name
|
link_path = util.get_data_path() / link_name
|
||||||
if link_path.is_symlink() and not force:
|
if link_path.is_symlink() and not force:
|
||||||
prints(Messages.M013, title=Messages.M012.format(name=link_name),
|
msg.fail(Messages.M012.format(name=link_name), Messages.M013, exits=1)
|
||||||
exits=1)
|
|
||||||
elif link_path.is_symlink(): # does a symlink exist?
|
elif link_path.is_symlink(): # does a symlink exist?
|
||||||
# NB: It's important to check for is_symlink here and not for exists,
|
# NB: It's important to check for is_symlink here and not for exists,
|
||||||
# because invalid/outdated symlinks would return False otherwise.
|
# because invalid/outdated symlinks would return False otherwise.
|
||||||
link_path.unlink()
|
link_path.unlink()
|
||||||
elif link_path.exists(): # does it exist otherwise?
|
elif link_path.exists(): # does it exist otherwise?
|
||||||
# NB: Check this last because valid symlinks also "exist".
|
# NB: Check this last because valid symlinks also "exist".
|
||||||
prints(Messages.M015, link_path,
|
msg.fail(Messages.M014.format(name=link_name), Messages.M015, exits=1)
|
||||||
title=Messages.M014.format(name=link_name), exits=1)
|
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||||
msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
|
||||||
try:
|
try:
|
||||||
symlink_to(link_path, model_path)
|
symlink_to(link_path, model_path)
|
||||||
except:
|
except: # noqa: E722
|
||||||
# This is quite dirty, but just making sure other errors are caught.
|
# This is quite dirty, but just making sure other errors are caught.
|
||||||
prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
|
msg.fail(Messages.M016.format(name=link_name), Messages.M017)
|
||||||
|
msg.text(details)
|
||||||
raise
|
raise
|
||||||
prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)
|
msg.good(Messages.M018, details)
|
||||||
|
msg.text(Messages.M019.format(name=link_name))
|
||||||
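A hypothetical usage sketch of what the command above enables (the package and shortcut names are placeholders and assume the package is installed): once the symlink exists, the model loads by its shortcut name.

import spacy
from spacy.cli import link

link("en_core_web_sm", "en_small")   # creates spacy/data/en_small -> installed package
nlp = spacy.load("en_small")         # now resolves through the shortcut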
|
|
|
@ -4,109 +4,106 @@ from __future__ import unicode_literals
|
||||||
import plac
|
import plac
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import Printer, get_raw_input
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str, json_dumps
|
from ..compat import path2str, json_dumps
|
||||||
from ..util import prints
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
input_dir=("directory with model data", "positional", None, str),
|
input_dir=("Directory with model data", "positional", None, str),
|
||||||
output_dir=("output parent directory", "positional", None, str),
|
output_dir=("Output parent directory", "positional", None, str),
|
||||||
meta_path=("path to meta.json", "option", "m", str),
|
meta_path=("Path to meta.json", "option", "m", str),
|
||||||
create_meta=("create meta.json, even if one exists in directory – if "
|
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
|
||||||
"existing meta is found, entries are shown as defaults in "
|
force=("Force overwriting existing model in output directory", "flag", "f", bool),
|
||||||
"the command line prompt", "flag", "c", bool),
|
)
|
||||||
force=("force overwriting of existing model directory in output directory",
|
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||||
"flag", "f", bool))
|
|
||||||
def package(input_dir, output_dir, meta_path=None, create_meta=False,
|
|
||||||
force=False):
|
|
||||||
"""
|
"""
|
||||||
Generate Python package for model data, including meta and required
|
Generate Python package for model data, including meta and required
|
||||||
installation files. A new directory will be created in the specified
|
installation files. A new directory will be created in the specified
|
||||||
output directory, and model data will be copied over.
|
output directory, and model data will be copied over. If --create-meta is
|
||||||
|
set and a meta.json already exists in the output directory, the existing
|
||||||
|
values will be used as the defaults in the command-line prompt.
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
input_path = util.ensure_path(input_dir)
|
input_path = util.ensure_path(input_dir)
|
||||||
output_path = util.ensure_path(output_dir)
|
output_path = util.ensure_path(output_dir)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
if not input_path or not input_path.exists():
|
if not input_path or not input_path.exists():
|
||||||
prints(input_path, title=Messages.M008, exits=1)
|
msg.fail(Messages.M008, input_path, exits=1)
|
||||||
if not output_path or not output_path.exists():
|
if not output_path or not output_path.exists():
|
||||||
prints(output_path, title=Messages.M040, exits=1)
|
msg.fail(Messages.M040, output_path, exits=1)
|
||||||
if meta_path and not meta_path.exists():
|
if meta_path and not meta_path.exists():
|
||||||
prints(meta_path, title=Messages.M020, exits=1)
|
msg.fail(Messages.M020, meta_path, exits=1)
|
||||||
|
|
||||||
meta_path = meta_path or input_path / 'meta.json'
|
meta_path = meta_path or input_path / "meta.json"
|
||||||
if meta_path.is_file():
|
if meta_path.is_file():
|
||||||
meta = util.read_json(meta_path)
|
meta = util.read_json(meta_path)
|
||||||
if not create_meta: # only print this if user doesn't want to overwrite
|
if not create_meta: # only print if user doesn't want to overwrite
|
||||||
prints(meta_path, title=Messages.M041)
|
msg.good(Messages.M041, meta_path)
|
||||||
else:
|
else:
|
||||||
meta = generate_meta(input_dir, meta)
|
meta = generate_meta(input_dir, meta, msg)
|
||||||
meta = validate_meta(meta, ['lang', 'name', 'version'])
|
for key in ("lang", "name", "version"):
|
||||||
model_name = meta['lang'] + '_' + meta['name']
|
if key not in meta or meta[key] == "":
|
||||||
model_name_v = model_name + '-' + meta['version']
|
msg.fail(Messages.M048.format(key=key), Messages.M049, exits=1)
|
||||||
|
model_name = meta["lang"] + "_" + meta["name"]
|
||||||
|
model_name_v = model_name + "-" + meta["version"]
|
||||||
main_path = output_path / model_name_v
|
main_path = output_path / model_name_v
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
|
||||||
create_dirs(package_path, force)
|
|
||||||
shutil.copytree(path2str(input_path),
|
|
||||||
path2str(package_path / model_name_v))
|
|
||||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
|
||||||
create_file(main_path / 'setup.py', TEMPLATE_SETUP)
|
|
||||||
create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
|
|
||||||
create_file(package_path / '__init__.py', TEMPLATE_INIT)
|
|
||||||
prints(main_path, Messages.M043,
|
|
||||||
title=Messages.M042.format(name=model_name_v))
|
|
||||||
|
|
||||||
|
|
||||||
def create_dirs(package_path, force):
|
|
||||||
if package_path.exists():
|
if package_path.exists():
|
||||||
if force:
|
if force:
|
||||||
shutil.rmtree(path2str(package_path))
|
shutil.rmtree(path2str(package_path))
|
||||||
else:
|
else:
|
||||||
prints(package_path, Messages.M045, title=Messages.M044, exits=1)
|
msg.fail(
|
||||||
|
Messages.M044,
|
||||||
|
Messages.M045.format(path=path2str(package_path)),
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
|
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||||
|
create_file(main_path / "meta.json", json_dumps(meta))
|
||||||
|
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||||
|
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||||
|
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||||
|
msg.good(Messages.M042.format(name=model_name_v), main_path)
|
||||||
|
msg.text(Messages.M043)
|
||||||
|
|
||||||
|
|
||||||
def create_file(file_path, contents):
|
def create_file(file_path, contents):
|
||||||
file_path.touch()
|
file_path.touch()
|
||||||
file_path.open('w', encoding='utf-8').write(contents)
|
file_path.open("w", encoding="utf-8").write(contents)
|
||||||
|
|
||||||
|
|
||||||
def generate_meta(model_path, existing_meta):
|
def generate_meta(model_path, existing_meta, msg):
|
||||||
meta = existing_meta or {}
|
meta = existing_meta or {}
|
||||||
settings = [('lang', 'Model language', meta.get('lang', 'en')),
|
settings = [
|
||||||
('name', 'Model name', meta.get('name', 'model')),
|
("lang", "Model language", meta.get("lang", "en")),
|
||||||
('version', 'Model version', meta.get('version', '0.0.0')),
|
("name", "Model name", meta.get("name", "model")),
|
||||||
('spacy_version', 'Required spaCy version',
|
("version", "Model version", meta.get("version", "0.0.0")),
|
||||||
'>=%s,<3.0.0' % about.__version__),
|
("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
|
||||||
('description', 'Model description',
|
("description", "Model description", meta.get("description", False)),
|
||||||
meta.get('description', False)),
|
("author", "Author", meta.get("author", False)),
|
||||||
('author', 'Author', meta.get('author', False)),
|
("email", "Author email", meta.get("email", False)),
|
||||||
('email', 'Author email', meta.get('email', False)),
|
("url", "Author website", meta.get("url", False)),
|
||||||
('url', 'Author website', meta.get('url', False)),
|
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
||||||
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
|
]
|
||||||
nlp = util.load_model_from_path(Path(model_path))
|
nlp = util.load_model_from_path(Path(model_path))
|
||||||
meta['pipeline'] = nlp.pipe_names
|
meta["pipeline"] = nlp.pipe_names
|
||||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
meta["vectors"] = {
|
||||||
'vectors': len(nlp.vocab.vectors),
|
"width": nlp.vocab.vectors_length,
|
||||||
'keys': nlp.vocab.vectors.n_keys}
|
"vectors": len(nlp.vocab.vectors),
|
||||||
prints(Messages.M047, title=Messages.M046)
|
"keys": nlp.vocab.vectors.n_keys,
|
||||||
|
}
|
||||||
|
msg.divider(Messages.M046)
|
||||||
|
msg.text(Messages.M047)
|
||||||
for setting, desc, default in settings:
|
for setting, desc, default in settings:
|
||||||
response = util.get_raw_input(desc, default)
|
response = get_raw_input(desc, default)
|
||||||
meta[setting] = default if response == '' and default else response
|
meta[setting] = default if response == "" and default else response
|
||||||
if about.__title__ != 'spacy':
|
if about.__title__ != "spacy":
|
||||||
meta['parent_package'] = about.__title__
|
meta["parent_package"] = about.__title__
|
||||||
return meta
|
|
||||||
|
|
||||||
|
|
||||||
def validate_meta(meta, keys):
|
|
||||||
for key in keys:
|
|
||||||
if key not in meta or meta[key] == '':
|
|
||||||
prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
|
|
||||||
return meta
|
return meta
|
||||||
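For reference, a minimal meta dictionary that satisfies the lang/name/version check above (all values are placeholders):

meta = {
    "lang": "en",                 # two-letter language code
    "name": "example_model",      # full package name becomes en_example_model
    "version": "0.0.1",
    "description": "Placeholder description",
    "license": "CC BY-SA 3.0",
}
for key in ("lang", "name", "version"):
    assert meta.get(key), "required meta field missing: %s" % key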
|
|
||||||
|
|
||||||
|
|
|
@ -1,66 +1,148 @@
|
||||||
'''This script is experimental.
|
# coding: utf8
|
||||||
|
|
||||||
Try pre-training the CNN component of the text categorizer using a cheap
|
|
||||||
language modelling-like objective. Specifically, we load pre-trained vectors
|
|
||||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
|
||||||
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
|
|
||||||
we're not merely doing compression here, because heavy dropout is applied,
|
|
||||||
including over the input words. This means the model must often (50% of the time)
|
|
||||||
use the context in order to predict the word.
|
|
||||||
|
|
||||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
|
||||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
|
||||||
pre-train with the development data, but also not *so* terrible: we're not using
|
|
||||||
the development labels, after all --- only the unlabelled text.
|
|
||||||
'''
|
|
||||||
from __future__ import print_function, unicode_literals
|
from __future__ import print_function, unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
import numpy
|
import numpy
|
||||||
import time
|
import time
|
||||||
import ujson as json
|
import ujson
|
||||||
from pathlib import Path
|
|
||||||
import sys
|
import sys
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
import spacy
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.attrs import ID, HEAD
|
|
||||||
from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
|
|
||||||
from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
|
||||||
from thinc.v2v import Affine, Maxout
|
from thinc.v2v import Affine, Maxout
|
||||||
from thinc.api import wrap
|
from thinc.api import wrap
|
||||||
from thinc.misc import LayerNorm as LN
|
from thinc.misc import LayerNorm as LN
|
||||||
|
from thinc.neural.util import prefer_gpu
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
|
from ..tokens import Doc
|
||||||
|
from ..attrs import ID, HEAD
|
||||||
|
from ..compat import json_dumps
|
||||||
|
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
||||||
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
def prefer_gpu():
|
@plac.annotations(
|
||||||
used = spacy.util.use_gpu(0)
|
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
|
||||||
if used is None:
|
vectors_model=("Name or path to vectors model to learn from"),
|
||||||
return False
|
output_dir=("Directory to write models each epoch", "positional", None, str),
|
||||||
else:
|
width=("Width of CNN layers", "option", "cw", int),
|
||||||
import cupy.random
|
depth=("Depth of CNN layers", "option", "cd", int),
|
||||||
cupy.random.seed(0)
|
embed_rows=("Embedding rows", "option", "er", int),
|
||||||
return True
|
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||||
|
dropout=("Dropout", "option", "d", float),
|
||||||
|
seed=("Seed for random number generators", "option", "s", float),
|
||||||
|
nr_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||||
|
)
|
||||||
|
def pretrain(
|
||||||
|
texts_loc,
|
||||||
|
vectors_model,
|
||||||
|
output_dir,
|
||||||
|
width=96,
|
||||||
|
depth=4,
|
||||||
|
embed_rows=2000,
|
||||||
|
use_vectors=False,
|
||||||
|
dropout=0.2,
|
||||||
|
nr_iter=1000,
|
||||||
|
seed=0,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||||
|
using an approximate language-modelling objective. Specifically, we load
|
||||||
|
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
||||||
|
vectors which match the pre-trained ones. The weights are saved to a directory
|
||||||
|
after each epoch. You can then pass a path to one of these pre-trained weights
|
||||||
|
files to the 'spacy train' command.
|
||||||
|
|
||||||
|
This technique may be especially helpful if you have little labelled data.
|
||||||
|
However, it's still quite experimental, so your mileage may vary.
|
||||||
|
|
||||||
def load_texts(path):
|
To load the weights back in during 'spacy train', you need to ensure
|
||||||
'''Load inputs from a jsonl file.
|
all settings are the same between pretraining and training. The API and
|
||||||
|
errors around this need some improvement.
|
||||||
Each line should be a dict like {"text": "..."}
|
"""
|
||||||
'''
|
config = dict(locals())
|
||||||
path = ensure_path(path)
|
msg = Printer()
|
||||||
with path.open('r', encoding='utf8') as file_:
|
util.fix_random_seed(seed)
|
||||||
texts = [json.loads(line) for line in file_]
|
|
||||||
random.shuffle(texts)
|
has_gpu = prefer_gpu()
|
||||||
return texts
|
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||||
|
|
||||||
|
output_dir = Path(output_dir)
|
||||||
|
if not output_dir.exists():
|
||||||
|
output_dir.mkdir()
|
||||||
|
msg.good("Created output directory")
|
||||||
|
util.write_json(output_dir / "config.json", config)
|
||||||
|
msg.good("Saved settings to config.json")
|
||||||
|
|
||||||
|
# Load texts from file or stdin
|
||||||
|
if texts_loc != "-": # reading from a file
|
||||||
|
texts_loc = Path(texts_loc)
|
||||||
|
if not texts_loc.exists():
|
||||||
|
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
||||||
|
with msg.loading("Loading input texts..."):
|
||||||
|
texts = list(util.read_jsonl(texts_loc))
|
||||||
|
msg.good("Loaded input texts")
|
||||||
|
random.shuffle(texts)
|
||||||
|
else: # reading from stdin
|
||||||
|
msg.text("Reading input text from stdin...")
|
||||||
|
texts = stream_texts()
|
||||||
|
|
||||||
|
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
||||||
|
nlp = util.load_model(vectors_model)
|
||||||
|
msg.good("Loaded model '{}'".format(vectors_model))
|
||||||
|
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
||||||
|
model = create_pretraining_model(
|
||||||
|
nlp,
|
||||||
|
Tok2Vec(
|
||||||
|
width,
|
||||||
|
embed_rows,
|
||||||
|
conv_depth=depth,
|
||||||
|
pretrained_vectors=pretrained_vectors,
|
||||||
|
bilstm_depth=0, # Requires PyTorch. Experimental.
|
||||||
|
cnn_maxout_pieces=2, # You can try setting this higher
|
||||||
|
subword_features=True,
|
||||||
|
),
|
||||||
|
) # subword_features (above): set to False for character models, e.g. Chinese
|
||||||
|
optimizer = create_default_optimizer(model.ops)
|
||||||
|
tracker = ProgressTracker()
|
||||||
|
msg.divider("Pre-training tok2vec layer")
|
||||||
|
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||||
|
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||||
|
for epoch in range(nr_iter):
|
||||||
|
for batch in util.minibatch_by_words(
|
||||||
|
((text, None) for text in texts), size=5000
|
||||||
|
):
|
||||||
|
docs = make_docs(nlp, [text for (text, _) in batch])
|
||||||
|
loss = make_update(model, docs, optimizer, drop=dropout)
|
||||||
|
progress = tracker.update(epoch, loss, docs)
|
||||||
|
if progress:
|
||||||
|
msg.row(progress, **row_settings)
|
||||||
|
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||||
|
break
|
||||||
|
with model.use_params(optimizer.averages):
|
||||||
|
with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
|
||||||
|
file_.write(model.tok2vec.to_bytes())
|
||||||
|
log = {
|
||||||
|
"nr_word": tracker.nr_word,
|
||||||
|
"loss": tracker.loss,
|
||||||
|
"epoch_loss": tracker.epoch_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
}
|
||||||
|
with (output_dir / "log.jsonl").open("a") as file_:
|
||||||
|
file_.write(json_dumps(log) + "\n")
|
||||||
|
tracker.epoch_loss = 0.0
|
||||||
|
if texts_loc != "-":
|
||||||
|
# Reshuffle the texts if texts were loaded from a file
|
||||||
|
random.shuffle(texts)
|
||||||
|
|
||||||
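A hedged end-to-end sketch of feeding the command above. The file name, model name and iteration count are invented, and it assumes pretrain is importable from spacy.cli like the other commands and that the vectors model is installed.

import ujson
from spacy.cli import pretrain

# One {"text": ...} object per line -- the shape util.read_jsonl() expects
with open("texts.jsonl", "w") as f:
    for text in ["This is a raw sentence.", "Another unlabelled text."]:
        f.write(ujson.dumps({"text": text}) + "\n")

# Roughly: python -m spacy pretrain texts.jsonl en_vectors_web_lg ./pretrain_out -i 2
pretrain("texts.jsonl", "en_vectors_web_lg", "./pretrain_out", nr_iter=2)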
|
|
||||||
def stream_texts():
|
def stream_texts():
|
||||||
for line in sys.stdin:
|
for line in sys.stdin:
|
||||||
yield json.loads(line)
|
yield ujson.loads(line)
|
||||||
|
|
||||||
|
|
||||||
def make_update(model, docs, optimizer, drop=0.):
|
def make_update(model, docs, optimizer, drop=0.0):
|
||||||
"""Perform an update over a single batch of documents.
|
"""Perform an update over a single batch of documents.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@ -74,7 +156,7 @@ def make_update(model, docs, optimizer, drop=0.):
|
||||||
# Don't want to return a cupy object here
|
# Don't want to return a cupy object here
|
||||||
# The gradients are modified in-place by the BERT MLM,
|
# The gradients are modified in-place by the BERT MLM,
|
||||||
# so we get an accurate loss
|
# so we get an accurate loss
|
||||||
loss = float((gradients**2).mean())
|
loss = float((gradients ** 2).mean())
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
|
|
||||||
|
@ -98,7 +180,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
|
||||||
|
|
||||||
def get_vectors_loss(ops, docs, prediction):
|
def get_vectors_loss(ops, docs, prediction):
|
||||||
"""Compute a mean-squared error loss between the documents' vectors and
|
"""Compute a mean-squared error loss between the documents' vectors and
|
||||||
the prediction.
|
the prediction.
|
||||||
|
|
||||||
Note that this is ripe for customization! We could compute the vectors
|
Note that this is ripe for customization! We could compute the vectors
|
||||||
in some other way, e.g. with an LSTM language model, or use some other
|
in some other way, e.g. with an LSTM language model, or use some other
|
||||||
|
@ -115,43 +197,40 @@ def get_vectors_loss(ops, docs, prediction):
|
||||||
|
|
||||||
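A minimal sketch of the objective get_vectors_loss() describes: mean-squared error between each token's pretrained vector and the model's prediction. The shapes and plain-numpy arrays are assumptions for illustration; the real function gathers the target vectors from the docs.

import numpy

def mse_vectors_loss(target, prediction):
    # target, prediction: float arrays of shape (n_tokens, width)
    d_target = prediction - target        # gradient of the squared error
    loss = (d_target ** 2).sum()
    return loss, d_target

target = numpy.zeros((4, 3), dtype="f")
prediction = numpy.ones((4, 3), dtype="f")
loss, grad = mse_vectors_loss(target, prediction)   # loss == 12.0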
|
|
||||||
def create_pretraining_model(nlp, tok2vec):
|
def create_pretraining_model(nlp, tok2vec):
|
||||||
'''Define a network for the pretraining. We simply add an output layer onto
|
"""Define a network for the pretraining. We simply add an output layer onto
|
||||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||||
Each array in the output needs to have one row per token in the doc.
|
Each array in the output needs to have one row per token in the doc.
|
||||||
'''
|
"""
|
||||||
output_size = nlp.vocab.vectors.data.shape[1]
|
output_size = nlp.vocab.vectors.data.shape[1]
|
||||||
output_layer = chain(
|
output_layer = chain(
|
||||||
LN(Maxout(300, pieces=3)),
|
LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0))
|
||||||
zero_init(Affine(output_size, drop_factor=0.0))
|
|
||||||
)
|
)
|
||||||
# This is annoying, but the parser etc have the flatten step after
|
# This is annoying, but the parser etc have the flatten step after
|
||||||
# the tok2vec. To load the weights in cleanly, we need to match
|
# the tok2vec. To load the weights in cleanly, we need to match
|
||||||
# the shape of the models' components exactly. So what we call
|
# the shape of the models' components exactly. So what we call
|
||||||
# "tok2vec" has to be the same set of processes as what the components do.
|
# "tok2vec" has to be the same set of processes as what the components do.
|
||||||
tok2vec = chain(tok2vec, flatten)
|
tok2vec = chain(tok2vec, flatten)
|
||||||
model = chain(
|
model = chain(tok2vec, output_layer)
|
||||||
tok2vec,
|
|
||||||
output_layer
|
|
||||||
)
|
|
||||||
model = masked_language_model(nlp.vocab, model)
|
model = masked_language_model(nlp.vocab, model)
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
model.output_layer = output_layer
|
model.output_layer = output_layer
|
||||||
model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
|
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def masked_language_model(vocab, model, mask_prob=0.15):
|
def masked_language_model(vocab, model, mask_prob=0.15):
|
||||||
'''Convert a model into a BERT-style masked language model'''
|
"""Convert a model into a BERT-style masked language model"""
|
||||||
|
|
||||||
random_words = RandomWords(vocab)
|
random_words = RandomWords(vocab)
|
||||||
def mlm_forward(docs, drop=0.):
|
|
||||||
|
def mlm_forward(docs, drop=0.0):
|
||||||
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
|
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
|
||||||
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
|
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
|
||||||
output, backprop = model.begin_update(docs, drop=drop)
|
output, backprop = model.begin_update(docs, drop=drop)
|
||||||
|
|
||||||
def mlm_backward(d_output, sgd=None):
|
def mlm_backward(d_output, sgd=None):
|
||||||
d_output *= 1-mask
|
d_output *= 1 - mask
|
||||||
return backprop(d_output, sgd=sgd)
|
return backprop(d_output, sgd=sgd)
|
||||||
|
|
||||||
return output, mlm_backward
|
return output, mlm_backward
|
||||||
|
@ -161,7 +240,7 @@ def masked_language_model(vocab, model, mask_prob=0.15):
|
||||||
|
|
||||||
def apply_mask(docs, random_words, mask_prob=0.15):
|
def apply_mask(docs, random_words, mask_prob=0.15):
|
||||||
N = sum(len(doc) for doc in docs)
|
N = sum(len(doc) for doc in docs)
|
||||||
mask = numpy.random.uniform(0., 1.0, (N,))
|
mask = numpy.random.uniform(0.0, 1.0, (N,))
|
||||||
mask = mask >= mask_prob
|
mask = mask >= mask_prob
|
||||||
i = 0
|
i = 0
|
||||||
masked_docs = []
|
masked_docs = []
|
||||||
|
@ -184,7 +263,7 @@ def apply_mask(docs, random_words, mask_prob=0.15):
|
||||||
return mask, masked_docs
|
return mask, masked_docs
|
||||||
|
|
||||||
|
|
||||||
def replace_word(word, random_words, mask='[MASK]'):
|
def replace_word(word, random_words, mask="[MASK]"):
|
||||||
roll = random.random()
|
roll = random.random()
|
||||||
if roll < 0.8:
|
if roll < 0.8:
|
||||||
return mask
|
return mask
|
||||||
|
@ -193,23 +272,25 @@ def replace_word(word, random_words, mask='[MASK]'):
|
||||||
else:
|
else:
|
||||||
return word
|
return word
|
||||||
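A sketch of the replacement policy replace_word() implements. The exact probability split sits in the elided hunk above, so the usual BERT-style 80/10/10 scheme is assumed here rather than quoted from the source.

import random

def sketch_replace(word, random_word, mask="[MASK]"):
    roll = random.random()
    if roll < 0.8:
        return mask          # mostly: substitute the mask symbol
    elif roll < 0.9:
        return random_word   # sometimes: substitute a random vocabulary word
    else:
        return word          # otherwise: keep the original word unchanged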
|
|
||||||
|
|
||||||
class RandomWords(object):
|
class RandomWords(object):
|
||||||
def __init__(self, vocab):
|
def __init__(self, vocab):
|
||||||
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
||||||
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
||||||
self.words = self.words[:10000]
|
self.words = self.words[:10000]
|
||||||
self.probs = self.probs[:10000]
|
self.probs = self.probs[:10000]
|
||||||
self.probs = numpy.exp(numpy.array(self.probs, dtype='f'))
|
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
|
||||||
self.probs /= self.probs.sum()
|
self.probs /= self.probs.sum()
|
||||||
self._cache = []
|
self._cache = []
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
if not self._cache:
|
if not self._cache:
|
||||||
self._cache.extend(numpy.random.choice(len(self.words), 10000,
|
self._cache.extend(
|
||||||
p=self.probs))
|
numpy.random.choice(len(self.words), 10000, p=self.probs)
|
||||||
|
)
|
||||||
index = self._cache.pop()
|
index = self._cache.pop()
|
||||||
return self.words[index]
|
return self.words[index]
|
||||||
|
|
||||||
|
|
||||||
class ProgressTracker(object):
|
class ProgressTracker(object):
|
||||||
def __init__(self, frequency=1000000):
|
def __init__(self, frequency=1000000):
|
||||||
|
@ -245,76 +326,3 @@ class ProgressTracker(object):
|
||||||
return status
|
return status
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
|
|
||||||
vectors_model=("Name or path to vectors model to learn from"),
|
|
||||||
output_dir=("Directory to write models each epoch", "positional", None, str),
|
|
||||||
width=("Width of CNN layers", "option", "cw", int),
|
|
||||||
depth=("Depth of CNN layers", "option", "cd", int),
|
|
||||||
embed_rows=("Embedding rows", "option", "er", int),
|
|
||||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
|
||||||
dropout=("Dropout", "option", "d", float),
|
|
||||||
seed=("Seed for random number generators", "option", "s", float),
|
|
||||||
nr_iter=("Number of iterations to pretrain", "option", "i", int),
|
|
||||||
)
|
|
||||||
def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4,
|
|
||||||
embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0):
|
|
||||||
"""
|
|
||||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
|
||||||
using an approximate language-modelling objective. Specifically, we load
|
|
||||||
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
|
||||||
vectors which match the pre-trained ones. The weights are saved to a directory
|
|
||||||
after each epoch. You can then pass a path to one of these pre-trained weights
|
|
||||||
files to the 'spacy train' command.
|
|
||||||
|
|
||||||
This technique may be especially helpful if you have little labelled data.
|
|
||||||
However, it's still quite experimental, so your mileage may vary.
|
|
||||||
|
|
||||||
To load the weights back in during 'spacy train', you need to ensure
|
|
||||||
all settings are the same between pretraining and training. The API and
|
|
||||||
errors around this need some improvement.
|
|
||||||
"""
|
|
||||||
config = dict(locals())
|
|
||||||
output_dir = ensure_path(output_dir)
|
|
||||||
random.seed(seed)
|
|
||||||
numpy.random.seed(seed)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
with (output_dir / 'config.json').open('w') as file_:
|
|
||||||
file_.write(json.dumps(config))
|
|
||||||
has_gpu = prefer_gpu()
|
|
||||||
print("Use GPU?", has_gpu)
|
|
||||||
nlp = spacy.load(vectors_model)
|
|
||||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
|
||||||
model = create_pretraining_model(nlp,
|
|
||||||
Tok2Vec(width, embed_rows,
|
|
||||||
conv_depth=depth,
|
|
||||||
pretrained_vectors=pretrained_vectors,
|
|
||||||
bilstm_depth=0, # Requires PyTorch. Experimental.
|
|
||||||
cnn_maxout_pieces=2, # You can try setting this higher
|
|
||||||
subword_features=True)) # Set to False for character models, e.g. Chinese
|
|
||||||
optimizer = create_default_optimizer(model.ops)
|
|
||||||
tracker = ProgressTracker()
|
|
||||||
print('Epoch', '#Words', 'Loss', 'w/s')
|
|
||||||
texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
|
|
||||||
for epoch in range(nr_iter):
|
|
||||||
for batch in minibatch_by_words(((text, None) for text in texts), size=5000):
|
|
||||||
docs = make_docs(nlp, [text for (text, _) in batch])
|
|
||||||
loss = make_update(model, docs, optimizer, drop=dropout)
|
|
||||||
progress = tracker.update(epoch, loss, docs)
|
|
||||||
if progress:
|
|
||||||
print(*progress)
|
|
||||||
if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
|
|
||||||
break
|
|
||||||
with model.use_params(optimizer.averages):
|
|
||||||
with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
|
|
||||||
file_.write(model.tok2vec.to_bytes())
|
|
||||||
with (output_dir / 'log.jsonl').open('a') as file_:
|
|
||||||
file_.write(json.dumps({'nr_word': tracker.nr_word,
|
|
||||||
'loss': tracker.loss, 'epoch_loss': tracker.epoch_loss,
|
|
||||||
'epoch': epoch}) + '\n')
|
|
||||||
tracker.epoch_loss = 0.0
|
|
||||||
if texts_loc != '-':
|
|
||||||
texts = load_texts(texts_loc)
|
|
||||||
|
|
|
@ -6,45 +6,64 @@ from pathlib import Path
|
||||||
import ujson
|
import ujson
|
||||||
import cProfile
|
import cProfile
|
||||||
import pstats
|
import pstats
|
||||||
|
|
||||||
import spacy
|
|
||||||
import sys
|
import sys
|
||||||
import tqdm
|
import tqdm
|
||||||
import cytoolz
|
import cytoolz
|
||||||
import thinc.extra.datasets
|
import thinc.extra.datasets
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
|
from ..util import load_model
|
||||||
def read_inputs(loc):
|
|
||||||
if loc is None:
|
|
||||||
file_ = sys.stdin
|
|
||||||
file_ = (line.encode('utf8') for line in file_)
|
|
||||||
else:
|
|
||||||
file_ = Path(loc).open()
|
|
||||||
for line in file_:
|
|
||||||
data = ujson.loads(line)
|
|
||||||
text = data['text']
|
|
||||||
yield text
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model/language", "positional", None, str),
|
model=("Model to load", "positional", None, str),
|
||||||
inputs=("Location of input file", "positional", None, read_inputs))
|
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
|
||||||
def profile(lang, inputs=None):
|
n_texts=("Maximum number of texts to use if available", "option", "n", int),
|
||||||
|
)
|
||||||
|
def profile(model, inputs=None, n_texts=10000):
|
||||||
"""
|
"""
|
||||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||||
|
Input should be formatted as one JSON object per line with a key "text".
|
||||||
|
It can either be provided as a JSONL file, or be read from sys.stdin.
|
||||||
|
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
|
if inputs is not None:
|
||||||
|
inputs = _read_inputs(inputs, msg)
|
||||||
if inputs is None:
|
if inputs is None:
|
||||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
n_inputs = 25000
|
||||||
inputs, _ = zip(*imdb_train)
|
with msg.loading("Loading IMDB dataset via Thinc..."):
|
||||||
inputs = inputs[:25000]
|
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||||
nlp = spacy.load(lang)
|
inputs, _ = zip(*imdb_train)
|
||||||
texts = list(cytoolz.take(10000, inputs))
|
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
|
||||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
inputs = inputs[:n_inputs]
|
||||||
"Profile.prof")
|
with msg.loading("Loading model '{}'...".format(model)):
|
||||||
|
nlp = load_model(model)
|
||||||
|
msg.good("Loaded model '{}'".format(model))
|
||||||
|
texts = list(cytoolz.take(n_texts, inputs))
|
||||||
|
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||||
s = pstats.Stats("Profile.prof")
|
s = pstats.Stats("Profile.prof")
|
||||||
|
msg.divider("Profile stats")
|
||||||
s.strip_dirs().sort_stats("time").print_stats()
|
s.strip_dirs().sort_stats("time").print_stats()
|
||||||
|
|
||||||
|
|
||||||
def parse_texts(nlp, texts):
|
def parse_texts(nlp, texts):
|
||||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||||
pass
|
pass
|
||||||
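A hypothetical invocation of the command above (the model and file names are placeholders; it assumes profile is importable from spacy.cli). The input is a JSONL file of {"text": ...} objects, "-" for stdin, or nothing at all to fall back to the IMDB sample loaded via Thinc.

from spacy.cli import profile

# Same idea as: python -m spacy profile en_core_web_sm texts.jsonl -n 1000
profile("en_core_web_sm", "texts.jsonl", n_texts=1000)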
|
|
||||||
|
|
||||||
|
def _read_inputs(loc, msg):
|
||||||
|
if loc == "-":
|
||||||
|
msg.info("Reading input from sys.stdin")
|
||||||
|
file_ = sys.stdin
|
||||||
|
file_ = (line.encode("utf8") for line in file_)
|
||||||
|
else:
|
||||||
|
input_path = Path(loc)
|
||||||
|
if not input_path.exists() or not input_path.is_file():
|
||||||
|
msg.fail("Not a valid input data file", loc, exits=1)
|
||||||
|
msg.info("Using data from {}".format(input_path.parts[-1]))
|
||||||
|
file_ = input_path.open()
|
||||||
|
for line in file_:
|
||||||
|
data = ujson.loads(line)
|
||||||
|
text = data["text"]
|
||||||
|
yield text
|
||||||
|
|
51
spacy/cli/schemas/__init__.py
Normal file
51
spacy/cli/schemas/__init__.py
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from jsonschema import Draft4Validator
|
||||||
|
|
||||||
|
from ...errors import Errors
|
||||||
|
from ...util import read_json
|
||||||
|
|
||||||
|
|
||||||
|
SCHEMAS = {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_schema(name):
|
||||||
|
"""Get the JSON schema for a given name. Looks for a .json file in
|
||||||
|
spacy.cli.schemas, validates the schema and raises ValueError if not found.
|
||||||
|
|
||||||
|
EXAMPLE:
|
||||||
|
>>> schema = get_schema('training')
|
||||||
|
|
||||||
|
name (unicode): The name of the schema.
|
||||||
|
RETURNS (dict): The JSON schema.
|
||||||
|
"""
|
||||||
|
if name not in SCHEMAS:
|
||||||
|
schema_path = Path(__file__).parent / "{}.json".format(name)
|
||||||
|
if not schema_path.exists():
|
||||||
|
raise ValueError(Errors.E104.format(name=name))
|
||||||
|
schema = read_json(schema_path)
|
||||||
|
# TODO: replace with (stable) Draft6Validator, if available
|
||||||
|
validator = Draft4Validator(schema)
|
||||||
|
validator.check_schema(schema)
|
||||||
|
SCHEMAS[name] = schema
|
||||||
|
return SCHEMAS[name]
|
||||||
|
|
||||||
|
|
||||||
|
def validate_json(data, schema):
|
||||||
|
"""Validate data against a given JSON schema (see https://json-schema.org).
|
||||||
|
|
||||||
|
data: JSON-serializable data to validate.
|
||||||
|
schema (dict): The JSON schema.
|
||||||
|
RETURNS (list): A list of error messages, if available.
|
||||||
|
"""
|
||||||
|
validator = Draft4Validator(schema)
|
||||||
|
errors = []
|
||||||
|
for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
|
||||||
|
if err.path:
|
||||||
|
err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
|
||||||
|
else:
|
||||||
|
err_path = ""
|
||||||
|
errors.append(err.message + " " + err_path)
|
||||||
|
return errors
|
128
spacy/cli/schemas/meta.json
Normal file
128
spacy/cli/schemas/meta.json
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-06/schema",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"lang": {
|
||||||
|
"title": "Two-letter language code, e.g. 'en'",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 2,
|
||||||
|
"maxLength": 2,
|
||||||
|
"pattern": "^[a-z]*$"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"title": "Model name",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[a-z_]*$"
|
||||||
|
},
|
||||||
|
"version": {
|
||||||
|
"title": "Model version",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[0-9a-z.-]*$"
|
||||||
|
},
|
||||||
|
"spacy_version": {
|
||||||
|
"title": "Compatible spaCy version identifier",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[0-9a-z.-><=]*$"
|
||||||
|
},
|
||||||
|
"parent_package": {
|
||||||
|
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"default": "spacy"
|
||||||
|
},
|
||||||
|
"pipeline": {
|
||||||
|
"title": "Names of pipeline components",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": {
|
||||||
|
"title": "Model description",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"license": {
|
||||||
|
"title": "Model license",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"author": {
|
||||||
|
"title": "Model author name",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"email": {
|
||||||
|
"title": "Model author email",
|
||||||
|
"type": "string",
|
||||||
|
"format": "email"
|
||||||
|
},
|
||||||
|
"url": {
|
||||||
|
"title": "Model author URL",
|
||||||
|
"type": "string",
|
||||||
|
"format": "uri"
|
||||||
|
},
|
||||||
|
"sources": {
|
||||||
|
"title": "Training data sources",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"vectors": {
|
||||||
|
"title": "Included word vectors",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"keys": {
|
||||||
|
"title": "Number of unique keys",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"vectors": {
|
||||||
|
"title": "Number of unique vectors",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"width": {
|
||||||
|
"title": "Number of dimensions",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"accuracy": {
|
||||||
|
"title": "Accuracy numbers",
|
||||||
|
"type": "object",
|
||||||
|
"patternProperties": {
|
||||||
|
"*": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"speed": {
|
||||||
|
"title": "Speed evaluation numbers",
|
||||||
|
"type": "object",
|
||||||
|
"patternProperties": {
|
||||||
|
"*": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"lang",
|
||||||
|
"name",
|
||||||
|
"version"
|
||||||
|
]
|
||||||
|
}
|
146
spacy/cli/schemas/training.json
Normal file
146
spacy/cli/schemas/training.json
Normal file
|
@ -0,0 +1,146 @@
|
||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-06/schema",
|
||||||
|
"title": "Training data for spaCy models",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"text": {
|
||||||
|
"title": "The text of the training example",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"ents": {
|
||||||
|
"title": "Named entity spans in the text",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"start": {
|
||||||
|
"title": "Start character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"title": "End character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"label": {
|
||||||
|
"title": "Entity label",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[A-Z0-9]*$"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"start",
|
||||||
|
"end",
|
||||||
|
"label"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"sents": {
|
||||||
|
"title": "Sentence spans in the text",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"start": {
|
||||||
|
"title": "Start character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"title": "End character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"start",
|
||||||
|
"end"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cats": {
|
||||||
|
"title": "Text categories for the text classifier",
|
||||||
|
"type": "object",
|
||||||
|
"patternProperties": {
|
||||||
|
"*": {
|
||||||
|
"title": "A text category",
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"propertyNames": {
|
||||||
|
"pattern": "^[A-Z0-9]*$",
|
||||||
|
"minLength": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tokens": {
|
||||||
|
"title": "The tokens in the text",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"minProperties": 1,
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"title": "Token ID, usually token index",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"start": {
|
||||||
|
"title": "Start character offset of the token",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"title": "End character offset of the token",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"pos": {
|
||||||
|
"title": "Coarse-grained part-of-speech tag",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"tag": {
|
||||||
|
"title": "Fine-grained part-of-speech tag",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"dep": {
|
||||||
|
"title": "Dependency label",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"head": {
|
||||||
|
"title": "Index of the token's head",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"start",
|
||||||
|
"end"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"_": {
|
||||||
|
"title": "Custom user space",
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"text"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
|
@ -6,213 +6,296 @@ from pathlib import Path
|
||||||
import tqdm
|
import tqdm
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import json
|
|
||||||
import shutil
|
import shutil
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
|
from .._ml import create_default_optimizer
|
||||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||||
from ..gold import GoldCorpus
|
from ..gold import GoldCorpus
|
||||||
from ..util import prints, minibatch, minibatch_by_words
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
from .. import displacy
|
|
||||||
from ..compat import json_dumps
|
|
||||||
|
# Take dropout and batch size as generators of values -- dropout
|
||||||
|
# starts high and decays sharply, to force the optimizer to explore.
|
||||||
|
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||||
|
# at the beginning of training.
|
||||||
|
dropout_rates = util.decaying(
|
||||||
|
util.env_opt("dropout_from", 0.2),
|
||||||
|
util.env_opt("dropout_to", 0.2),
|
||||||
|
util.env_opt("dropout_decay", 0.0),
|
||||||
|
)
|
||||||
|
batch_sizes = util.compounding(
|
||||||
|
util.env_opt("batch_from", 1000),
|
||||||
|
util.env_opt("batch_to", 1000),
|
||||||
|
util.env_opt("batch_compound", 1.001),
|
||||||
|
)
|
||||||
|
|
||||||
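A small sketch of how the two hyper-parameter generators above are consumed: one value is drawn at a time during training, and util.env_opt lets each setting be overridden through an environment variable. The decay and compounding rules themselves live in spacy.util and are not shown in this hunk.

from spacy import util

dropout_rates = util.decaying(0.2, 0.2, 0.0)
batch_sizes = util.compounding(1000, 1000, 1.001)
dropout = next(dropout_rates)     # 0.2 with these defaults
batch_size = next(batch_sizes)    # 1000 with these defaults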
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model language", "positional", None, str),
|
lang=("Model language", "positional", None, str),
|
||||||
output_dir=("output directory to store model in", "positional", None, str),
|
output_path=("Output directory to store model in", "positional", None, Path),
|
||||||
train_data=("location of JSON-formatted training data", "positional",
|
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
||||||
None, str),
|
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
||||||
dev_data=("location of JSON-formatted development data (optional)",
|
base_model=("Name of model to update (optional)", "option", "b", str),
|
||||||
"positional", None, str),
|
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
|
||||||
n_iter=("number of iterations", "option", "n", int),
|
vectors=("Model to load vectors from", "option", "v", str),
|
||||||
n_sents=("number of sentences", "option", "ns", int),
|
n_iter=("Number of iterations", "option", "n", int),
|
||||||
|
n_examples=("Number of examples", "option", "ns", int),
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
use_gpu=("Use GPU", "option", "g", int),
|
||||||
vectors=("Model to load vectors from", "option", "v"),
|
|
||||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
|
||||||
no_parser=("Don't train parser", "flag", "P", bool),
|
|
||||||
no_entities=("Don't train NER", "flag", "N", bool),
|
|
||||||
parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
|
|
||||||
noise_level=("Amount of corruption to add for data augmentation", "option", "nl", float),
|
|
||||||
entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
|
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
|
||||||
version=("Model version", "option", "V", str),
|
version=("Model version", "option", "V", str),
|
||||||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
|
||||||
"overwritten.", "option", "m", Path),
|
init_tok2vec=(
|
||||||
init_tok2vec=("Path to pretrained weights for the token-to-vector parts "
|
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
|
||||||
"of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
|
"option",
|
||||||
verbose=("Display more information for debug", "option", None, bool))
|
"t2v",
|
||||||
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
Path,
|
||||||
parser_multitasks='', entity_multitasks='', init_tok2vec=None,
|
),
|
||||||
use_gpu=-1, vectors=None, no_tagger=False, noise_level=0.0,
|
parser_multitasks=(
|
||||||
no_parser=False, no_entities=False, gold_preproc=False,
|
"Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'",
|
||||||
version="0.0.0", meta_path=None, verbose=False):
|
"option",
|
||||||
|
"pt",
|
||||||
|
str,
|
||||||
|
),
|
||||||
|
entity_multitasks=(
|
||||||
|
"Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'",
|
||||||
|
"option",
|
||||||
|
"et",
|
||||||
|
str,
|
||||||
|
),
|
||||||
|
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
|
||||||
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
|
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
|
||||||
|
verbose=("Display more information for debug", "flag", "VV", bool),
|
||||||
|
debug=("Run data diagnostics before training", "flag", "D", bool),
|
||||||
|
)
|
||||||
|
def train(
|
||||||
|
lang,
|
||||||
|
output_path,
|
||||||
|
train_path,
|
||||||
|
dev_path,
|
||||||
|
base_model=None,
|
||||||
|
pipeline="tagger,parser,ner",
|
||||||
|
vectors=None,
|
||||||
|
n_iter=30,
|
||||||
|
n_examples=0,
|
||||||
|
use_gpu=-1,
|
||||||
|
version="0.0.0",
|
||||||
|
meta_path=None,
|
||||||
|
init_tok2vec=None,
|
||||||
|
parser_multitasks="",
|
||||||
|
entity_multitasks="",
|
||||||
|
noise_level=0.0,
|
||||||
|
gold_preproc=False,
|
||||||
|
learn_tokens=False,
|
||||||
|
verbose=False,
|
||||||
|
debug=False,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Train a model. Expects data in spaCy's JSON format.
|
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||||
|
JSON format. To convert data from other formats, use the `spacy convert`
|
||||||
|
command.
|
||||||
"""
|
"""
|
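For orientation, a minimal sketch of calling the refactored entry point directly from Python, roughly equivalent to running "python -m spacy train ..." with the options annotated above. The paths, data files and values below are placeholders, not part of this commit.

    from pathlib import Path
    from spacy.cli import train

    train(
        "en",                      # lang
        Path("/tmp/models"),       # output_path
        Path("train.json"),        # train_path: spaCy-JSON training data
        Path("dev.json"),          # dev_path: spaCy-JSON development data
        pipeline="tagger,parser",  # train only these components
        n_iter=10,
    )

On the command line the same settings map to the short flags declared in the annotations, e.g. -p for the pipeline, -b for a base model to update and -n for the number of iterations.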
||||||
|
msg = Printer()
|
||||||
util.fix_random_seed()
|
util.fix_random_seed()
|
||||||
util.set_env_log(True)
|
util.set_env_log(verbose)
|
||||||
n_sents = n_sents or None
|
|
||||||
output_path = util.ensure_path(output_dir)
|
# Make sure all files and paths exist if they are needed
|
||||||
train_path = util.ensure_path(train_data)
|
train_path = util.ensure_path(train_path)
|
||||||
dev_path = util.ensure_path(dev_data)
|
dev_path = util.ensure_path(dev_path)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
if not train_path.exists():
|
if not train_path or not train_path.exists():
|
||||||
prints(train_path, title=Messages.M050, exits=1)
|
msg.fail(Messages.M050, train_path, exits=1)
|
||||||
if dev_path and not dev_path.exists():
|
if not dev_path or not dev_path.exists():
|
||||||
prints(dev_path, title=Messages.M051, exits=1)
|
msg.fail(Messages.M051, dev_path, exits=1)
|
||||||
if meta_path is not None and not meta_path.exists():
|
if meta_path is not None and not meta_path.exists():
|
||||||
prints(meta_path, title=Messages.M020, exits=1)
|
msg.fail(Messages.M020, meta_path, exits=1)
|
||||||
meta = util.read_json(meta_path) if meta_path else {}
|
meta = util.read_json(meta_path) if meta_path else {}
|
||||||
if not isinstance(meta, dict):
|
if not isinstance(meta, dict):
|
||||||
prints(Messages.M053.format(meta_type=type(meta)),
|
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
|
||||||
title=Messages.M052, exits=1)
|
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
|
||||||
meta.setdefault('lang', lang)
|
msg.fail(Messages.M062, Messages.M065)
|
||||||
meta.setdefault('name', 'unnamed')
|
|
||||||
|
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
output_path.mkdir()
|
output_path.mkdir()
|
||||||
|
|
||||||
print("Counting training words (limit=%s" % n_sents)
|
# Set up the base model and pipeline. If a base model is specified, load
|
||||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
# the model and make sure the pipeline matches the pipeline setting. If
|
||||||
n_train_words = corpus.count_train()
|
# training starts from a blank model, initialize the language class.
|
||||||
print(n_train_words)
|
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||||
pipeline = ['tagger', 'parser', 'ner']
|
msg.text(Messages.M055.format(pipeline=pipeline))
|
||||||
if no_tagger and 'tagger' in pipeline:
|
if base_model:
|
||||||
pipeline.remove('tagger')
|
msg.text(Messages.M056.format(model=base_model))
|
||||||
if no_parser and 'parser' in pipeline:
|
nlp = util.load_model(base_model)
|
||||||
pipeline.remove('parser')
|
if nlp.lang != lang:
|
||||||
if no_entities and 'ner' in pipeline:
|
msg.fail(Messages.M072.format(model_lang=nlp.lang, lang=lang), exits=1)
|
||||||
pipeline.remove('ner')
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
|
||||||
|
nlp.disable_pipes(*other_pipes)
|
||||||
|
for pipe in pipeline:
|
||||||
|
if pipe not in nlp.pipe_names:
|
||||||
|
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||||
|
else:
|
||||||
|
msg.text(Messages.M057.format(model=lang))
|
||||||
|
lang_cls = util.get_lang_class(lang)
|
||||||
|
nlp = lang_cls()
|
||||||
|
for pipe in pipeline:
|
||||||
|
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||||
|
|
||||||
|
if learn_tokens:
|
||||||
|
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
|
||||||
|
|
||||||
# Take dropout and batch size as generators of values -- dropout
|
# Take dropout and batch size as generators of values -- dropout
|
||||||
# starts high and decays sharply, to force the optimizer to explore.
|
# starts high and decays sharply, to force the optimizer to explore.
|
||||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||||
# at the beginning of training.
|
# at the beginning of training.
|
||||||
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
|
dropout_rates = util.decaying(
|
||||||
util.env_opt('dropout_to', 0.1),
|
util.env_opt("dropout_from", 0.1),
|
||||||
util.env_opt('dropout_decay', 0.0))
|
util.env_opt("dropout_to", 0.1),
|
||||||
batch_sizes = util.compounding(util.env_opt('batch_from', 750),
|
util.env_opt("dropout_decay", 0.0),
|
||||||
util.env_opt('batch_to', 750),
|
)
|
||||||
util.env_opt('batch_compound', 1.001))
|
batch_sizes = util.compounding(
|
||||||
|
util.env_opt("batch_from", 750),
|
||||||
|
util.env_opt("batch_to", 750),
|
||||||
|
util.env_opt("batch_compound", 1.001),
|
||||||
|
)
|
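Both schedules are infinite generators that the loop advances with next(). Below is a simplified sketch of what util.decaying and util.compounding yield; this is not the actual implementation, which also honours the env-var overrides read above.

    import itertools

    def decaying(start, stop, decay):
        # Simplified sketch: linear decay from start towards a floor of stop.
        for step in itertools.count():
            yield max(stop, start - decay * step)

    def compounding(start, stop, compound):
        # Simplified sketch (growing case): multiply by a compounding factor,
        # capped at stop.
        value = float(start)
        while True:
            yield min(stop, value)
            value *= compound

    dropout_rates = decaying(0.1, 0.1, 0.0)         # constant 0.1 with these defaults
    batch_sizes = compounding(750.0, 750.0, 1.001)  # likewise constant 750
    print(next(dropout_rates), next(batch_sizes))

Note that with the defaults in this hunk both schedules are effectively constant; the comment about batch sizes starting at 1 and growing only applies when batch_from and batch_to differ.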
||||||
lang_class = util.get_lang_class(lang)
|
lang_class = util.get_lang_class(lang)
|
||||||
nlp = lang_class()
|
nlp = lang_class()
|
||||||
meta['pipeline'] = pipeline
|
meta["pipeline"] = pipeline
|
||||||
nlp.meta.update(meta)
|
nlp.meta.update(meta)
|
||||||
if vectors:
|
if vectors:
|
||||||
print("Load vectors model", vectors)
|
msg.text(Messages.M058.format(model=vectors))
|
||||||
util.load_model(vectors, vocab=nlp.vocab)
|
_load_vectors(nlp, vectors)
|
||||||
for lex in nlp.vocab:
|
|
||||||
values = {}
|
# Multitask objectives
|
||||||
for attr, func in nlp.vocab.lex_attr_getters.items():
|
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
|
||||||
# These attrs are expected to be set by data. Others should
|
for pipe_name, multitasks in multitask_options:
|
||||||
# be set by calling the language functions.
|
if multitasks:
|
||||||
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
if pipe_name not in pipeline:
|
||||||
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
msg.fail(Messages.M059.format(pipe=pipe_name))
|
||||||
lex.set_attrs(**values)
|
pipe = nlp.get_pipe(pipe_name)
|
||||||
lex.is_oov = False
|
for objective in multitasks.split(","):
|
||||||
for name in pipeline:
|
pipe.add_multitask_objective(objective)
|
||||||
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
|
||||||
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
|
# Prepare training corpus
|
||||||
if parser_multitasks:
|
msg.text(Messages.M060.format(limit=n_examples))
|
||||||
for objective in parser_multitasks.split(','):
|
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
|
||||||
nlp.parser.add_multitask_objective(objective)
|
n_train_words = corpus.count_train()
|
||||||
if entity_multitasks:
|
|
||||||
for objective in entity_multitasks.split(','):
|
if base_model:
|
||||||
nlp.entity.add_multitask_objective(objective)
|
# Start with an existing model, use default optimizer
|
||||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
optimizer = create_default_optimizer(Model.ops)
|
||||||
if init_tok2vec is not None:
|
else:
|
||||||
loaded = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
# Start with a blank model, call begin_training
|
||||||
print("Loaded pretrained tok2vec for:", loaded)
|
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||||
nlp._optimizer = None
|
nlp._optimizer = None
|
||||||
|
|
||||||
print("Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS")
|
# Load in pre-trained weights
|
||||||
|
if init_tok2vec is not None:
|
||||||
|
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||||
|
msg.text(Messages.M071.format(components=components))
|
||||||
|
|
||||||
|
print(
|
||||||
|
"\nItn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
train_docs = corpus.train_docs(nlp, noise_level=noise_level,
|
train_docs = corpus.train_docs(
|
||||||
gold_preproc=gold_preproc, max_length=0)
|
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
|
||||||
|
)
|
||||||
words_seen = 0
|
words_seen = 0
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||||
losses = {}
|
losses = {}
|
||||||
for batch in minibatch_by_words(train_docs, size=batch_sizes):
|
for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
|
||||||
if not batch:
|
if not batch:
|
||||||
continue
|
continue
|
||||||
docs, golds = zip(*batch)
|
docs, golds = zip(*batch)
|
||||||
nlp.update(docs, golds, sgd=optimizer,
|
nlp.update(
|
||||||
drop=next(dropout_rates), losses=losses)
|
docs,
|
||||||
|
golds,
|
||||||
|
sgd=optimizer,
|
||||||
|
drop=next(dropout_rates),
|
||||||
|
losses=losses,
|
||||||
|
)
|
||||||
pbar.update(sum(len(doc) for doc in docs))
|
pbar.update(sum(len(doc) for doc in docs))
|
||||||
words_seen += sum(len(doc) for doc in docs)
|
words_seen += sum(len(doc) for doc in docs)
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
util.set_env_log(False)
|
util.set_env_log(False)
|
||||||
epoch_model_path = output_path / ('model%d' % i)
|
epoch_model_path = output_path / ("model%d" % i)
|
||||||
nlp.to_disk(epoch_model_path)
|
nlp.to_disk(epoch_model_path)
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||||
dev_docs = list(corpus.dev_docs(
|
dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
|
||||||
nlp_loaded,
|
|
||||||
gold_preproc=gold_preproc))
|
|
||||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
scorer = nlp_loaded.evaluate(dev_docs, verbose)
|
scorer = nlp_loaded.evaluate(dev_docs, debug)
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
if use_gpu < 0:
|
if use_gpu < 0:
|
||||||
gpu_wps = None
|
gpu_wps = None
|
||||||
cpu_wps = nwords/(end_time-start_time)
|
cpu_wps = nwords / (end_time - start_time)
|
||||||
else:
|
else:
|
||||||
gpu_wps = nwords/(end_time-start_time)
|
gpu_wps = nwords / (end_time - start_time)
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device("cpu"):
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||||
dev_docs = list(corpus.dev_docs(
|
dev_docs = list(
|
||||||
nlp_loaded, gold_preproc=gold_preproc))
|
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
|
||||||
|
)
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
scorer = nlp_loaded.evaluate(dev_docs)
|
scorer = nlp_loaded.evaluate(dev_docs)
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
cpu_wps = nwords/(end_time-start_time)
|
cpu_wps = nwords / (end_time - start_time)
|
||||||
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
|
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
|
||||||
with acc_loc.open('w') as file_:
|
util.write_json(acc_loc, scorer.scores)
|
||||||
file_.write(json_dumps(scorer.scores))
|
|
||||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
|
||||||
meta['accuracy'] = scorer.scores
|
|
||||||
meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
|
|
||||||
'gpu': gpu_wps}
|
|
||||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
|
||||||
'vectors': len(nlp.vocab.vectors),
|
|
||||||
'keys': nlp.vocab.vectors.n_keys}
|
|
||||||
meta['lang'] = nlp.lang
|
|
||||||
meta['pipeline'] = pipeline
|
|
||||||
meta['spacy_version'] = '>=%s' % about.__version__
|
|
||||||
meta.setdefault('name', 'model%d' % i)
|
|
||||||
meta.setdefault('version', version)
|
|
||||||
|
|
||||||
with meta_loc.open('w') as file_:
|
# Update model meta.json
|
||||||
file_.write(json_dumps(meta))
|
meta["lang"] = nlp.lang
|
||||||
util.set_env_log(True)
|
meta["pipeline"] = nlp.pipe_names
|
||||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
|
meta["spacy_version"] = ">=%s" % about.__version__
|
||||||
gpu_wps=gpu_wps)
|
meta["accuracy"] = scorer.scores
|
||||||
|
meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
|
||||||
|
meta["vectors"] = {
|
||||||
|
"width": nlp.vocab.vectors_length,
|
||||||
|
"vectors": len(nlp.vocab.vectors),
|
||||||
|
"keys": nlp.vocab.vectors.n_keys,
|
||||||
|
}
|
||||||
|
meta.setdefault("name", "model%d" % i)
|
||||||
|
meta.setdefault("version", version)
|
||||||
|
meta_loc = output_path / ("model%d" % i) / "meta.json"
|
||||||
|
util.write_json(meta_loc, meta)
|
||||||
|
|
||||||
|
util.set_env_log(verbose)
|
||||||
|
|
||||||
|
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
||||||
finally:
|
finally:
|
||||||
print("Saving model...")
|
with msg.loading(Messages.M061):
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
final_model_path = output_path / 'model-final'
|
final_model_path = output_path / "model-final"
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
components = []
|
msg.good(Messages.M066, util.path2str(final_model_path))
|
||||||
if not no_parser:
|
|
||||||
components.append('parser')
|
_collate_best_model(meta, output_path, nlp.pipe_names)
|
||||||
if not no_tagger:
|
|
||||||
components.append('tagger')
|
|
||||||
if not no_entities:
|
def _load_vectors(nlp, vectors):
|
||||||
components.append('ner')
|
util.load_model(vectors, vocab=nlp.vocab)
|
||||||
_collate_best_model(meta, output_path, components)
|
for lex in nlp.vocab:
|
||||||
|
values = {}
|
||||||
|
for attr, func in nlp.vocab.lex_attr_getters.items():
|
||||||
|
# These attrs are expected to be set by data. Others should
|
||||||
|
# be set by calling the language functions.
|
||||||
|
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
||||||
|
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
||||||
|
lex.set_attrs(**values)
|
||||||
|
lex.is_oov = False
|
||||||
|
|
||||||
|
|
||||||
def _load_pretrained_tok2vec(nlp, loc):
|
def _load_pretrained_tok2vec(nlp, loc):
|
||||||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||||
"""
|
"""
|
||||||
with loc.open('rb') as file_:
|
with loc.open("rb") as file_:
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
loaded = []
|
loaded = []
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||||
component.tok2vec.from_bytes(weights_data)
|
component.tok2vec.from_bytes(weights_data)
|
||||||
loaded.append(name)
|
loaded.append(name)
|
||||||
return loaded
|
return loaded
|
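A hedged sketch of calling this helper on its own, purely to show the call shape. The model name and weights path are placeholders; the weights file is assumed to be the output of 'spacy pretrain'.

    from pathlib import Path
    import spacy

    nlp = spacy.load("en_core_web_sm")  # placeholder pipeline with tok2vec layers
    components = _load_pretrained_tok2vec(nlp, Path("pretrain_weights.bin"))
    print("Loaded pretrained tok2vec weights into:", components)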
||||||
|
@ -222,24 +305,22 @@ def _collate_best_model(meta, output_path, components):
|
||||||
bests = {}
|
bests = {}
|
||||||
for component in components:
|
for component in components:
|
||||||
bests[component] = _find_best(output_path, component)
|
bests[component] = _find_best(output_path, component)
|
||||||
best_dest = output_path / 'model-best'
|
best_dest = output_path / "model-best"
|
||||||
shutil.copytree(output_path / 'model-final', best_dest)
|
shutil.copytree(output_path / "model-final", best_dest)
|
||||||
for component, best_component_src in bests.items():
|
for component, best_component_src in bests.items():
|
||||||
shutil.rmtree(best_dest / component)
|
shutil.rmtree(best_dest / component)
|
||||||
shutil.copytree(best_component_src / component, best_dest / component)
|
shutil.copytree(best_component_src / component, best_dest / component)
|
||||||
with (best_component_src / 'accuracy.json').open() as file_:
|
accs = util.read_json(best_component_src / "accuracy.json")
|
||||||
accs = json.load(file_)
|
|
||||||
for metric in _get_metrics(component):
|
for metric in _get_metrics(component):
|
||||||
meta['accuracy'][metric] = accs[metric]
|
meta["accuracy"][metric] = accs[metric]
|
||||||
with (best_dest / 'meta.json').open('w') as file_:
|
util.write_json(best_dest / "meta.json", meta)
|
||||||
file_.write(json_dumps(meta))
|
|
||||||
|
|
||||||
|
|
||||||
def _find_best(experiment_dir, component):
|
def _find_best(experiment_dir, component):
|
||||||
accuracies = []
|
accuracies = []
|
||||||
for epoch_model in experiment_dir.iterdir():
|
for epoch_model in experiment_dir.iterdir():
|
||||||
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
||||||
accs = json.load((epoch_model / "accuracy.json").open())
|
accs = util.read_json(epoch_model / "accuracy.json")
|
||||||
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
||||||
accuracies.append((scores, epoch_model))
|
accuracies.append((scores, epoch_model))
|
||||||
if accuracies:
|
if accuracies:
|
||||||
|
@ -247,6 +328,7 @@ def _find_best(experiment_dir, component):
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _get_metrics(component):
|
def _get_metrics(component):
|
||||||
if component == "parser":
|
if component == "parser":
|
||||||
return ("las", "uas", "token_acc")
|
return ("las", "uas", "token_acc")
|
||||||
|
@ -257,50 +339,40 @@ def _get_metrics(component):
|
||||||
return ("token_acc",)
|
return ("token_acc",)
|
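To make the collation concrete, a small hedged illustration of the directory layout these helpers assume; the file names come from the code above, the scores are invented.

    from pathlib import Path

    # Given an output directory such as
    #   output/
    #     model0/accuracy.json   -> {"las": 80.1, "uas": 83.0, "token_acc": 99.1, ...}
    #     model1/accuracy.json   -> {"las": 82.4, "uas": 84.9, "token_acc": 99.1, ...}
    #     model-final/
    # _find_best(Path("output"), "parser") compares the (las, uas, token_acc)
    # tuples per epoch directory and, given the selection line elided from this
    # hunk, should return the path to model1. _collate_best_model then copies
    # model-final to model-best, swaps in model1's "parser" subdirectory and
    # merges its parser metrics into meta["accuracy"].
    best_parser_dir = _find_best(Path("output"), "parser")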
||||||
|
|
||||||
|
|
||||||
def _render_parses(i, to_render):
|
|
||||||
to_render[0].user_data['title'] = "Batch %d" % i
|
|
||||||
with Path('/tmp/entities.html').open('w') as file_:
|
|
||||||
html = displacy.render(to_render[:5], style='ent', page=True)
|
|
||||||
file_.write(html)
|
|
||||||
with Path('/tmp/parses.html').open('w') as file_:
|
|
||||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
|
||||||
file_.write(html)
|
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
|
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
|
||||||
scores = {}
|
scores = {}
|
||||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
|
for col in [
|
||||||
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
|
"dep_loss",
|
||||||
|
"tag_loss",
|
||||||
|
"uas",
|
||||||
|
"tags_acc",
|
||||||
|
"token_acc",
|
||||||
|
"ents_p",
|
||||||
|
"ents_r",
|
||||||
|
"ents_f",
|
||||||
|
"cpu_wps",
|
||||||
|
"gpu_wps",
|
||||||
|
]:
|
||||||
scores[col] = 0.0
|
scores[col] = 0.0
|
||||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
scores["dep_loss"] = losses.get("parser", 0.0)
|
||||||
scores['ner_loss'] = losses.get('ner', 0.0)
|
scores["ner_loss"] = losses.get("ner", 0.0)
|
||||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
scores["tag_loss"] = losses.get("tagger", 0.0)
|
||||||
scores.update(dev_scores)
|
scores.update(dev_scores)
|
||||||
scores['cpu_wps'] = cpu_wps
|
scores["cpu_wps"] = cpu_wps
|
||||||
scores['gpu_wps'] = gpu_wps or 0.0
|
scores["gpu_wps"] = gpu_wps or 0.0
|
||||||
tpl = ''.join((
|
tpl = "".join(
|
||||||
'{:<6d}',
|
(
|
||||||
'{dep_loss:<10.3f}',
|
"{:<6d}",
|
||||||
'{ner_loss:<10.3f}',
|
"{dep_loss:<10.3f}",
|
||||||
'{uas:<8.3f}',
|
"{ner_loss:<10.3f}",
|
||||||
'{ents_p:<8.3f}',
|
"{uas:<8.3f}",
|
||||||
'{ents_r:<8.3f}',
|
"{ents_p:<8.3f}",
|
||||||
'{ents_f:<8.3f}',
|
"{ents_r:<8.3f}",
|
||||||
'{tags_acc:<8.3f}',
|
"{ents_f:<8.3f}",
|
||||||
'{token_acc:<9.3f}',
|
"{tags_acc:<8.3f}",
|
||||||
'{cpu_wps:<9.1f}',
|
"{token_acc:<9.3f}",
|
||||||
'{gpu_wps:.1f}',
|
"{cpu_wps:<9.1f}",
|
||||||
))
|
"{gpu_wps:.1f}",
|
||||||
|
)
|
||||||
|
)
|
||||||
print(tpl.format(itn, **scores))
|
print(tpl.format(itn, **scores))
|
||||||
|
|
||||||
|
|
||||||
def print_results(scorer):
|
|
||||||
results = {
|
|
||||||
'TOK': '%.2f' % scorer.token_acc,
|
|
||||||
'POS': '%.2f' % scorer.tags_acc,
|
|
||||||
'UAS': '%.2f' % scorer.uas,
|
|
||||||
'LAS': '%.2f' % scorer.las,
|
|
||||||
'NER P': '%.2f' % scorer.ents_p,
|
|
||||||
'NER R': '%.2f' % scorer.ents_r,
|
|
||||||
'NER F': '%.2f' % scorer.ents_f}
|
|
||||||
util.print_table(results, title="Results")
|
|
||||||
|
|
2
spacy/cli/ud/__init__.py
Normal file
2
spacy/cli/ud/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
from .conll17_ud_eval import main as ud_evaluate # noqa: F401
|
||||||
|
from .ud_train import main as ud_train # noqa: F401
|
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# flake8: noqa
|
||||||
|
|
||||||
# CoNLL 2017 UD Parsing evaluation script.
|
# CoNLL 2017 UD Parsing evaluation script.
|
||||||
#
|
#
|
||||||
|
@ -214,7 +215,7 @@ def load_conllu(file):
|
||||||
start, end = map(int, columns[ID].split("-"))
|
start, end = map(int, columns[ID].split("-"))
|
||||||
except:
|
except:
|
||||||
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
|
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
|
||||||
|
|
||||||
for _ in range(start, end + 1):
|
for _ in range(start, end + 1):
|
||||||
word_line = file.readline().rstrip("\r\n")
|
word_line = file.readline().rstrip("\r\n")
|
||||||
word_columns = word_line.split("\t")
|
word_columns = word_line.split("\t")
|
|
@ -1,7 +1,9 @@
|
||||||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
# flake8: noqa
|
||||||
|
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
.conllu format for development data, allowing the official scorer to be used.
|
||||||
'''
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import tqdm
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -11,15 +13,17 @@ import json
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
import spacy.util
|
import spacy.util
|
||||||
from ..tokens import Token, Doc
|
from ...tokens import Token, Doc
|
||||||
from ..gold import GoldParse
|
from ...gold import GoldParse
|
||||||
from ..util import compounding, minibatch_by_words
|
from ...util import compounding, minibatch_by_words
|
||||||
from ..syntax.nonproj import projectivize
|
from ...syntax.nonproj import projectivize
|
||||||
from ..matcher import Matcher
|
from ...matcher import Matcher
|
||||||
#from ..morphology import Fused_begin, Fused_inside
|
|
||||||
from .. import displacy
|
# from ...morphology import Fused_begin, Fused_inside
|
||||||
|
from ... import displacy
|
||||||
from collections import defaultdict, Counter
|
from collections import defaultdict, Counter
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
|
||||||
Fused_begin = None
|
Fused_begin = None
|
||||||
Fused_inside = None
|
Fused_inside = None
|
||||||
|
|
||||||
|
@ -30,43 +34,45 @@ import cytoolz
|
||||||
|
|
||||||
from . import conll17_ud_eval
|
from . import conll17_ud_eval
|
||||||
|
|
||||||
from .. import lang
|
from ... import lang
|
||||||
from .. import lang
|
from ...lang import zh
|
||||||
from ..lang import zh
|
from ...lang import ja
|
||||||
from ..lang import ja
|
from ...lang import ru
|
||||||
from ..lang import ru
|
|
||||||
|
|
||||||
|
|
||||||
################
|
################
|
||||||
# Data reading #
|
# Data reading #
|
||||||
################
|
################
|
||||||
|
|
||||||
space_re = re.compile('\s+')
|
space_re = re.compile("\s+")
|
||||||
|
|
||||||
|
|
||||||
def split_text(text):
|
def split_text(text):
|
||||||
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
||||||
|
|
||||||
|
|
||||||
##############
|
##############
|
||||||
# Evaluation #
|
# Evaluation #
|
||||||
##############
|
##############
|
||||||
|
|
||||||
|
|
||||||
def read_conllu(file_):
|
def read_conllu(file_):
|
||||||
docs = []
|
docs = []
|
||||||
sent = []
|
sent = []
|
||||||
doc = []
|
doc = []
|
||||||
for line in file_:
|
for line in file_:
|
||||||
if line.startswith('# newdoc'):
|
if line.startswith("# newdoc"):
|
||||||
if doc:
|
if doc:
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
doc = []
|
doc = []
|
||||||
elif line.startswith('#'):
|
elif line.startswith("#"):
|
||||||
continue
|
continue
|
||||||
elif not line.strip():
|
elif not line.strip():
|
||||||
if sent:
|
if sent:
|
||||||
doc.append(sent)
|
doc.append(sent)
|
||||||
sent = []
|
sent = []
|
||||||
else:
|
else:
|
||||||
sent.append(list(line.strip().split('\t')))
|
sent.append(list(line.strip().split("\t")))
|
||||||
if len(sent[-1]) != 10:
|
if len(sent[-1]) != 10:
|
||||||
print(repr(line))
|
print(repr(line))
|
||||||
raise ValueError
|
raise ValueError
|
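A tiny hedged example of the nested structure read_conllu builds: a list of documents, each a list of sentences, each a list of 10-column rows. The sample below is invented; whether the trailing document is also appended depends on the tail of the function, which is elided from this hunk.

    from io import StringIO

    sample = (
        "# newdoc id = doc1\n"
        "# sent_id = 1\n"
        "1\tHello\thello\tINTJ\tUH\t_\t0\troot\t_\t_\n"
        "2\tworld\tworld\tNOUN\tNN\t_\t1\tvocative\t_\t_\n"
        "\n"
        "# newdoc id = doc2\n"
        "# sent_id = 2\n"
        "1\tHi\thi\tINTJ\tUH\t_\t0\troot\t_\t_\n"
        "\n"
    )
    docs = read_conllu(StringIO(sample))
    # The first document is closed off as soon as the second "# newdoc" is seen.
    print(len(docs[0]))        # 1 sentence
    print(len(docs[0][0]))     # 2 rows
    print(len(docs[0][0][0]))  # 10 columns per row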
||||||
|
@ -78,7 +84,7 @@ def read_conllu(file_):
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
if text_loc.parts[-1].endswith('.conllu'):
|
if text_loc.parts[-1].endswith(".conllu"):
|
||||||
docs = []
|
docs = []
|
||||||
with text_loc.open() as file_:
|
with text_loc.open() as file_:
|
||||||
for conllu_doc in read_conllu(file_):
|
for conllu_doc in read_conllu(file_):
|
||||||
|
@ -88,14 +94,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
docs = list(component.pipe(docs))
|
docs = list(component.pipe(docs))
|
||||||
else:
|
else:
|
||||||
with text_loc.open('r', encoding='utf8') as text_file:
|
with text_loc.open("r", encoding="utf8") as text_file:
|
||||||
texts = split_text(text_file.read())
|
texts = split_text(text_file.read())
|
||||||
docs = list(nlp.pipe(texts))
|
docs = list(nlp.pipe(texts))
|
||||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||||
write_conllu(docs, out_file)
|
write_conllu(docs, out_file)
|
||||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||||
return docs, scores
|
return docs, scores
|
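A hedged sketch of driving this evaluation round-trip by hand; the model path and treebank files are placeholders for CoNLL 2017 shared-task data.

    from pathlib import Path
    import spacy

    nlp = spacy.load("/path/to/experiments/UD_English/best-model")  # placeholder
    docs, scores = evaluate(
        nlp,
        Path("en-ud-test.txt"),         # raw text (a .conllu path would be parsed instead)
        Path("en-ud-test.conllu"),      # gold annotations
        Path("en-ud-test-sys.conllu"),  # where the system output is written
    )
    print(scores["LAS"].f1, scores["UAS"].f1)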
||||||
|
@ -103,26 +109,26 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
|
|
||||||
def write_conllu(docs, file_):
|
def write_conllu(docs, file_):
|
||||||
merger = Matcher(docs[0].vocab)
|
merger = Matcher(docs[0].vocab)
|
||||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start:end+1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||||
for start_char, end_char in offsets:
|
for start_char, end_char in offsets:
|
||||||
doc.merge(start_char, end_char)
|
doc.merge(start_char, end_char)
|
||||||
# TODO: This shouldn't be necessary? Should be handled in merge
|
# TODO: This shouldn't be necessary? Should be handled in merge
|
||||||
for word in doc:
|
for word in doc:
|
||||||
if word.i == word.head.i:
|
if word.i == word.head.i:
|
||||||
word.dep_ = 'ROOT'
|
word.dep_ = "ROOT"
|
||||||
file_.write("# newdoc id = {i}\n".format(i=i))
|
file_.write("# newdoc id = {i}\n".format(i=i))
|
||||||
for j, sent in enumerate(doc.sents):
|
for j, sent in enumerate(doc.sents):
|
||||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||||
file_.write("# text = {text}\n".format(text=sent.text))
|
file_.write("# text = {text}\n".format(text=sent.text))
|
||||||
for k, token in enumerate(sent):
|
for k, token in enumerate(sent):
|
||||||
file_.write(_get_token_conllu(token, k, len(sent)) + '\n')
|
file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
|
||||||
file_.write('\n')
|
file_.write("\n")
|
||||||
for word in sent:
|
for word in sent:
|
||||||
if word.head.i == word.i and word.dep_ == 'ROOT':
|
if word.head.i == word.i and word.dep_ == "ROOT":
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print("Rootless sentence!")
|
print("Rootless sentence!")
|
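The merge at the top of this function undoes the parser's subtoken splits. Repackaged as a hedged standalone helper, with the reason for collecting character offsets first spelled out:

    from spacy.matcher import Matcher

    def merge_subtok_spans(doc):
        # Find maximal runs of tokens attached with the "subtok" dependency
        # (the label used for split subtokens) and merge each run back into a
        # single token, mirroring the code above.
        merger = Matcher(doc.vocab)
        merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
        matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        # Character offsets stay valid while token indices shift during merging,
        # which is why the offsets are collected before any merge happens.
        offsets = [(span.start_char, span.end_char) for span in spans]
        for start_char, end_char in offsets:
            doc.merge(start_char, end_char)
        return doc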
||||||
|
@ -134,24 +140,34 @@ def write_conllu(docs, file_):
|
||||||
|
|
||||||
|
|
||||||
def _get_token_conllu(token, k, sent_len):
|
def _get_token_conllu(token, k, sent_len):
|
||||||
if token.check_morph(Fused_begin) and (k+1 < sent_len):
|
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
|
||||||
n = 1
|
n = 1
|
||||||
text = [token.text]
|
text = [token.text]
|
||||||
while token.nbor(n).check_morph(Fused_inside):
|
while token.nbor(n).check_morph(Fused_inside):
|
||||||
text.append(token.nbor(n).text)
|
text.append(token.nbor(n).text)
|
||||||
n += 1
|
n += 1
|
||||||
id_ = '%d-%d' % (k+1, (k+n))
|
id_ = "%d-%d" % (k + 1, (k + n))
|
||||||
fields = [id_, ''.join(text)] + ['_'] * 8
|
fields = [id_, "".join(text)] + ["_"] * 8
|
||||||
lines = ['\t'.join(fields)]
|
lines = ["\t".join(fields)]
|
||||||
else:
|
else:
|
||||||
lines = []
|
lines = []
|
||||||
if token.head.i == token.i:
|
if token.head.i == token.i:
|
||||||
head = 0
|
head = 0
|
||||||
else:
|
else:
|
||||||
head = k + (token.head.i - token.i) + 1
|
head = k + (token.head.i - token.i) + 1
|
||||||
fields = [str(k+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
fields = [
|
||||||
str(head), token.dep_.lower(), '_', '_']
|
str(k + 1),
|
||||||
if token.check_morph(Fused_begin) and (k+1 < sent_len):
|
token.text,
|
||||||
|
token.lemma_,
|
||||||
|
token.pos_,
|
||||||
|
token.tag_,
|
||||||
|
"_",
|
||||||
|
str(head),
|
||||||
|
token.dep_.lower(),
|
||||||
|
"_",
|
||||||
|
"_",
|
||||||
|
]
|
||||||
|
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
|
||||||
if k == 0:
|
if k == 0:
|
||||||
fields[1] = token.norm_[0].upper() + token.norm_[1:]
|
fields[1] = token.norm_[0].upper() + token.norm_[1:]
|
||||||
else:
|
else:
|
||||||
|
@ -163,18 +179,18 @@ def _get_token_conllu(token, k, sent_len):
|
||||||
split_end = token._.split_end
|
split_end = token._.split_end
|
||||||
split_len = (split_end.i - split_start.i) + 1
|
split_len = (split_end.i - split_start.i) + 1
|
||||||
n_in_split = token.i - split_start.i
|
n_in_split = token.i - split_start.i
|
||||||
subtokens = guess_fused_orths(split_start.text, [''] * split_len)
|
subtokens = guess_fused_orths(split_start.text, [""] * split_len)
|
||||||
fields[1] = subtokens[n_in_split]
|
fields[1] = subtokens[n_in_split]
|
||||||
|
|
||||||
lines.append('\t'.join(fields))
|
lines.append("\t".join(fields))
|
||||||
return '\n'.join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
def guess_fused_orths(word, ud_forms):
|
def guess_fused_orths(word, ud_forms):
|
||||||
'''The UD data 'fused tokens' don't necessarily expand to keys that match
|
"""The UD data 'fused tokens' don't necessarily expand to keys that match
|
||||||
the form. We need orths that exactly match the string. Here we make a best
|
the form. We need orths that exactly match the string. Here we make a best
|
||||||
effort to divide up the word.'''
|
effort to divide up the word."""
|
||||||
if word == ''.join(ud_forms):
|
if word == "".join(ud_forms):
|
||||||
# Happy case: we get a perfect split, with each letter accounted for.
|
# Happy case: we get a perfect split, with each letter accounted for.
|
||||||
return ud_forms
|
return ud_forms
|
||||||
elif len(word) == sum(len(subtoken) for subtoken in ud_forms):
|
elif len(word) == sum(len(subtoken) for subtoken in ud_forms):
|
||||||
|
@ -183,16 +199,16 @@ def guess_fused_orths(word, ud_forms):
|
||||||
remain = word
|
remain = word
|
||||||
for subtoken in ud_forms:
|
for subtoken in ud_forms:
|
||||||
assert len(subtoken) >= 1
|
assert len(subtoken) >= 1
|
||||||
output.append(remain[:len(subtoken)])
|
output.append(remain[: len(subtoken)])
|
||||||
remain = remain[len(subtoken):]
|
remain = remain[len(subtoken) :]
|
||||||
assert len(remain) == 0, (word, ud_forms, remain)
|
assert len(remain) == 0, (word, ud_forms, remain)
|
||||||
return output
|
return output
|
||||||
else:
|
else:
|
||||||
# Let's say word is 6 long, and there are three subtokens. The orths
|
# Let's say word is 6 long, and there are three subtokens. The orths
|
||||||
# *must* equal the original string. Arbitrarily, split [4, 1, 1]
|
# *must* equal the original string. Arbitrarily, split [4, 1, 1]
|
||||||
first = word[:len(word)-(len(ud_forms)-1)]
|
first = word[: len(word) - (len(ud_forms) - 1)]
|
||||||
output = [first]
|
output = [first]
|
||||||
remain = word[len(first):]
|
remain = word[len(first) :]
|
||||||
for i in range(1, len(ud_forms)):
|
for i in range(1, len(ud_forms)):
|
||||||
assert remain
|
assert remain
|
||||||
output.append(remain[:1])
|
output.append(remain[:1])
|
||||||
|
@ -201,60 +217,50 @@ def guess_fused_orths(word, ud_forms):
|
||||||
return output
|
return output
|
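A couple of hedged examples of how the branches above behave; the inputs are artificial and chosen only to exercise each branch.

    # Branch 1: the subtoken forms concatenate exactly to the surface string.
    assert guess_fused_orths("dela", ["de", "la"]) == ["de", "la"]
    # Branch 2: the letters differ but the lengths add up, so the surface
    # string is sliced into chunks of the subtoken lengths.
    assert guess_fused_orths("abcd", ["ax", "cd"]) == ["ab", "cd"]
    # Branch 3 (lengths don't add up) keeps a long first chunk and then, given
    # the tail of that branch elided from this hunk, one character per
    # remaining subtoken, e.g. "vamos" with ["vamos", "nos"] -> ["vamo", "s"].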
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def print_results(name, ud_scores):
|
def print_results(name, ud_scores):
|
||||||
fields = {}
|
fields = {}
|
||||||
if ud_scores is not None:
|
if ud_scores is not None:
|
||||||
fields.update({
|
fields.update(
|
||||||
'words': ud_scores['Words'].f1 * 100,
|
{
|
||||||
'sents': ud_scores['Sentences'].f1 * 100,
|
"words": ud_scores["Words"].f1 * 100,
|
||||||
'tags': ud_scores['XPOS'].f1 * 100,
|
"sents": ud_scores["Sentences"].f1 * 100,
|
||||||
'uas': ud_scores['UAS'].f1 * 100,
|
"tags": ud_scores["XPOS"].f1 * 100,
|
||||||
'las': ud_scores['LAS'].f1 * 100,
|
"uas": ud_scores["UAS"].f1 * 100,
|
||||||
})
|
"las": ud_scores["LAS"].f1 * 100,
|
||||||
|
}
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
fields.update({
|
fields.update({"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0})
|
||||||
'words': 0.0,
|
tpl = "\t".join(
|
||||||
'sents': 0.0,
|
(name, "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}")
|
||||||
'tags': 0.0,
|
)
|
||||||
'uas': 0.0,
|
|
||||||
'las': 0.0
|
|
||||||
})
|
|
||||||
tpl = '\t'.join((
|
|
||||||
name,
|
|
||||||
'{las:.1f}',
|
|
||||||
'{uas:.1f}',
|
|
||||||
'{tags:.1f}',
|
|
||||||
'{sents:.1f}',
|
|
||||||
'{words:.1f}',
|
|
||||||
))
|
|
||||||
print(tpl.format(**fields))
|
print(tpl.format(**fields))
|
||||||
return fields
|
return fields
|
||||||
|
|
||||||
|
|
||||||
def get_token_split_start(token):
|
def get_token_split_start(token):
|
||||||
if token.text == '':
|
if token.text == "":
|
||||||
assert token.i != 0
|
assert token.i != 0
|
||||||
i = -1
|
i = -1
|
||||||
while token.nbor(i).text == '':
|
while token.nbor(i).text == "":
|
||||||
i -= 1
|
i -= 1
|
||||||
return token.nbor(i)
|
return token.nbor(i)
|
||||||
elif (token.i+1) < len(token.doc) and token.nbor(1).text == '':
|
elif (token.i + 1) < len(token.doc) and token.nbor(1).text == "":
|
||||||
return token
|
return token
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_token_split_end(token):
|
def get_token_split_end(token):
|
||||||
if (token.i+1) == len(token.doc):
|
if (token.i + 1) == len(token.doc):
|
||||||
return token if token.text == '' else None
|
return token if token.text == "" else None
|
||||||
elif token.text != '' and token.nbor(1).text != '':
|
elif token.text != "" and token.nbor(1).text != "":
|
||||||
return None
|
return None
|
||||||
i = 1
|
i = 1
|
||||||
while (token.i+i) < len(token.doc) and token.nbor(i).text == '':
|
while (token.i + i) < len(token.doc) and token.nbor(i).text == "":
|
||||||
i += 1
|
i += 1
|
||||||
return token.nbor(i-1)
|
return token.nbor(i - 1)
|
||||||
|
|
||||||
|
|
||||||
##################
|
##################
|
||||||
# Initialization #
|
# Initialization #
|
||||||
|
@ -262,54 +268,73 @@ def get_token_split_end(token):
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(experiments_dir, corpus):
|
def load_nlp(experiments_dir, corpus):
|
||||||
nlp = spacy.load(experiments_dir / corpus / 'best-model')
|
nlp = spacy.load(experiments_dir / corpus / "best-model")
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
test_data_dir=("Path to Universal Dependencies test data", "positional", None, Path),
|
test_data_dir=(
|
||||||
|
"Path to Universal Dependencies test data",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
Path,
|
||||||
|
),
|
||||||
experiment_dir=("Parent directory with output model", "positional", None, Path),
|
experiment_dir=("Parent directory with output model", "positional", None, Path),
|
||||||
corpus=("UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc", "positional", None, str),
|
corpus=(
|
||||||
|
"UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
str,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
def main(test_data_dir, experiment_dir, corpus):
|
def main(test_data_dir, experiment_dir, corpus):
|
||||||
Token.set_extension('split_start', getter=get_token_split_start)
|
Token.set_extension("split_start", getter=get_token_split_start)
|
||||||
Token.set_extension('split_end', getter=get_token_split_end)
|
Token.set_extension("split_end", getter=get_token_split_end)
|
||||||
Token.set_extension('begins_fused', default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
Token.set_extension('inside_fused', default=False)
|
Token.set_extension("inside_fused", default=False)
|
||||||
lang.zh.Chinese.Defaults.use_jieba = False
|
lang.zh.Chinese.Defaults.use_jieba = False
|
||||||
lang.ja.Japanese.Defaults.use_janome = False
|
lang.ja.Japanese.Defaults.use_janome = False
|
||||||
lang.ru.Russian.Defaults.use_pymorphy2 = False
|
lang.ru.Russian.Defaults.use_pymorphy2 = False
|
||||||
|
|
||||||
nlp = load_nlp(experiment_dir, corpus)
|
nlp = load_nlp(experiment_dir, corpus)
|
||||||
|
|
||||||
treebank_code = nlp.meta['treebank']
|
|
||||||
for section in ('test', 'dev'):
|
|
||||||
if section == 'dev':
|
|
||||||
section_dir = 'conll17-ud-development-2017-03-19'
|
|
||||||
else:
|
|
||||||
section_dir = 'conll17-ud-test-2017-05-09'
|
|
||||||
text_path = test_data_dir / 'input' / section_dir / (treebank_code+'.txt')
|
|
||||||
udpipe_path = test_data_dir / 'input' / section_dir / (treebank_code+'-udpipe.conllu')
|
|
||||||
gold_path = test_data_dir / 'gold' / section_dir / (treebank_code+'.conllu')
|
|
||||||
|
|
||||||
header = [section, 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
treebank_code = nlp.meta["treebank"]
|
||||||
print('\t'.join(header))
|
for section in ("test", "dev"):
|
||||||
inputs = {'gold': gold_path, 'udp': udpipe_path, 'raw': text_path}
|
if section == "dev":
|
||||||
for input_type in ('udp', 'raw'):
|
section_dir = "conll17-ud-development-2017-03-19"
|
||||||
|
else:
|
||||||
|
section_dir = "conll17-ud-test-2017-05-09"
|
||||||
|
text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
|
||||||
|
udpipe_path = (
|
||||||
|
test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
|
||||||
|
)
|
||||||
|
gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")
|
||||||
|
|
||||||
|
header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
|
||||||
|
print("\t".join(header))
|
||||||
|
inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
|
||||||
|
for input_type in ("udp", "raw"):
|
||||||
input_path = inputs[input_type]
|
input_path = inputs[input_type]
|
||||||
output_path = experiment_dir / corpus / '{section}.conllu'.format(section=section)
|
output_path = (
|
||||||
|
experiment_dir / corpus / "{section}.conllu".format(section=section)
|
||||||
|
)
|
||||||
|
|
||||||
parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
|
parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
|
||||||
|
|
||||||
accuracy = print_results(input_type, test_scores)
|
accuracy = print_results(input_type, test_scores)
|
||||||
acc_path = experiment_dir / corpus / '{section}-accuracy.json'.format(section=section)
|
acc_path = (
|
||||||
with open(acc_path, 'w') as file_:
|
experiment_dir
|
||||||
|
/ corpus
|
||||||
|
/ "{section}-accuracy.json".format(section=section)
|
||||||
|
)
|
||||||
|
with open(acc_path, "w") as file_:
|
||||||
file_.write(json.dumps(accuracy, indent=2))
|
file_.write(json.dumps(accuracy, indent=2))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
|
|
@ -1,7 +1,9 @@
|
||||||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
# flake8: noqa
|
||||||
|
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
.conllu format for development data, allowing the official scorer to be used.
|
||||||
'''
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import tqdm
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -11,12 +13,12 @@ import json
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
import spacy.util
|
import spacy.util
|
||||||
from ..tokens import Token, Doc
|
from ...tokens import Token, Doc
|
||||||
from ..gold import GoldParse
|
from ...gold import GoldParse
|
||||||
from ..util import compounding, minibatch, minibatch_by_words
|
from ...util import compounding, minibatch, minibatch_by_words
|
||||||
from ..syntax.nonproj import projectivize
|
from ...syntax.nonproj import projectivize
|
||||||
from ..matcher import Matcher
|
from ...matcher import Matcher
|
||||||
from .. import displacy
|
from ... import displacy
|
||||||
from collections import defaultdict, Counter
|
from collections import defaultdict, Counter
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
|
||||||
|
@ -27,10 +29,9 @@ import cytoolz
|
||||||
|
|
||||||
from . import conll17_ud_eval
|
from . import conll17_ud_eval
|
||||||
|
|
||||||
from .. import lang
|
from ... import lang
|
||||||
from .. import lang
|
from ...lang import zh
|
||||||
from ..lang import zh
|
from ...lang import ja
|
||||||
from ..lang import ja
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import torch
|
import torch
|
||||||
|
@ -42,17 +43,26 @@ except ImportError:
|
||||||
# Data reading #
|
# Data reading #
|
||||||
################
|
################
|
||||||
|
|
||||||
space_re = re.compile('\s+')
|
space_re = re.compile("\s+")
|
||||||
def split_text(text):
|
|
||||||
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
|
|
||||||
max_doc_length=None, limit=None):
|
def split_text(text):
|
||||||
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
||||||
|
|
||||||
|
|
||||||
|
def read_data(
|
||||||
|
nlp,
|
||||||
|
conllu_file,
|
||||||
|
text_file,
|
||||||
|
raw_text=True,
|
||||||
|
oracle_segments=False,
|
||||||
|
max_doc_length=None,
|
||||||
|
limit=None,
|
||||||
|
):
|
||||||
|
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||||
include Doc objects created using nlp.make_doc and then aligned against
|
include Doc objects created using nlp.make_doc and then aligned against
|
||||||
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
||||||
created from the gold-standard segments. At least one must be True.'''
|
created from the gold-standard segments. At least one must be True."""
|
||||||
if not raw_text and not oracle_segments:
|
if not raw_text and not oracle_segments:
|
||||||
raise ValueError("At least one of raw_text or oracle_segments must be True")
|
raise ValueError("At least one of raw_text or oracle_segments must be True")
|
||||||
paragraphs = split_text(text_file.read())
|
paragraphs = split_text(text_file.read())
|
||||||
|
@ -66,22 +76,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
|
||||||
for cs in cd:
|
for cs in cd:
|
||||||
sent = defaultdict(list)
|
sent = defaultdict(list)
|
||||||
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
|
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
|
||||||
if '.' in id_:
|
if "." in id_:
|
||||||
continue
|
continue
|
||||||
if '-' in id_:
|
if "-" in id_:
|
||||||
continue
|
continue
|
||||||
id_ = int(id_)-1
|
id_ = int(id_) - 1
|
||||||
head = int(head)-1 if head != '0' else id_
|
head = int(head) - 1 if head != "0" else id_
|
||||||
sent['words'].append(word)
|
sent["words"].append(word)
|
||||||
sent['tags'].append(tag)
|
sent["tags"].append(tag)
|
||||||
sent['heads'].append(head)
|
sent["heads"].append(head)
|
||||||
sent['deps'].append('ROOT' if dep == 'root' else dep)
|
sent["deps"].append("ROOT" if dep == "root" else dep)
|
||||||
sent['spaces'].append(space_after == '_')
|
sent["spaces"].append(space_after == "_")
|
||||||
sent['entities'] = ['-'] * len(sent['words'])
|
sent["entities"] = ["-"] * len(sent["words"])
|
||||||
sent['heads'], sent['deps'] = projectivize(sent['heads'],
|
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
|
||||||
sent['deps'])
|
|
||||||
if oracle_segments:
|
if oracle_segments:
|
||||||
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
|
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
|
||||||
golds.append(GoldParse(docs[-1], **sent))
|
golds.append(GoldParse(docs[-1], **sent))
|
||||||
|
|
||||||
sent_annots.append(sent)
|
sent_annots.append(sent)
|
||||||
|
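A hedged usage sketch; the paths are placeholders, and the return of the accumulated docs and golds sits in the part of the function elided between these hunks.

    import spacy

    nlp = spacy.blank("en")
    with open("en_ewt-ud-train.conllu") as conllu_file, \
            open("en_ewt-ud-train.txt") as text_file:
        docs, golds = read_data(
            nlp,
            conllu_file,
            text_file,
            oracle_segments=True,  # trust the gold sentence/token boundaries
            raw_text=False,        # don't also align against raw paragraphs
            max_doc_length=10,     # assumed cap on sentences grouped per Doc
            limit=1000,            # placeholder cap on examples
        )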
@ -107,18 +116,18 @@ def read_conllu(file_):
|
||||||
sent = []
|
sent = []
|
||||||
doc = []
|
doc = []
|
||||||
for line in file_:
|
for line in file_:
|
||||||
if line.startswith('# newdoc'):
|
if line.startswith("# newdoc"):
|
||||||
if doc:
|
if doc:
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
doc = []
|
doc = []
|
||||||
elif line.startswith('#'):
|
elif line.startswith("#"):
|
||||||
continue
|
continue
|
||||||
elif not line.strip():
|
elif not line.strip():
|
||||||
if sent:
|
if sent:
|
||||||
doc.append(sent)
|
doc.append(sent)
|
||||||
sent = []
|
sent = []
|
||||||
else:
|
else:
|
||||||
sent.append(list(line.strip().split('\t')))
|
sent.append(list(line.strip().split("\t")))
|
||||||
if len(sent[-1]) != 10:
|
if len(sent[-1]) != 10:
|
||||||
print(repr(line))
|
print(repr(line))
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
@ -134,17 +143,19 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
||||||
flat = defaultdict(list)
|
flat = defaultdict(list)
|
||||||
sent_starts = []
|
sent_starts = []
|
||||||
for sent in sent_annots:
|
for sent in sent_annots:
|
||||||
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
|
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
|
||||||
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
|
for field in ["words", "tags", "deps", "entities", "spaces"]:
|
||||||
flat[field].extend(sent[field])
|
flat[field].extend(sent[field])
|
||||||
sent_starts.append(True)
|
sent_starts.append(True)
|
||||||
sent_starts.extend([False] * (len(sent['words'])-1))
|
sent_starts.extend([False] * (len(sent["words"]) - 1))
|
||||||
# Construct text if necessary
|
# Construct text if necessary
|
||||||
assert len(flat['words']) == len(flat['spaces'])
|
assert len(flat["words"]) == len(flat["spaces"])
|
||||||
if text is None:
|
if text is None:
|
||||||
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
|
text = "".join(
|
||||||
|
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
|
||||||
|
)
|
||||||
doc = nlp.make_doc(text)
|
doc = nlp.make_doc(text)
|
||||||
flat.pop('spaces')
|
flat.pop("spaces")
|
||||||
gold = GoldParse(doc, **flat)
|
gold = GoldParse(doc, **flat)
|
||||||
gold.sent_starts = sent_starts
|
gold.sent_starts = sent_starts
|
||||||
for i in range(len(gold.heads)):
|
for i in range(len(gold.heads)):
|
||||||
|
@ -154,13 +165,15 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
||||||
|
|
||||||
return doc, gold
|
return doc, gold
|
||||||
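A quick illustration of the head re-indexing above when two sentences are flattened into one document; the toy annotations are invented.

    from collections import defaultdict

    # Sentence-local head indices are shifted by the number of words already
    # collected, so the second sentence's heads [1, 1] become [3, 3].
    sent_annots = [
        {"words": ["I", "slept"], "heads": [1, 1]},
        {"words": ["You", "ran"], "heads": [1, 1]},
    ]
    flat = defaultdict(list)
    for sent in sent_annots:
        flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
        flat["words"].extend(sent["words"])
    print(flat["heads"])  # [1, 1, 3, 3]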
|
|
||||||
|
|
||||||
#############################
|
#############################
|
||||||
# Data transforms for spaCy #
|
# Data transforms for spaCy #
|
||||||
#############################
|
#############################
|
||||||
|
|
||||||
|
|
||||||
def golds_to_gold_tuples(docs, golds):
|
def golds_to_gold_tuples(docs, golds):
|
||||||
'''Get out the annoying 'tuples' format used by begin_training, given the
|
"""Get out the annoying 'tuples' format used by begin_training, given the
|
||||||
GoldParse objects.'''
|
GoldParse objects."""
|
||||||
tuples = []
|
tuples = []
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
text = doc.text
|
text = doc.text
|
||||||
|
@ -174,8 +187,9 @@ def golds_to_gold_tuples(docs, golds):
|
||||||
# Evaluation #
|
# Evaluation #
|
||||||
##############
|
##############
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
if text_loc.parts[-1].endswith('.conllu'):
|
if text_loc.parts[-1].endswith(".conllu"):
|
||||||
docs = []
|
docs = []
|
||||||
with text_loc.open() as file_:
|
with text_loc.open() as file_:
|
||||||
for conllu_doc in read_conllu(file_):
|
for conllu_doc in read_conllu(file_):
|
||||||
|
@ -185,14 +199,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
docs = list(component.pipe(docs))
|
docs = list(component.pipe(docs))
|
||||||
else:
|
else:
|
||||||
with text_loc.open('r', encoding='utf8') as text_file:
|
with text_loc.open("r", encoding="utf8") as text_file:
|
||||||
texts = split_text(text_file.read())
|
texts = split_text(text_file.read())
|
||||||
docs = list(nlp.pipe(texts))
|
docs = list(nlp.pipe(texts))
|
||||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||||
write_conllu(docs, out_file)
|
write_conllu(docs, out_file)
|
||||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||||
return docs, scores
|
return docs, scores
|
||||||
|
@ -200,10 +214,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
|
|
||||||
def write_conllu(docs, file_):
|
def write_conllu(docs, file_):
|
||||||
merger = Matcher(docs[0].vocab)
|
merger = Matcher(docs[0].vocab)
|
||||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start:end+1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||||
for start_char, end_char in offsets:
|
for start_char, end_char in offsets:
|
||||||
doc.merge(start_char, end_char)
|
doc.merge(start_char, end_char)
|
||||||
|
@ -213,65 +227,82 @@ def write_conllu(docs, file_):
|
||||||
file_.write("# text = {text}\n".format(text=sent.text))
|
file_.write("# text = {text}\n".format(text=sent.text))
|
||||||
for k, token in enumerate(sent):
|
for k, token in enumerate(sent):
|
||||||
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
|
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
|
||||||
for word in doc[sent[0].i-10 : sent[0].i]:
|
for word in doc[sent[0].i - 10 : sent[0].i]:
|
||||||
print(word.i, word.head.i, word.text, word.dep_)
|
print(word.i, word.head.i, word.text, word.dep_)
|
||||||
for word in sent:
|
for word in sent:
|
||||||
print(word.i, word.head.i, word.text, word.dep_)
|
print(word.i, word.head.i, word.text, word.dep_)
|
||||||
for word in doc[sent[-1].i : sent[-1].i+10]:
|
for word in doc[sent[-1].i : sent[-1].i + 10]:
|
||||||
print(word.i, word.head.i, word.text, word.dep_)
|
print(word.i, word.head.i, word.text, word.dep_)
|
||||||
raise ValueError("Invalid parse: head outside sentence (%s)" % token.text)
|
raise ValueError(
|
||||||
file_.write(token._.get_conllu_lines(k) + '\n')
|
"Invalid parse: head outside sentence (%s)" % token.text
|
||||||
file_.write('\n')
|
)
|
||||||
|
file_.write(token._.get_conllu_lines(k) + "\n")
|
||||||
|
file_.write("\n")
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, ud_scores):
|
def print_progress(itn, losses, ud_scores):
|
||||||
fields = {
|
fields = {
|
||||||
'dep_loss': losses.get('parser', 0.0),
|
"dep_loss": losses.get("parser", 0.0),
|
||||||
'tag_loss': losses.get('tagger', 0.0),
|
"tag_loss": losses.get("tagger", 0.0),
|
||||||
'words': ud_scores['Words'].f1 * 100,
|
"words": ud_scores["Words"].f1 * 100,
|
||||||
'sents': ud_scores['Sentences'].f1 * 100,
|
"sents": ud_scores["Sentences"].f1 * 100,
|
||||||
'tags': ud_scores['XPOS'].f1 * 100,
|
"tags": ud_scores["XPOS"].f1 * 100,
|
||||||
'uas': ud_scores['UAS'].f1 * 100,
|
"uas": ud_scores["UAS"].f1 * 100,
|
||||||
'las': ud_scores['LAS'].f1 * 100,
|
"las": ud_scores["LAS"].f1 * 100,
|
||||||
}
|
}
|
||||||
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
|
||||||
if itn == 0:
|
if itn == 0:
|
||||||
print('\t'.join(header))
|
print("\t".join(header))
|
||||||
tpl = '\t'.join((
|
tpl = "\t".join(
|
||||||
'{:d}',
|
(
|
||||||
'{dep_loss:.1f}',
|
"{:d}",
|
||||||
'{las:.1f}',
|
"{dep_loss:.1f}",
|
||||||
'{uas:.1f}',
|
"{las:.1f}",
|
||||||
'{tags:.1f}',
|
"{uas:.1f}",
|
||||||
'{sents:.1f}',
|
"{tags:.1f}",
|
||||||
'{words:.1f}',
|
"{sents:.1f}",
|
||||||
))
|
"{words:.1f}",
|
||||||
|
)
|
||||||
|
)
|
||||||
print(tpl.format(itn, **fields))
|
print(tpl.format(itn, **fields))
|
||||||
|
|
||||||
#def get_sent_conllu(sent, sent_id):
|
|
||||||
|
# def get_sent_conllu(sent, sent_id):
|
||||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
||||||
|
|
||||||
|
|
||||||
def get_token_conllu(token, i):
|
def get_token_conllu(token, i):
|
||||||
if token._.begins_fused:
|
if token._.begins_fused:
|
||||||
n = 1
|
n = 1
|
||||||
while token.nbor(n)._.inside_fused:
|
while token.nbor(n)._.inside_fused:
|
||||||
n += 1
|
n += 1
|
||||||
id_ = '%d-%d' % (i, i+n)
|
id_ = "%d-%d" % (i, i + n)
|
||||||
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
|
lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
|
||||||
else:
|
else:
|
||||||
lines = []
|
lines = []
|
||||||
if token.head.i == token.i:
|
if token.head.i == token.i:
|
||||||
head = 0
|
head = 0
|
||||||
else:
|
else:
|
||||||
head = i + (token.head.i - token.i) + 1
|
head = i + (token.head.i - token.i) + 1
|
||||||
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
fields = [
|
||||||
str(head), token.dep_.lower(), '_', '_']
|
str(i + 1),
|
||||||
lines.append('\t'.join(fields))
|
token.text,
|
||||||
return '\n'.join(lines)
|
token.lemma_,
|
||||||
|
token.pos_,
|
||||||
|
token.tag_,
|
||||||
|
"_",
|
||||||
|
str(head),
|
||||||
|
token.dep_.lower(),
|
||||||
|
"_",
|
||||||
|
"_",
|
||||||
|
]
|
||||||
|
lines.append("\t".join(fields))
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
Token.set_extension('get_conllu_lines', method=get_token_conllu)
|
|
||||||
Token.set_extension('begins_fused', default=False)
|
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||||
Token.set_extension('inside_fused', default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
|
Token.set_extension("inside_fused", default=False)
|
||||||
|
|
||||||
|
|
||||||
##################
|
##################
|
||||||
|
@ -280,35 +311,40 @@ Token.set_extension('inside_fused', default=False)
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(corpus, config, vectors=None):
|
def load_nlp(corpus, config, vectors=None):
|
||||||
lang = corpus.split('_')[0]
|
lang = corpus.split("_")[0]
|
||||||
nlp = spacy.blank(lang)
|
nlp = spacy.blank(lang)
|
||||||
if config.vectors:
|
if config.vectors:
|
||||||
if not vectors:
|
if not vectors:
|
||||||
raise ValueError("config asks for vectors, but no vectors "
|
raise ValueError(
|
||||||
"directory set on command line (use -v)")
|
"config asks for vectors, but no vectors "
|
||||||
|
"directory set on command line (use -v)"
|
||||||
|
)
|
||||||
if (Path(vectors) / corpus).exists():
|
if (Path(vectors) / corpus).exists():
|
||||||
nlp.vocab.from_disk(Path(vectors) / corpus / 'vocab')
|
nlp.vocab.from_disk(Path(vectors) / corpus / "vocab")
|
||||||
nlp.meta['treebank'] = corpus
|
nlp.meta["treebank"] = corpus
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||||
nlp.add_pipe(nlp.create_pipe('tagger'))
|
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
if config.multitask_tag:
|
if config.multitask_tag:
|
||||||
nlp.parser.add_multitask_objective('tag')
|
nlp.parser.add_multitask_objective("tag")
|
||||||
if config.multitask_sent:
|
if config.multitask_sent:
|
||||||
nlp.parser.add_multitask_objective('sent_start')
|
nlp.parser.add_multitask_objective("sent_start")
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
for tag in gold.tags:
|
for tag in gold.tags:
|
||||||
if tag is not None:
|
if tag is not None:
|
||||||
nlp.tagger.add_label(tag)
|
nlp.tagger.add_label(tag)
|
||||||
if torch is not None and device != -1:
|
if torch is not None and device != -1:
|
||||||
torch.set_default_tensor_type('torch.cuda.FloatTensor')
|
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||||
optimizer = nlp.begin_training(
|
optimizer = nlp.begin_training(
|
||||||
lambda: golds_to_gold_tuples(docs, golds), device=device,
|
lambda: golds_to_gold_tuples(docs, golds),
|
||||||
subword_features=config.subword_features, conv_depth=config.conv_depth,
|
device=device,
|
||||||
bilstm_depth=config.bilstm_depth)
|
subword_features=config.subword_features,
|
||||||
|
conv_depth=config.conv_depth,
|
||||||
|
bilstm_depth=config.bilstm_depth,
|
||||||
|
)
|
||||||
if config.pretrained_tok2vec:
|
if config.pretrained_tok2vec:
|
||||||
_load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
|
_load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
|
||||||
return optimizer
|
return optimizer
|
||||||
|
@ -318,27 +354,41 @@ def _load_pretrained_tok2vec(nlp, loc):
|
||||||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||||
"""
|
"""
|
||||||
with Path(loc).open('rb') as file_:
|
with Path(loc).open("rb") as file_:
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
loaded = []
|
loaded = []
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||||
component.tok2vec.from_bytes(weights_data)
|
component.tok2vec.from_bytes(weights_data)
|
||||||
loaded.append(name)
|
loaded.append(name)
|
||||||
return loaded
|
return loaded
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
########################
|
########################
|
||||||
# Command line helpers #
|
# Command line helpers #
|
||||||
########################
|
########################
|
||||||
|
|
||||||
|
|
||||||
class Config(object):
|
class Config(object):
|
||||||
def __init__(self, vectors=None, max_doc_length=10, multitask_tag=False,
|
def __init__(
|
||||||
multitask_sent=False, multitask_dep=False, multitask_vectors=None,
|
self,
|
||||||
bilstm_depth=0, nr_epoch=30, min_batch_size=750, max_batch_size=750,
|
vectors=None,
|
||||||
batch_by_words=True, dropout=0.1, conv_depth=4, subword_features=True,
|
max_doc_length=10,
|
||||||
vectors_dir=None, pretrained_tok2vec=None):
|
multitask_tag=False,
|
||||||
|
multitask_sent=False,
|
||||||
|
multitask_dep=False,
|
||||||
|
multitask_vectors=None,
|
||||||
|
bilstm_depth=0,
|
||||||
|
nr_epoch=30,
|
||||||
|
min_batch_size=100,
|
||||||
|
max_batch_size=1000,
|
||||||
|
batch_by_words=True,
|
||||||
|
dropout=0.2,
|
||||||
|
conv_depth=4,
|
||||||
|
subword_features=True,
|
||||||
|
vectors_dir=None,
|
||||||
|
pretrained_tok2vec=None,
|
||||||
|
):
|
||||||
if vectors_dir is not None:
|
if vectors_dir is not None:
|
||||||
if vectors is None:
|
if vectors is None:
|
||||||
vectors = True
|
vectors = True
|
||||||
|
@ -346,13 +396,13 @@ class Config(object):
|
||||||
multitask_vectors = True
|
multitask_vectors = True
|
||||||
for key, value in locals().items():
|
for key, value in locals().items():
|
||||||
setattr(self, key, value)
|
setattr(self, key, value)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, loc, vectors_dir=None):
|
def load(cls, loc, vectors_dir=None):
|
||||||
with Path(loc).open('r', encoding='utf8') as file_:
|
with Path(loc).open("r", encoding="utf8") as file_:
|
||||||
cfg = json.load(file_)
|
cfg = json.load(file_)
|
||||||
if vectors_dir is not None:
|
if vectors_dir is not None:
|
||||||
cfg['vectors_dir'] = vectors_dir
|
cfg["vectors_dir"] = vectors_dir
|
||||||
return cls(**cfg)
|
return cls(**cfg)
|
||||||
|
|
||||||
|
|
||||||
|
@ -364,43 +414,59 @@ class Dataset(object):
|
||||||
self.text = None
|
self.text = None
|
||||||
for file_path in self.path.iterdir():
|
for file_path in self.path.iterdir():
|
||||||
name = file_path.parts[-1]
|
name = file_path.parts[-1]
|
||||||
if section in name and name.endswith('conllu'):
|
if section in name and name.endswith("conllu"):
|
||||||
self.conllu = file_path
|
self.conllu = file_path
|
||||||
elif section in name and name.endswith('txt'):
|
elif section in name and name.endswith("txt"):
|
||||||
self.text = file_path
|
self.text = file_path
|
||||||
if self.conllu is None:
|
if self.conllu is None:
|
||||||
msg = "Could not find .txt file in {path} for {section}"
|
msg = "Could not find .txt file in {path} for {section}"
|
||||||
raise IOError(msg.format(section=section, path=path))
|
raise IOError(msg.format(section=section, path=path))
|
||||||
if self.text is None:
|
if self.text is None:
|
||||||
msg = "Could not find .txt file in {path} for {section}"
|
msg = "Could not find .txt file in {path} for {section}"
|
||||||
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
|
self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
|
||||||
|
|
||||||
|
|
||||||
class TreebankPaths(object):
|
class TreebankPaths(object):
|
||||||
def __init__(self, ud_path, treebank, **cfg):
|
def __init__(self, ud_path, treebank, **cfg):
|
||||||
self.train = Dataset(ud_path / treebank, 'train')
|
self.train = Dataset(ud_path / treebank, "train")
|
||||||
self.dev = Dataset(ud_path / treebank, 'dev')
|
self.dev = Dataset(ud_path / treebank, "dev")
|
||||||
self.lang = self.train.lang
|
self.lang = self.train.lang
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
||||||
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
corpus=(
|
||||||
"positional", None, str),
|
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
str,
|
||||||
|
),
|
||||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||||
config=("Path to json formatted config file", "option", "C", Path),
|
config=("Path to json formatted config file", "option", "C", Path),
|
||||||
limit=("Size limit", "option", "n", int),
|
limit=("Size limit", "option", "n", int),
|
||||||
gpu_device=("Use GPU", "option", "g", int),
|
gpu_device=("Use GPU", "option", "g", int),
|
||||||
use_oracle_segments=("Use oracle segments", "flag", "G", int),
|
use_oracle_segments=("Use oracle segments", "flag", "G", int),
|
||||||
vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/",
|
vectors_dir=(
|
||||||
"option", "v", Path),
|
"Path to directory with pre-trained vectors, named e.g. en/",
|
||||||
|
"option",
|
||||||
|
"v",
|
||||||
|
Path,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vectors_dir=None,
|
def main(
|
||||||
use_oracle_segments=False):
|
ud_dir,
|
||||||
|
parses_dir,
|
||||||
|
corpus,
|
||||||
|
config=None,
|
||||||
|
limit=0,
|
||||||
|
gpu_device=-1,
|
||||||
|
vectors_dir=None,
|
||||||
|
use_oracle_segments=False,
|
||||||
|
):
|
||||||
spacy.util.fix_random_seed()
|
spacy.util.fix_random_seed()
|
||||||
lang.zh.Chinese.Defaults.use_jieba = False
|
lang.zh.Chinese.Defaults.use_jieba = False
|
||||||
lang.ja.Japanese.Defaults.use_janome = False
|
lang.ja.Japanese.Defaults.use_janome = False
|
||||||
|
|
||||||
if config is not None:
|
if config is not None:
|
||||||
config = Config.load(config, vectors_dir=vectors_dir)
|
config = Config.load(config, vectors_dir=vectors_dir)
|
||||||
else:
|
else:
|
||||||
|
@ -411,19 +477,28 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
||||||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||||
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
||||||
|
|
||||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
docs, golds = read_data(
|
||||||
max_doc_length=config.max_doc_length,
|
nlp,
|
||||||
limit=limit)
|
paths.train.conllu.open(),
|
||||||
|
paths.train.text.open(),
|
||||||
|
max_doc_length=config.max_doc_length,
|
||||||
|
limit=limit,
|
||||||
|
)
|
||||||
|
|
||||||
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
||||||
|
|
||||||
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
||||||
beam_prob = compounding(0.2, 0.8, 1.001)
|
beam_prob = compounding(0.2, 0.8, 1.001)
|
||||||
for i in range(config.nr_epoch):
|
for i in range(config.nr_epoch):
|
||||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
docs, golds = read_data(
|
||||||
max_doc_length=config.max_doc_length, limit=limit,
|
nlp,
|
||||||
oracle_segments=use_oracle_segments,
|
paths.train.conllu.open(),
|
||||||
raw_text=not use_oracle_segments)
|
paths.train.text.open(),
|
||||||
|
max_doc_length=config.max_doc_length,
|
||||||
|
limit=limit,
|
||||||
|
oracle_segments=use_oracle_segments,
|
||||||
|
raw_text=not use_oracle_segments,
|
||||||
|
)
|
||||||
Xs = list(zip(docs, golds))
|
Xs = list(zip(docs, golds))
|
||||||
random.shuffle(Xs)
|
random.shuffle(Xs)
|
||||||
if config.batch_by_words:
|
if config.batch_by_words:
|
||||||
|
@ -436,27 +511,34 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
batch_docs, batch_gold = zip(*batch)
|
batch_docs, batch_gold = zip(*batch)
|
||||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||||
nlp.parser.cfg['beam_update_prob'] = next(beam_prob)
|
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
||||||
nlp.update(batch_docs, batch_gold, sgd=optimizer,
|
nlp.update(
|
||||||
drop=config.dropout, losses=losses)
|
batch_docs,
|
||||||
|
batch_gold,
|
||||||
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
|
sgd=optimizer,
|
||||||
|
drop=config.dropout,
|
||||||
|
losses=losses,
|
||||||
|
)
|
||||||
|
|
||||||
|
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
if use_oracle_segments:
|
if use_oracle_segments:
|
||||||
parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
|
parsed_docs, scores = evaluate(
|
||||||
paths.dev.conllu, out_path)
|
nlp, paths.dev.conllu, paths.dev.conllu, out_path
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
parsed_docs, scores = evaluate(nlp, paths.dev.text,
|
parsed_docs, scores = evaluate(
|
||||||
paths.dev.conllu, out_path)
|
nlp, paths.dev.text, paths.dev.conllu, out_path
|
||||||
|
)
|
||||||
print_progress(i, losses, scores)
|
print_progress(i, losses, scores)
|
||||||
|
|
||||||
|
|
||||||
def _render_parses(i, to_render):
|
def _render_parses(i, to_render):
|
||||||
to_render[0].user_data['title'] = "Batch %d" % i
|
to_render[0].user_data["title"] = "Batch %d" % i
|
||||||
with Path('/tmp/parses.html').open('w') as file_:
|
with Path("/tmp/parses.html").open("w") as file_:
|
||||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
html = displacy.render(to_render[:5], style="dep", page=True)
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
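A hedged sketch of calling the entry point programmatically instead of via plac on the command line; the treebank name and directory paths are assumptions for illustration only:

    from pathlib import Path

    main(
        ud_dir=Path("/data/ud-treebanks-v2.2"),   # assumed corpus location
        parses_dir=Path("/tmp/parses"),
        corpus="en_ewt",                          # assumed treebank name
        config=None,
        limit=1000,
        gpu_device=-1,
        use_oracle_segments=False,
    )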
|
|
@ -4,28 +4,34 @@ from __future__ import unicode_literals, print_function
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import ujson
|
|
||||||
import requests
|
import requests
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str, locale_escape
|
from ..compat import path2str
|
||||||
from ..util import prints, get_data_path, read_json
|
from ..util import get_data_path, read_json
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
def validate():
|
def validate():
|
||||||
"""Validate that the currently installed version of spaCy is compatible
|
"""
|
||||||
|
Validate that the currently installed version of spaCy is compatible
|
||||||
with the installed models. Should be run after `pip install -U spacy`.
|
with the installed models. Should be run after `pip install -U spacy`.
|
||||||
"""
|
"""
|
||||||
r = requests.get(about.__compatibility__)
|
msg = Printer()
|
||||||
if r.status_code != 200:
|
with msg.loading("Loading compatibility table..."):
|
||||||
prints(Messages.M021, title=Messages.M003.format(code=r.status_code),
|
r = requests.get(about.__compatibility__)
|
||||||
exits=1)
|
if r.status_code != 200:
|
||||||
compat = r.json()['spacy']
|
msg.fail(Messages.M003.format(code=r.status_code), Messages.M021, exits=1)
|
||||||
|
msg.good("Loaded compatibility table")
|
||||||
|
compat = r.json()["spacy"]
|
||||||
current_compat = compat.get(about.__version__)
|
current_compat = compat.get(about.__version__)
|
||||||
if not current_compat:
|
if not current_compat:
|
||||||
prints(about.__compatibility__, exits=1,
|
msg.fail(
|
||||||
title=Messages.M022.format(version=about.__version__))
|
Messages.M022.format(version=about.__version__),
|
||||||
|
about.__compatibility__,
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
all_models = set()
|
all_models = set()
|
||||||
for spacy_v, models in dict(compat).items():
|
for spacy_v, models in dict(compat).items():
|
||||||
all_models.update(models.keys())
|
all_models.update(models.keys())
|
||||||
|
@ -33,33 +39,38 @@ def validate():
|
||||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||||
model_links = get_model_links(current_compat)
|
model_links = get_model_links(current_compat)
|
||||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
|
||||||
incompat_models = {d['name'] for _, d in model_pkgs.items()
|
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||||
if not d['compat']}
|
incompat_models.update(
|
||||||
incompat_models.update([d['name'] for _, d in model_links.items()
|
[d["name"] for _, d in model_links.items() if not d["compat"]]
|
||||||
if not d['compat']])
|
)
|
||||||
na_models = [m for m in incompat_models if m not in current_compat]
|
na_models = [m for m in incompat_models if m not in current_compat]
|
||||||
update_models = [m for m in incompat_models if m in current_compat]
|
update_models = [m for m in incompat_models if m in current_compat]
|
||||||
|
spacy_dir = Path(__file__).parent.parent
|
||||||
|
|
||||||
|
msg.divider(Messages.M023.format(version=about.__version__))
|
||||||
|
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
|
||||||
|
|
||||||
prints(path2str(Path(__file__).parent.parent),
|
|
||||||
title=Messages.M023.format(version=about.__version__))
|
|
||||||
if model_links or model_pkgs:
|
if model_links or model_pkgs:
|
||||||
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
|
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
|
||||||
|
rows = []
|
||||||
for name, data in model_pkgs.items():
|
for name, data in model_pkgs.items():
|
||||||
print(get_model_row(current_compat, name, data, 'package'))
|
rows.append(get_model_row(current_compat, name, data, msg))
|
||||||
for name, data in model_links.items():
|
for name, data in model_links.items():
|
||||||
print(get_model_row(current_compat, name, data, 'link'))
|
rows.append(get_model_row(current_compat, name, data, msg, "link"))
|
||||||
|
msg.table(rows, header=header)
|
||||||
else:
|
else:
|
||||||
prints(Messages.M024, exits=0)
|
msg.text(Messages.M024, exits=0)
|
||||||
if update_models:
|
if update_models:
|
||||||
cmd = ' python -m spacy download {}'
|
msg.divider("Install updates")
|
||||||
print("\n " + Messages.M025)
|
cmd = "python -m spacy download {}"
|
||||||
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
|
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||||
if na_models:
|
if na_models:
|
||||||
prints(Messages.M025.format(version=about.__version__,
|
msg.text(
|
||||||
models=', '.join(na_models)))
|
Messages.M025.format(version=about.__version__, models=", ".join(na_models))
|
||||||
|
)
|
||||||
if incompat_links:
|
if incompat_links:
|
||||||
prints(Messages.M027.format(path=path2str(get_data_path())))
|
msg.text(Messages.M027.format(path=path2str(get_data_path())))
|
||||||
if incompat_models or incompat_links:
|
if incompat_models or incompat_links:
|
||||||
sys.exit(1)
|
sys.exit(1)
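The refactored command is invoked exactly as before, either as `python -m spacy validate` on the command line or from Python:

    from spacy.cli import validate

    # Prints the wasabi-formatted compatibility table and exits non-zero
    # if incompatible models or links are found.
    validate()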
|
||||||
|
|
||||||
|
@ -70,50 +81,48 @@ def get_model_links(compat):
|
||||||
if data_path:
|
if data_path:
|
||||||
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
||||||
for model in models:
|
for model in models:
|
||||||
meta_path = Path(model) / 'meta.json'
|
meta_path = Path(model) / "meta.json"
|
||||||
if not meta_path.exists():
|
if not meta_path.exists():
|
||||||
continue
|
continue
|
||||||
meta = read_json(meta_path)
|
meta = read_json(meta_path)
|
||||||
link = model.parts[-1]
|
link = model.parts[-1]
|
||||||
name = meta['lang'] + '_' + meta['name']
|
name = meta["lang"] + "_" + meta["name"]
|
||||||
links[link] = {'name': name, 'version': meta['version'],
|
links[link] = {
|
||||||
'compat': is_compat(compat, name, meta['version'])}
|
"name": name,
|
||||||
|
"version": meta["version"],
|
||||||
|
"compat": is_compat(compat, name, meta["version"]),
|
||||||
|
}
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
def get_model_pkgs(compat, all_models):
|
def get_model_pkgs(compat, all_models):
|
||||||
pkgs = {}
|
pkgs = {}
|
||||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||||
package = pkg_name.replace('-', '_')
|
package = pkg_name.replace("-", "_")
|
||||||
if package in all_models:
|
if package in all_models:
|
||||||
version = pkg_data.version
|
version = pkg_data.version
|
||||||
pkgs[pkg_name] = {'name': package, 'version': version,
|
pkgs[pkg_name] = {
|
||||||
'compat': is_compat(compat, package, version)}
|
"name": package,
|
||||||
|
"version": version,
|
||||||
|
"compat": is_compat(compat, package, version),
|
||||||
|
}
|
||||||
return pkgs
|
return pkgs
|
||||||
|
|
||||||
|
|
||||||
def get_model_row(compat, name, data, type='package'):
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
if data['compat']:
comp = tpl_green.format(locale_escape('✔', errors='ignore'))
version = tpl_green.format(data['version'])
else:
comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0])
version = tpl_red.format(data['version'])
return get_row(type, name, data['name'], version, comp)
|
def get_model_row(compat, name, data, msg, model_type="package"):
if data["compat"]:
comp = msg.text("", color="green", icon="good", no_print=True)
version = msg.text(data["version"], color="green", no_print=True)
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
return (model_type, name, data["name"], version, comp)
|
||||||
|
|
||||||
|
|
||||||
def get_row(*args):
|
|
||||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
|
||||||
return tpl_row.format(*args)
|
|
||||||
|
|
||||||
|
|
||||||
def is_model_path(model_path):
|
def is_model_path(model_path):
|
||||||
exclude = ['cache', 'pycache', '__pycache__']
|
exclude = ["cache", "pycache", "__pycache__"]
|
||||||
name = model_path.parts[-1]
|
name = model_path.parts[-1]
|
||||||
return (model_path.is_dir() and name not in exclude
|
return model_path.is_dir() and name not in exclude and not name.startswith(".")
|
||||||
and not name.startswith('.'))
|
|
||||||
|
|
||||||
|
|
||||||
def is_compat(compat, name, version):
|
def is_compat(compat, name, version):
|
||||||
|
@ -122,6 +131,6 @@ def is_compat(compat, name, version):
|
||||||
|
|
||||||
def reformat_version(version):
|
def reformat_version(version):
|
||||||
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||||
if version.endswith('-alpha'):
|
if version.endswith("-alpha"):
|
||||||
return version.replace('-alpha', 'a0')
|
return version.replace("-alpha", "a0")
|
||||||
return version.replace('-alpha', 'a')
|
return version.replace("-alpha", "a")
|
||||||
|
|
|
@ -1,59 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import json
|
|
||||||
import spacy
|
|
||||||
import numpy
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from ..vectors import Vectors
|
|
||||||
from ..util import prints, ensure_path
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
lang=("model language", "positional", None, str),
|
|
||||||
output_dir=("model output directory", "positional", None, Path),
|
|
||||||
lexemes_loc=("location of JSONL-formatted lexical data", "positional",
|
|
||||||
None, Path),
|
|
||||||
vectors_loc=("optional: location of vectors data, as numpy .npz",
|
|
||||||
"positional", None, str),
|
|
||||||
prune_vectors=("optional: number of vectors to prune to.",
|
|
||||||
"option", "V", int)
|
|
||||||
)
|
|
||||||
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, prune_vectors=-1):
|
|
||||||
"""Compile a vocabulary from a lexicon jsonl file and word vectors."""
|
|
||||||
if not lexemes_loc.exists():
|
|
||||||
prints(lexemes_loc, title="Can't find lexical data", exits=1)
|
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
|
||||||
nlp = spacy.blank(lang)
|
|
||||||
for word in nlp.vocab:
|
|
||||||
word.rank = 0
|
|
||||||
lex_added = 0
|
|
||||||
with lexemes_loc.open() as file_:
|
|
||||||
for line in file_:
|
|
||||||
if line.strip():
|
|
||||||
attrs = json.loads(line)
|
|
||||||
if 'settings' in attrs:
|
|
||||||
nlp.vocab.cfg.update(attrs['settings'])
|
|
||||||
else:
|
|
||||||
lex = nlp.vocab[attrs['orth']]
|
|
||||||
lex.set_attrs(**attrs)
|
|
||||||
assert lex.rank == attrs['id']
|
|
||||||
lex_added += 1
|
|
||||||
if vectors_loc is not None:
|
|
||||||
vector_data = numpy.load(vectors_loc.open('rb'))
|
|
||||||
nlp.vocab.vectors = Vectors(data=vector_data)
|
|
||||||
for word in nlp.vocab:
|
|
||||||
if word.rank:
|
|
||||||
nlp.vocab.vectors.add(word.orth, row=word.rank)
|
|
||||||
|
|
||||||
if prune_vectors >= 1:
|
|
||||||
remap = nlp.vocab.prune_vectors(prune_vectors)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
nlp.to_disk(output_dir)
|
|
||||||
vec_added = len(nlp.vocab.vectors)
|
|
||||||
prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
|
|
||||||
title="Sucessfully compiled vocab and vectors, and saved model")
|
|
||||||
return nlp
|
|
|
@ -1,11 +1,10 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
import ujson
|
import ujson
|
||||||
import itertools
|
import itertools
|
||||||
import locale
|
|
||||||
import os
|
|
||||||
|
|
||||||
from thinc.neural.util import copy_array
|
from thinc.neural.util import copy_array
|
||||||
|
|
||||||
|
@ -30,9 +29,9 @@ except ImportError:
|
||||||
cupy = None
|
cupy = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from thinc.neural.optimizers import Optimizer
|
from thinc.neural.optimizers import Optimizer # noqa: F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from thinc.neural.optimizers import Adam as Optimizer
|
from thinc.neural.optimizers import Adam as Optimizer # noqa: F401
|
||||||
|
|
||||||
pickle = pickle
|
pickle = pickle
|
||||||
copy_reg = copy_reg
|
copy_reg = copy_reg
|
||||||
|
@ -136,12 +135,3 @@ def import_file(name, loc):
|
||||||
module = importlib.util.module_from_spec(spec)
|
module = importlib.util.module_from_spec(spec)
|
||||||
spec.loader.exec_module(module)
|
spec.loader.exec_module(module)
|
||||||
return module
|
return module
|
||||||
|
|
||||||
|
|
||||||
def locale_escape(string, errors="replace"):
|
|
||||||
"""
|
|
||||||
Mangle non-supported characters, for savages with ascii terminals.
|
|
||||||
"""
|
|
||||||
encoding = locale.getpreferredencoding()
|
|
||||||
string = string.encode(encoding, errors).decode("utf8")
|
|
||||||
return string
|
|
||||||
|
|
|
@ -5,15 +5,22 @@ from .render import DependencyRenderer, EntityRenderer
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..compat import b_to_str
|
from ..compat import b_to_str
|
||||||
from ..errors import Errors, Warnings, user_warning
|
from ..errors import Errors, Warnings, user_warning
|
||||||
from ..util import prints, is_in_jupyter
|
from ..util import is_in_jupyter
|
||||||
|
|
||||||
|
|
||||||
_html = {}
|
_html = {}
|
||||||
IS_JUPYTER = is_in_jupyter()
|
IS_JUPYTER = is_in_jupyter()
|
||||||
|
|
||||||
|
|
||||||
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
def render(
|
||||||
options={}, manual=False):
|
docs,
|
||||||
|
style="dep",
|
||||||
|
page=False,
|
||||||
|
minify=False,
|
||||||
|
jupyter=IS_JUPYTER,
|
||||||
|
options={},
|
||||||
|
manual=False,
|
||||||
|
):
|
||||||
"""Render displaCy visualisation.
|
"""Render displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
|
@ -25,8 +32,10 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
factories = {'dep': (DependencyRenderer, parse_deps),
|
factories = {
|
||||||
'ent': (EntityRenderer, parse_ents)}
|
"dep": (DependencyRenderer, parse_deps),
|
||||||
|
"ent": (EntityRenderer, parse_ents),
|
||||||
|
}
|
||||||
if style not in factories:
|
if style not in factories:
|
||||||
raise ValueError(Errors.E087.format(style=style))
|
raise ValueError(Errors.E087.format(style=style))
|
||||||
if isinstance(docs, (Doc, Span, dict)):
|
if isinstance(docs, (Doc, Span, dict)):
|
||||||
|
@ -37,16 +46,18 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
renderer, converter = factories[style]
|
renderer, converter = factories[style]
|
||||||
renderer = renderer(options=options)
|
renderer = renderer(options=options)
|
||||||
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
||||||
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()
|
||||||
html = _html['parsed']
|
html = _html["parsed"]
|
||||||
if jupyter: # return HTML rendered by IPython display()
|
if jupyter: # return HTML rendered by IPython display()
|
||||||
from IPython.core.display import display, HTML
|
from IPython.core.display import display, HTML
|
||||||
|
|
||||||
return display(HTML(html))
|
return display(HTML(html))
|
||||||
return html
|
return html
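Typical use of the public API whose signature is being reformatted above; the model name is an assumption and any installed pipeline with a parser would do:

    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")  # assumed installed model
    doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
    html = displacy.render(doc, style="dep", page=True, jupyter=False)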
|
||||||
|
|
||||||
|
|
||||||
def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
def serve(
|
||||||
port=5000):
|
docs, style="dep", page=True, minify=False, options={}, manual=False, port=5000
|
||||||
|
):
|
||||||
"""Serve displaCy visualisation.
|
"""Serve displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
|
@ -58,25 +69,24 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
||||||
port (int): Port to serve visualisation.
|
port (int): Port to serve visualisation.
|
||||||
"""
|
"""
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
render(docs, style=style, page=page, minify=minify, options=options,
|
|
||||||
manual=manual)
|
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
httpd = simple_server.make_server("0.0.0.0", port, app)
|
||||||
prints("Using the '{}' visualizer".format(style),
|
print("\nUsing the '{}' visualizer".format(style))
|
||||||
title="Serving on port {}...".format(port))
|
print("Serving on port {}...\n".format(port))
|
||||||
try:
|
try:
|
||||||
httpd.serve_forever()
|
httpd.serve_forever()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
prints("Shutting down server on port {}.".format(port))
|
print("Shutting down server on port {}.".format(port))
|
||||||
finally:
|
finally:
|
||||||
httpd.server_close()
|
httpd.server_close()
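And the corresponding blocking server call, which now reports its status with plain print() instead of util.prints():

    from spacy import displacy

    # Serves on 0.0.0.0:8080 until interrupted; `doc` as in the render example above.
    displacy.serve(doc, style="ent", port=8080)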
|
||||||
|
|
||||||
|
|
||||||
def app(environ, start_response):
|
def app(environ, start_response):
|
||||||
# headers and status need to be bytes in Python 2, see #1227
|
# Headers and status need to be bytes in Python 2, see #1227
|
||||||
headers = [(b_to_str(b'Content-type'),
|
headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
|
||||||
b_to_str(b'text/html; charset=utf-8'))]
|
start_response(b_to_str(b"200 OK"), headers)
|
||||||
start_response(b_to_str(b'200 OK'), headers)
|
res = _html["parsed"].encode(encoding="utf-8")
|
||||||
res = _html['parsed'].encode(encoding='utf-8')
|
|
||||||
return [res]
|
return [res]
|
||||||
|
|
||||||
|
|
||||||
|
@ -89,11 +99,10 @@ def parse_deps(orig_doc, options={}):
|
||||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
user_warning(Warnings.W005)
|
user_warning(Warnings.W005)
|
||||||
if options.get('collapse_phrases', False):
|
if options.get("collapse_phrases", False):
|
||||||
for np in list(doc.noun_chunks):
|
for np in list(doc.noun_chunks):
|
||||||
np.merge(tag=np.root.tag_, lemma=np.root.lemma_,
|
np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_)
|
||||||
ent_type=np.root.ent_type_)
|
if options.get("collapse_punct", True):
|
||||||
if options.get('collapse_punct', True):
|
|
||||||
spans = []
|
spans = []
|
||||||
for word in doc[:-1]:
|
for word in doc[:-1]:
|
||||||
if word.is_punct or not word.nbor(1).is_punct:
|
if word.is_punct or not word.nbor(1).is_punct:
|
||||||
|
@ -103,23 +112,31 @@ def parse_deps(orig_doc, options={}):
|
||||||
while end < len(doc) and doc[end].is_punct:
|
while end < len(doc) and doc[end].is_punct:
|
||||||
end += 1
|
end += 1
|
||||||
span = doc[start:end]
|
span = doc[start:end]
|
||||||
spans.append((span.start_char, span.end_char, word.tag_,
|
spans.append(
|
||||||
word.lemma_, word.ent_type_))
|
(span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_)
|
||||||
|
)
|
||||||
for start, end, tag, lemma, ent_type in spans:
|
for start, end, tag, lemma, ent_type in spans:
|
||||||
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
||||||
if options.get('fine_grained'):
|
if options.get("fine_grained"):
|
||||||
words = [{'text': w.text, 'tag': w.tag_} for w in doc]
|
words = [{"text": w.text, "tag": w.tag_} for w in doc]
|
||||||
else:
|
else:
|
||||||
words = [{'text': w.text, 'tag': w.pos_} for w in doc]
|
words = [{"text": w.text, "tag": w.pos_} for w in doc]
|
||||||
arcs = []
|
arcs = []
|
||||||
for word in doc:
|
for word in doc:
|
||||||
if word.i < word.head.i:
|
if word.i < word.head.i:
|
||||||
arcs.append({'start': word.i, 'end': word.head.i,
|
arcs.append(
|
||||||
'label': word.dep_, 'dir': 'left'})
|
{"start": word.i, "end": word.head.i, "label": word.dep_, "dir": "left"}
|
||||||
|
)
|
||||||
elif word.i > word.head.i:
|
elif word.i > word.head.i:
|
||||||
arcs.append({'start': word.head.i, 'end': word.i,
|
arcs.append(
|
||||||
'label': word.dep_, 'dir': 'right'})
|
{
|
||||||
return {'words': words, 'arcs': arcs}
|
"start": word.head.i,
|
||||||
|
"end": word.i,
|
||||||
|
"label": word.dep_,
|
||||||
|
"dir": "right",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return {"words": words, "arcs": arcs}
|
||||||
|
|
||||||
|
|
||||||
def parse_ents(doc, options={}):
|
def parse_ents(doc, options={}):
|
||||||
|
@ -128,10 +145,11 @@ def parse_ents(doc, options={}):
|
||||||
doc (Doc): Document to parse.
|
doc (Doc): Document to parse.
|
||||||
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
||||||
"""
|
"""
|
||||||
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
|
ents = [
|
||||||
for ent in doc.ents]
|
{"start": ent.start_char, "end": ent.end_char, "label": ent.label_}
|
||||||
|
for ent in doc.ents
|
||||||
|
]
|
||||||
if not ents:
|
if not ents:
|
||||||
user_warning(Warnings.W006)
|
user_warning(Warnings.W006)
|
||||||
title = (doc.user_data.get('title', None)
if hasattr(doc, 'user_data') else None)
|
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
||||||
return {"text": doc.text, "ents": ents, "title": title}
|
||||||
return {'text': doc.text, 'ents': ents, 'title': title}
|
|
||||||
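The dicts returned by parse_deps() and parse_ents() are exactly the format displacy.render() accepts with manual=True; a small hedged example (the character offsets are specific to this sentence):

    ent_input = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
        "title": None,
    }
    html = displacy.render(ent_input, style="ent", manual=True, jupyter=False)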
|
|
|
@ -1,6 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import random
|
||||||
|
|
||||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS
|
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS
|
||||||
from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||||
from ..util import minify_html, escape_html
|
from ..util import minify_html, escape_html
|
||||||
|
@ -8,7 +10,8 @@ from ..util import minify_html, escape_html
|
||||||
|
|
||||||
class DependencyRenderer(object):
|
class DependencyRenderer(object):
|
||||||
"""Render dependency parses as SVGs."""
|
"""Render dependency parses as SVGs."""
|
||||||
style = 'dep'
|
|
||||||
|
style = "dep"
|
||||||
|
|
||||||
def __init__(self, options={}):
|
def __init__(self, options={}):
|
||||||
"""Initialise dependency renderer.
|
"""Initialise dependency renderer.
|
||||||
|
@ -17,18 +20,16 @@ class DependencyRenderer(object):
|
||||||
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
|
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
|
||||||
color, bg, font)
|
color, bg, font)
|
||||||
"""
|
"""
|
||||||
self.compact = options.get('compact', False)
|
self.compact = options.get("compact", False)
|
||||||
self.word_spacing = options.get('word_spacing', 45)
|
self.word_spacing = options.get("word_spacing", 45)
|
||||||
self.arrow_spacing = options.get('arrow_spacing',
|
self.arrow_spacing = options.get("arrow_spacing", 12 if self.compact else 20)
|
||||||
12 if self.compact else 20)
|
self.arrow_width = options.get("arrow_width", 6 if self.compact else 10)
|
||||||
self.arrow_width = options.get('arrow_width',
|
self.arrow_stroke = options.get("arrow_stroke", 2)
|
||||||
6 if self.compact else 10)
|
self.distance = options.get("distance", 150 if self.compact else 175)
|
||||||
self.arrow_stroke = options.get('arrow_stroke', 2)
|
self.offset_x = options.get("offset_x", 50)
|
||||||
self.distance = options.get('distance', 150 if self.compact else 175)
|
self.color = options.get("color", "#000000")
|
||||||
self.offset_x = options.get('offset_x', 50)
|
self.bg = options.get("bg", "#ffffff")
|
||||||
self.color = options.get('color', '#000000')
|
self.font = options.get("font", "Arial")
|
||||||
self.bg = options.get('bg', '#ffffff')
|
|
||||||
self.font = options.get('font', 'Arial')
|
|
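The options read above are the same dict users pass through displacy.render(); the colour and font values here are arbitrary examples, and `doc` is any parsed document:

    options = {"compact": True, "color": "#09a3d5", "bg": "#ffffff", "font": "Arial"}
    html = displacy.render(doc, style="dep", options=options, jupyter=False)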
||||||
|
|
||||||
def render(self, parsed, page=False, minify=False):
|
def render(self, parsed, page=False, minify=False):
|
||||||
"""Render complete markup.
|
"""Render complete markup.
|
||||||
|
@ -38,14 +39,18 @@ class DependencyRenderer(object):
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (unicode): Rendered SVG or HTML markup.
|
RETURNS (unicode): Rendered SVG or HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = [self.render_svg(i, p['words'], p['arcs'])
|
# Create a random ID prefix to make sure parses don't receive the
|
||||||
for i, p in enumerate(parsed)]
|
# same ID, even if they're identical
|
||||||
|
id_prefix = random.randint(0, 999)
|
||||||
|
rendered = [
|
||||||
|
self.render_svg("{}-{}".format(id_prefix, i), p["words"], p["arcs"])
|
||||||
|
for i, p in enumerate(parsed)
|
||||||
|
]
|
||||||
if page:
|
if page:
|
||||||
content = ''.join([TPL_FIGURE.format(content=svg)
|
content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])
|
||||||
for svg in rendered])
|
|
||||||
markup = TPL_PAGE.format(content=content)
|
markup = TPL_PAGE.format(content=content)
|
||||||
else:
|
else:
|
||||||
markup = ''.join(rendered)
|
markup = "".join(rendered)
|
||||||
if minify:
|
if minify:
|
||||||
return minify_html(markup)
|
return minify_html(markup)
|
||||||
return markup
|
return markup
|
||||||
|
@ -60,19 +65,25 @@ class DependencyRenderer(object):
|
||||||
"""
|
"""
|
||||||
self.levels = self.get_levels(arcs)
|
self.levels = self.get_levels(arcs)
|
||||||
self.highest_level = len(self.levels)
|
self.highest_level = len(self.levels)
|
||||||
self.offset_y = self.distance/2*self.highest_level+self.arrow_stroke
|
self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
|
||||||
self.width = self.offset_x+len(words)*self.distance
|
self.width = self.offset_x + len(words) * self.distance
|
||||||
self.height = self.offset_y+3*self.word_spacing
|
self.height = self.offset_y + 3 * self.word_spacing
|
||||||
self.id = render_id
|
self.id = render_id
|
||||||
words = [self.render_word(w['text'], w['tag'], i)
|
words = [self.render_word(w["text"], w["tag"], i) for i, w in enumerate(words)]
|
||||||
for i, w in enumerate(words)]
|
arcs = [
|
||||||
arcs = [self.render_arrow(a['label'], a['start'],
|
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
||||||
a['end'], a['dir'], i)
|
for i, a in enumerate(arcs)
|
||||||
for i, a in enumerate(arcs)]
|
]
|
||||||
content = ''.join(words) + ''.join(arcs)
|
content = "".join(words) + "".join(arcs)
|
||||||
return TPL_DEP_SVG.format(id=self.id, width=self.width,
|
return TPL_DEP_SVG.format(
|
||||||
height=self.height, color=self.color,
|
id=self.id,
|
||||||
bg=self.bg, font=self.font, content=content)
|
width=self.width,
|
||||||
|
height=self.height,
|
||||||
|
color=self.color,
|
||||||
|
bg=self.bg,
|
||||||
|
font=self.font,
|
||||||
|
content=content,
|
||||||
|
)
|
||||||
|
|
||||||
def render_word(self, text, tag, i):
|
def render_word(self, text, tag, i):
|
||||||
"""Render individual word.
|
"""Render individual word.
|
||||||
|
@ -82,12 +93,11 @@ class DependencyRenderer(object):
|
||||||
i (int): Unique ID, typically word index.
|
i (int): Unique ID, typically word index.
|
||||||
RETURNS (unicode): Rendered SVG markup.
|
RETURNS (unicode): Rendered SVG markup.
|
||||||
"""
|
"""
|
||||||
y = self.offset_y+self.word_spacing
|
y = self.offset_y + self.word_spacing
|
||||||
x = self.offset_x+i*self.distance
|
x = self.offset_x + i * self.distance
|
||||||
html_text = escape_html(text)
|
html_text = escape_html(text)
|
||||||
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
||||||
|
|
||||||
|
|
||||||
def render_arrow(self, label, start, end, direction, i):
|
def render_arrow(self, label, start, end, direction, i):
|
||||||
"""Render indivicual arrow.
|
"""Render indivicual arrow.
|
||||||
|
|
||||||
|
@ -98,20 +108,30 @@ class DependencyRenderer(object):
|
||||||
i (int): Unique ID, typically arrow index.
|
i (int): Unique ID, typically arrow index.
|
||||||
RETURNS (unicode): Rendered SVG markup.
|
RETURNS (unicode): Rendered SVG markup.
|
||||||
"""
|
"""
|
||||||
level = self.levels.index(end-start)+1
|
level = self.levels.index(end - start) + 1
|
||||||
x_start = self.offset_x+start*self.distance+self.arrow_spacing
|
x_start = self.offset_x + start * self.distance + self.arrow_spacing
|
||||||
y = self.offset_y
|
y = self.offset_y
|
||||||
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
x_end = (
|
||||||
- self.arrow_spacing*(self.highest_level-level)/4)
|
self.offset_x
|
||||||
y_curve = self.offset_y-level*self.distance/2
|
+ (end - start) * self.distance
|
||||||
|
+ start * self.distance
|
||||||
|
- self.arrow_spacing * (self.highest_level - level) / 4
|
||||||
|
)
|
||||||
|
y_curve = self.offset_y - level * self.distance / 2
|
||||||
if self.compact:
|
if self.compact:
|
||||||
y_curve = self.offset_y-level*self.distance/6
|
y_curve = self.offset_y - level * self.distance / 6
|
||||||
if y_curve == 0 and len(self.levels) > 5:
|
if y_curve == 0 and len(self.levels) > 5:
|
||||||
y_curve = -self.distance
|
y_curve = -self.distance
|
||||||
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
||||||
arc = self.get_arc(x_start, y, y_curve, x_end)
|
arc = self.get_arc(x_start, y, y_curve, x_end)
|
||||||
return TPL_DEP_ARCS.format(id=self.id, i=i, stroke=self.arrow_stroke,
|
return TPL_DEP_ARCS.format(
|
||||||
head=arrowhead, label=label, arc=arc)
|
id=self.id,
|
||||||
|
i=i,
|
||||||
|
stroke=self.arrow_stroke,
|
||||||
|
head=arrowhead,
|
||||||
|
label=label,
|
||||||
|
arc=arc,
|
||||||
|
)
|
||||||
|
|
||||||
def get_arc(self, x_start, y, y_curve, x_end):
|
def get_arc(self, x_start, y, y_curve, x_end):
|
||||||
"""Render individual arc.
|
"""Render individual arc.
|
||||||
|
@ -136,13 +156,22 @@ class DependencyRenderer(object):
|
||||||
end (int): X-coordinate of arrow end point.
|
end (int): X-coordinate of arrow end point.
|
||||||
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
||||||
"""
|
"""
|
||||||
if direction == 'left':
|
if direction == "left":
|
||||||
pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
|
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
|
||||||
else:
|
else:
|
||||||
pos1, pos2, pos3 = (end, end+self.arrow_width-2,
|
pos1, pos2, pos3 = (
|
||||||
end-self.arrow_width+2)
|
end,
|
||||||
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3,
|
end + self.arrow_width - 2,
|
||||||
y-self.arrow_width)
|
end - self.arrow_width + 2,
|
||||||
|
)
|
||||||
|
arrowhead = (
|
||||||
|
pos1,
|
||||||
|
y + 2,
|
||||||
|
pos2,
|
||||||
|
y - self.arrow_width,
|
||||||
|
pos3,
|
||||||
|
y - self.arrow_width,
|
||||||
|
)
|
||||||
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
||||||
|
|
||||||
def get_levels(self, arcs):
|
def get_levels(self, arcs):
|
||||||
|
@ -152,30 +181,44 @@ class DependencyRenderer(object):
|
||||||
args (list): Individual arcs and their start, end, direction and label.
|
args (list): Individual arcs and their start, end, direction and label.
|
||||||
RETURNS (list): Arc levels sorted from lowest to highest.
|
RETURNS (list): Arc levels sorted from lowest to highest.
|
||||||
"""
|
"""
|
||||||
levels = set(map(lambda arc: arc['end'] - arc['start'], arcs))
|
levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
|
||||||
return sorted(list(levels))
|
return sorted(list(levels))
|
||||||
|
|
||||||
|
|
||||||
class EntityRenderer(object):
|
class EntityRenderer(object):
|
||||||
"""Render named entities as HTML."""
|
"""Render named entities as HTML."""
|
||||||
style = 'ent'
|
|
||||||
|
style = "ent"
|
||||||
|
|
||||||
def __init__(self, options={}):
|
def __init__(self, options={}):
|
||||||
"""Initialise dependency renderer.
|
"""Initialise dependency renderer.
|
||||||
|
|
||||||
options (dict): Visualiser-specific options (colors, ents)
|
options (dict): Visualiser-specific options (colors, ents)
|
||||||
"""
|
"""
|
||||||
colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
|
colors = {
|
||||||
'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
|
"ORG": "#7aecec",
|
||||||
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197',
|
"PRODUCT": "#bfeeb7",
|
||||||
'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff',
|
"GPE": "#feca74",
|
||||||
'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2',
|
"LOC": "#ff9561",
|
||||||
'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
|
"PERSON": "#aa9cfc",
|
||||||
'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
|
"NORP": "#c887fb",
|
||||||
colors.update(options.get('colors', {}))
|
"FACILITY": "#9cc9cc",
|
||||||
self.default_color = '#ddd'
|
"EVENT": "#ffeb80",
|
||||||
|
"LAW": "#ff8197",
|
||||||
|
"LANGUAGE": "#ff8197",
|
||||||
|
"WORK_OF_ART": "#f0d0ff",
|
||||||
|
"DATE": "#bfe1d9",
|
||||||
|
"TIME": "#bfe1d9",
|
||||||
|
"MONEY": "#e4e7d2",
|
||||||
|
"QUANTITY": "#e4e7d2",
|
||||||
|
"ORDINAL": "#e4e7d2",
|
||||||
|
"CARDINAL": "#e4e7d2",
|
||||||
|
"PERCENT": "#e4e7d2",
|
||||||
|
}
|
||||||
|
colors.update(options.get("colors", {}))
|
||||||
|
self.default_color = "#ddd"
|
||||||
self.colors = colors
|
self.colors = colors
|
||||||
self.ents = options.get('ents', None)
|
self.ents = options.get("ents", None)
|
||||||
|
|
||||||
def render(self, parsed, page=False, minify=False):
|
def render(self, parsed, page=False, minify=False):
|
||||||
"""Render complete markup.
|
"""Render complete markup.
|
||||||
|
@ -185,14 +228,14 @@ class EntityRenderer(object):
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = [self.render_ents(p['text'], p['ents'],
|
rendered = [
|
||||||
p.get('title', None)) for p in parsed]
|
self.render_ents(p["text"], p["ents"], p.get("title", None)) for p in parsed
|
||||||
|
]
|
||||||
if page:
|
if page:
|
||||||
docs = ''.join([TPL_FIGURE.format(content=doc)
|
docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
||||||
for doc in rendered])
|
|
||||||
markup = TPL_PAGE.format(content=docs)
|
markup = TPL_PAGE.format(content=docs)
|
||||||
else:
|
else:
|
||||||
markup = ''.join(rendered)
|
markup = "".join(rendered)
|
||||||
if minify:
|
if minify:
|
||||||
return minify_html(markup)
|
return minify_html(markup)
|
||||||
return markup
|
return markup
|
||||||
|
@ -204,18 +247,18 @@ class EntityRenderer(object):
|
||||||
spans (list): Individual entity spans and their start, end and label.
|
spans (list): Individual entity spans and their start, end and label.
|
||||||
title (unicode or None): Document title set in Doc.user_data['title'].
|
title (unicode or None): Document title set in Doc.user_data['title'].
|
||||||
"""
|
"""
|
||||||
markup = ''
|
markup = ""
|
||||||
offset = 0
|
offset = 0
|
||||||
for span in spans:
|
for span in spans:
|
||||||
label = span['label']
|
label = span["label"]
|
||||||
start = span['start']
|
start = span["start"]
|
||||||
end = span['end']
|
end = span["end"]
|
||||||
entity = text[start:end]
|
entity = text[start:end]
|
||||||
fragments = text[offset:start].split('\n')
|
fragments = text[offset:start].split("\n")
|
||||||
for i, fragment in enumerate(fragments):
|
for i, fragment in enumerate(fragments):
|
||||||
markup += fragment
|
markup += fragment
|
||||||
if len(fragments) > 1 and i != len(fragments)-1:
|
if len(fragments) > 1 and i != len(fragments) - 1:
|
||||||
markup += '</br>'
|
markup += "</br>"
|
||||||
if self.ents is None or label.upper() in self.ents:
|
if self.ents is None or label.upper() in self.ents:
|
||||||
color = self.colors.get(label.upper(), self.default_color)
|
color = self.colors.get(label.upper(), self.default_color)
|
||||||
markup += TPL_ENT.format(label=label, text=entity, bg=color)
|
markup += TPL_ENT.format(label=label, text=entity, bg=color)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
# setting explicit height and max-width: none on the SVG is required for
|
# Setting explicit height and max-width: none on the SVG is required for
|
||||||
# Jupyter to render it properly in a cell
|
# Jupyter to render it properly in a cell
|
||||||
|
|
||||||
TPL_DEP_SVG = """
|
TPL_DEP_SVG = """
|
||||||
|
|
|
@ -8,13 +8,17 @@ import inspect
|
||||||
|
|
||||||
def add_codes(err_cls):
|
def add_codes(err_cls):
|
||||||
"""Add error codes to string messages via class attribute names."""
|
"""Add error codes to string messages via class attribute names."""
|
||||||
|
|
||||||
class ErrorsWithCodes(object):
|
class ErrorsWithCodes(object):
|
||||||
def __getattribute__(self, code):
|
def __getattribute__(self, code):
|
||||||
msg = getattr(err_cls, code)
|
msg = getattr(err_cls, code)
|
||||||
return '[{code}] {msg}'.format(code=code, msg=msg)
|
return "[{code}] {msg}".format(code=code, msg=msg)
|
||||||
|
|
||||||
return ErrorsWithCodes()
|
return ErrorsWithCodes()
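A small sketch of the behaviour @add_codes provides; the DemoErrors class is hypothetical, for illustration only:

    @add_codes
    class DemoErrors(object):
        E001 = "Something went wrong: {detail}"

    print(DemoErrors.E001.format(detail="bad input"))
    # -> [E001] Something went wrong: bad input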
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class Warnings(object):
|
class Warnings(object):
|
||||||
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
||||||
|
@ -260,7 +264,7 @@ class Errors(object):
|
||||||
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
||||||
"error. Are you writing to a default function argument?")
|
"error. Are you writing to a default function argument?")
|
||||||
E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
|
E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
|
||||||
"Span objects, or dicts if set to manual=True.")
|
"Span objects, or dicts if set to manual=True.")
|
||||||
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
||||||
"phrase pattern (string) but got:\n{pattern}")
|
"phrase pattern (string) but got:\n{pattern}")
|
||||||
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
||||||
|
@ -270,6 +274,19 @@ class Errors(object):
|
||||||
"NBOR_RELOP.")
|
"NBOR_RELOP.")
|
||||||
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
|
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
|
||||||
"have been declared in previous edges.")
|
"have been declared in previous edges.")
|
||||||
|
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of tokens to merge")
|
||||||
|
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
|
||||||
|
" can only be part of one entity, so make sure the entities you're "
|
||||||
|
"setting don't overlap.")
|
||||||
|
E104 = ("Can't find JSON schema for '{name}'.")
|
||||||
|
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
|
||||||
|
"Doc.json() instead.")
|
||||||
|
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
||||||
|
"settings: {opts}")
|
||||||
|
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
||||||
|
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
|
||||||
|
"in favor of the pipe name `sentencizer`, which does the same "
|
||||||
|
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
@ -289,55 +306,57 @@ class TempErrors(object):
|
||||||
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
class ModelsWarning(UserWarning):
|
class ModelsWarning(UserWarning):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
WARNINGS = {
|
WARNINGS = {
|
||||||
'user': UserWarning,
|
"user": UserWarning,
|
||||||
'deprecation': DeprecationWarning,
|
"deprecation": DeprecationWarning,
|
||||||
'models': ModelsWarning,
|
"models": ModelsWarning,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_types(arg):
|
def _get_warn_types(arg):
|
||||||
if arg == '': # don't show any warnings
|
if arg == "": # don't show any warnings
|
||||||
return []
|
return []
|
||||||
if not arg or arg == 'all': # show all available warnings
|
if not arg or arg == "all": # show all available warnings
|
||||||
return WARNINGS.keys()
|
return WARNINGS.keys()
|
||||||
return [w_type.strip() for w_type in arg.split(',')
|
return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS]
|
||||||
if w_type.strip() in WARNINGS]
|
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_excl(arg):
|
def _get_warn_excl(arg):
|
||||||
if not arg:
|
if not arg:
|
||||||
return []
|
return []
|
||||||
return [w_id.strip() for w_id in arg.split(',')]
|
return [w_id.strip() for w_id in arg.split(",")]
|
||||||
|
|
||||||
|
|
||||||
SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER')
|
SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER")
|
||||||
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES'))
|
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES"))
|
||||||
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get('SPACY_WARNING_IGNORE'))
|
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE"))
|
||||||
|
|
||||||
|
|
||||||
def user_warning(message):
|
def user_warning(message):
|
||||||
_warn(message, 'user')
|
_warn(message, "user")
|
||||||
|
|
||||||
|
|
||||||
def deprecation_warning(message):
|
def deprecation_warning(message):
|
||||||
_warn(message, 'deprecation')
|
_warn(message, "deprecation")
|
||||||
|
|
||||||
|
|
||||||
def models_warning(message):
|
def models_warning(message):
|
||||||
_warn(message, 'models')
|
_warn(message, "models")
|
||||||
|
|
||||||
|
|
||||||
def _warn(message, warn_type='user'):
|
def _warn(message, warn_type="user"):
|
||||||
"""
|
"""
|
||||||
message (unicode): The message to display.
|
message (unicode): The message to display.
|
||||||
category (Warning): The Warning to show.
|
category (Warning): The Warning to show.
|
||||||
"""
|
"""
|
||||||
w_id = message.split('[', 1)[1].split(']', 1)[0] # get ID from string
|
w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string
|
||||||
if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE:
|
if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE:
|
||||||
category = WARNINGS[warn_type]
|
category = WARNINGS[warn_type]
|
||||||
stack = inspect.stack()[-1]
|
stack = inspect.stack()[-1]
|
||||||
|
|
|
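The module-level reads above mean warning behaviour is configured through environment variables before spaCy is imported. A hedged sketch (SPACY_WARNING_FILTER's exact use isn't visible in this hunk and is assumed to be a warnings filter action):

import os

os.environ["SPACY_WARNING_TYPES"] = "user,models"  # only emit these warning types
os.environ["SPACY_WARNING_IGNORE"] = "W001"        # silence a specific warning ID
os.environ["SPACY_WARNING_FILTER"] = "once"        # assumption: a warnings filter action

import spacy  # noqa: E402 - values are read in spacy.errors at import time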
@ -21,294 +21,272 @@ GLOSSARY = {
|
||||||
# POS tags
|
# POS tags
|
||||||
# Universal POS Tags
|
# Universal POS Tags
|
||||||
# http://universaldependencies.org/u/pos/
|
# http://universaldependencies.org/u/pos/
|
||||||
|
"ADJ": "adjective",
|
||||||
'ADJ': 'adjective',
|
"ADP": "adposition",
|
||||||
'ADP': 'adposition',
|
"ADV": "adverb",
|
||||||
'ADV': 'adverb',
|
"AUX": "auxiliary",
|
||||||
'AUX': 'auxiliary',
|
"CONJ": "conjunction",
|
||||||
'CONJ': 'conjunction',
|
"CCONJ": "coordinating conjunction",
|
||||||
'CCONJ': 'coordinating conjunction',
|
"DET": "determiner",
|
||||||
'DET': 'determiner',
|
"INTJ": "interjection",
|
||||||
'INTJ': 'interjection',
|
"NOUN": "noun",
|
||||||
'NOUN': 'noun',
|
"NUM": "numeral",
|
||||||
'NUM': 'numeral',
|
"PART": "particle",
|
||||||
'PART': 'particle',
|
"PRON": "pronoun",
|
||||||
'PRON': 'pronoun',
|
"PROPN": "proper noun",
|
||||||
'PROPN': 'proper noun',
|
"PUNCT": "punctuation",
|
||||||
'PUNCT': 'punctuation',
|
"SCONJ": "subordinating conjunction",
|
||||||
'SCONJ': 'subordinating conjunction',
|
"SYM": "symbol",
|
||||||
'SYM': 'symbol',
|
"VERB": "verb",
|
||||||
'VERB': 'verb',
|
"X": "other",
|
||||||
'X': 'other',
|
"EOL": "end of line",
|
||||||
'EOL': 'end of line',
|
"SPACE": "space",
|
||||||
'SPACE': 'space',
|
|
||||||
|
|
||||||
|
|
||||||
# POS tags (English)
|
# POS tags (English)
|
||||||
# OntoNotes 5 / Penn Treebank
|
# OntoNotes 5 / Penn Treebank
|
||||||
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
|
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
|
||||||
|
".": "punctuation mark, sentence closer",
|
||||||
'.': 'punctuation mark, sentence closer',
|
",": "punctuation mark, comma",
|
||||||
',': 'punctuation mark, comma',
|
"-LRB-": "left round bracket",
|
||||||
'-LRB-': 'left round bracket',
|
"-RRB-": "right round bracket",
|
||||||
'-RRB-': 'right round bracket',
|
"``": "opening quotation mark",
|
||||||
'``': 'opening quotation mark',
|
'""': "closing quotation mark",
|
||||||
'""': 'closing quotation mark',
|
"''": "closing quotation mark",
|
||||||
"''": 'closing quotation mark',
|
":": "punctuation mark, colon or ellipsis",
|
||||||
':': 'punctuation mark, colon or ellipsis',
|
"$": "symbol, currency",
|
||||||
'$': 'symbol, currency',
|
"#": "symbol, number sign",
|
||||||
'#': 'symbol, number sign',
|
"AFX": "affix",
|
||||||
'AFX': 'affix',
|
"CC": "conjunction, coordinating",
|
||||||
'CC': 'conjunction, coordinating',
|
"CD": "cardinal number",
|
||||||
'CD': 'cardinal number',
|
"DT": "determiner",
|
||||||
'DT': 'determiner',
|
"EX": "existential there",
|
||||||
'EX': 'existential there',
|
"FW": "foreign word",
|
||||||
'FW': 'foreign word',
|
"HYPH": "punctuation mark, hyphen",
|
||||||
'HYPH': 'punctuation mark, hyphen',
|
"IN": "conjunction, subordinating or preposition",
|
||||||
'IN': 'conjunction, subordinating or preposition',
|
"JJ": "adjective",
|
||||||
'JJ': 'adjective',
|
"JJR": "adjective, comparative",
|
||||||
'JJR': 'adjective, comparative',
|
"JJS": "adjective, superlative",
|
||||||
'JJS': 'adjective, superlative',
|
"LS": "list item marker",
|
||||||
'LS': 'list item marker',
|
"MD": "verb, modal auxiliary",
|
||||||
'MD': 'verb, modal auxiliary',
|
"NIL": "missing tag",
|
||||||
'NIL': 'missing tag',
|
"NN": "noun, singular or mass",
|
||||||
'NN': 'noun, singular or mass',
|
"NNP": "noun, proper singular",
|
||||||
'NNP': 'noun, proper singular',
|
"NNPS": "noun, proper plural",
|
||||||
'NNPS': 'noun, proper plural',
|
"NNS": "noun, plural",
|
||||||
'NNS': 'noun, plural',
|
"PDT": "predeterminer",
|
||||||
'PDT': 'predeterminer',
|
"POS": "possessive ending",
|
||||||
'POS': 'possessive ending',
|
"PRP": "pronoun, personal",
|
||||||
'PRP': 'pronoun, personal',
|
"PRP$": "pronoun, possessive",
|
||||||
'PRP$': 'pronoun, possessive',
|
"RB": "adverb",
|
||||||
'RB': 'adverb',
|
"RBR": "adverb, comparative",
|
||||||
'RBR': 'adverb, comparative',
|
"RBS": "adverb, superlative",
|
||||||
'RBS': 'adverb, superlative',
|
"RP": "adverb, particle",
|
||||||
'RP': 'adverb, particle',
|
"TO": "infinitival to",
|
||||||
'TO': 'infinitival to',
|
"UH": "interjection",
|
||||||
'UH': 'interjection',
|
"VB": "verb, base form",
|
||||||
'VB': 'verb, base form',
|
"VBD": "verb, past tense",
|
||||||
'VBD': 'verb, past tense',
|
"VBG": "verb, gerund or present participle",
|
||||||
'VBG': 'verb, gerund or present participle',
|
"VBN": "verb, past participle",
|
||||||
'VBN': 'verb, past participle',
|
"VBP": "verb, non-3rd person singular present",
|
||||||
'VBP': 'verb, non-3rd person singular present',
|
"VBZ": "verb, 3rd person singular present",
|
||||||
'VBZ': 'verb, 3rd person singular present',
|
"WDT": "wh-determiner",
|
||||||
'WDT': 'wh-determiner',
|
"WP": "wh-pronoun, personal",
|
||||||
'WP': 'wh-pronoun, personal',
|
"WP$": "wh-pronoun, possessive",
|
||||||
'WP$': 'wh-pronoun, possessive',
|
"WRB": "wh-adverb",
|
||||||
'WRB': 'wh-adverb',
|
"SP": "space",
|
||||||
'SP': 'space',
|
"ADD": "email",
|
||||||
'ADD': 'email',
|
"NFP": "superfluous punctuation",
|
||||||
'NFP': 'superfluous punctuation',
|
"GW": "additional word in multi-word expression",
|
||||||
'GW': 'additional word in multi-word expression',
|
"XX": "unknown",
|
||||||
'XX': 'unknown',
|
"BES": 'auxiliary "be"',
|
||||||
'BES': 'auxiliary "be"',
|
"HVS": 'forms of "have"',
|
||||||
'HVS': 'forms of "have"',
|
|
||||||
|
|
||||||
|
|
||||||
# POS Tags (German)
|
# POS Tags (German)
|
||||||
# TIGER Treebank
|
# TIGER Treebank
|
||||||
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
||||||
|
"$(": "other sentence-internal punctuation mark",
|
||||||
'$(': 'other sentence-internal punctuation mark',
|
"$,": "comma",
|
||||||
'$,': 'comma',
|
"$.": "sentence-final punctuation mark",
|
||||||
'$.': 'sentence-final punctuation mark',
|
"ADJA": "adjective, attributive",
|
||||||
'ADJA': 'adjective, attributive',
|
"ADJD": "adjective, adverbial or predicative",
|
||||||
'ADJD': 'adjective, adverbial or predicative',
|
"APPO": "postposition",
|
||||||
'APPO': 'postposition',
|
"APPR": "preposition; circumposition left",
|
||||||
'APPR': 'preposition; circumposition left',
|
"APPRART": "preposition with article",
|
||||||
'APPRART': 'preposition with article',
|
"APZR": "circumposition right",
|
||||||
'APZR': 'circumposition right',
|
"ART": "definite or indefinite article",
|
||||||
'ART': 'definite or indefinite article',
|
"CARD": "cardinal number",
|
||||||
'CARD': 'cardinal number',
|
"FM": "foreign language material",
|
||||||
'FM': 'foreign language material',
|
"ITJ": "interjection",
|
||||||
'ITJ': 'interjection',
|
"KOKOM": "comparative conjunction",
|
||||||
'KOKOM': 'comparative conjunction',
|
"KON": "coordinate conjunction",
|
||||||
'KON': 'coordinate conjunction',
|
"KOUI": 'subordinate conjunction with "zu" and infinitive',
|
||||||
'KOUI': 'subordinate conjunction with "zu" and infinitive',
|
"KOUS": "subordinate conjunction with sentence",
|
||||||
'KOUS': 'subordinate conjunction with sentence',
|
"NE": "proper noun",
|
||||||
'NE': 'proper noun',
|
"NNE": "proper noun",
|
||||||
'NNE': 'proper noun',
|
"PAV": "pronominal adverb",
|
||||||
'PAV': 'pronominal adverb',
|
"PROAV": "pronominal adverb",
|
||||||
'PROAV': 'pronominal adverb',
|
"PDAT": "attributive demonstrative pronoun",
|
||||||
'PDAT': 'attributive demonstrative pronoun',
|
"PDS": "substituting demonstrative pronoun",
|
||||||
'PDS': 'substituting demonstrative pronoun',
|
"PIAT": "attributive indefinite pronoun without determiner",
|
||||||
'PIAT': 'attributive indefinite pronoun without determiner',
|
"PIDAT": "attributive indefinite pronoun with determiner",
|
||||||
'PIDAT': 'attributive indefinite pronoun with determiner',
|
"PIS": "substituting indefinite pronoun",
|
||||||
'PIS': 'substituting indefinite pronoun',
|
"PPER": "non-reflexive personal pronoun",
|
||||||
'PPER': 'non-reflexive personal pronoun',
|
"PPOSAT": "attributive possessive pronoun",
|
||||||
'PPOSAT': 'attributive possessive pronoun',
|
"PPOSS": "substituting possessive pronoun",
|
||||||
'PPOSS': 'substituting possessive pronoun',
|
"PRELAT": "attributive relative pronoun",
|
||||||
'PRELAT': 'attributive relative pronoun',
|
"PRELS": "substituting relative pronoun",
|
||||||
'PRELS': 'substituting relative pronoun',
|
"PRF": "reflexive personal pronoun",
|
||||||
'PRF': 'reflexive personal pronoun',
|
"PTKA": "particle with adjective or adverb",
|
||||||
'PTKA': 'particle with adjective or adverb',
|
"PTKANT": "answer particle",
|
||||||
'PTKANT': 'answer particle',
|
"PTKNEG": "negative particle",
|
||||||
'PTKNEG': 'negative particle',
|
"PTKVZ": "separable verbal particle",
|
||||||
'PTKVZ': 'separable verbal particle',
|
"PTKZU": '"zu" before infinitive',
|
||||||
'PTKZU': '"zu" before infinitive',
|
"PWAT": "attributive interrogative pronoun",
|
||||||
'PWAT': 'attributive interrogative pronoun',
|
"PWAV": "adverbial interrogative or relative pronoun",
|
||||||
'PWAV': 'adverbial interrogative or relative pronoun',
|
"PWS": "substituting interrogative pronoun",
|
||||||
'PWS': 'substituting interrogative pronoun',
|
"TRUNC": "word remnant",
|
||||||
'TRUNC': 'word remnant',
|
"VAFIN": "finite verb, auxiliary",
|
||||||
'VAFIN': 'finite verb, auxiliary',
|
"VAIMP": "imperative, auxiliary",
|
||||||
'VAIMP': 'imperative, auxiliary',
|
"VAINF": "infinitive, auxiliary",
|
||||||
'VAINF': 'infinitive, auxiliary',
|
"VAPP": "perfect participle, auxiliary",
|
||||||
'VAPP': 'perfect participle, auxiliary',
|
"VMFIN": "finite verb, modal",
|
||||||
'VMFIN': 'finite verb, modal',
|
"VMINF": "infinitive, modal",
|
||||||
'VMINF': 'infinitive, modal',
|
"VMPP": "perfect participle, modal",
|
||||||
'VMPP': 'perfect participle, modal',
|
"VVFIN": "finite verb, full",
|
||||||
'VVFIN': 'finite verb, full',
|
"VVIMP": "imperative, full",
|
||||||
'VVIMP': 'imperative, full',
|
"VVINF": "infinitive, full",
|
||||||
'VVINF': 'infinitive, full',
|
"VVIZU": 'infinitive with "zu", full',
|
||||||
'VVIZU': 'infinitive with "zu", full',
|
"VVPP": "perfect participle, full",
|
||||||
'VVPP': 'perfect participle, full',
|
"XY": "non-word containing non-letter",
|
||||||
'XY': 'non-word containing non-letter',
|
|
||||||
|
|
||||||
|
|
||||||
# Noun chunks
|
# Noun chunks
|
||||||
|
"NP": "noun phrase",
|
||||||
'NP': 'noun phrase',
|
"PP": "prepositional phrase",
|
||||||
'PP': 'prepositional phrase',
|
"VP": "verb phrase",
|
||||||
'VP': 'verb phrase',
|
"ADVP": "adverb phrase",
|
||||||
'ADVP': 'adverb phrase',
|
"ADJP": "adjective phrase",
|
||||||
'ADJP': 'adjective phrase',
|
"SBAR": "subordinating conjunction",
|
||||||
'SBAR': 'subordinating conjunction',
|
"PRT": "particle",
|
||||||
'PRT': 'particle',
|
"PNP": "prepositional noun phrase",
|
||||||
'PNP': 'prepositional noun phrase',
|
|
||||||
|
|
||||||
|
|
||||||
# Dependency Labels (English)
|
# Dependency Labels (English)
|
||||||
# ClearNLP / Universal Dependencies
|
# ClearNLP / Universal Dependencies
|
||||||
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
|
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
|
||||||
|
"acomp": "adjectival complement",
|
||||||
'acomp': 'adjectival complement',
|
"advcl": "adverbial clause modifier",
|
||||||
'advcl': 'adverbial clause modifier',
|
"advmod": "adverbial modifier",
|
||||||
'advmod': 'adverbial modifier',
|
"agent": "agent",
|
||||||
'agent': 'agent',
|
"amod": "adjectival modifier",
|
||||||
'amod': 'adjectival modifier',
|
"appos": "appositional modifier",
|
||||||
'appos': 'appositional modifier',
|
"attr": "attribute",
|
||||||
'attr': 'attribute',
|
"aux": "auxiliary",
|
||||||
'aux': 'auxiliary',
|
"auxpass": "auxiliary (passive)",
|
||||||
'auxpass': 'auxiliary (passive)',
|
"cc": "coordinating conjunction",
|
||||||
'cc': 'coordinating conjunction',
|
"ccomp": "clausal complement",
|
||||||
'ccomp': 'clausal complement',
|
"complm": "complementizer",
|
||||||
'complm': 'complementizer',
|
"conj": "conjunct",
|
||||||
'conj': 'conjunct',
|
"cop": "copula",
|
||||||
'cop': 'copula',
|
"csubj": "clausal subject",
|
||||||
'csubj': 'clausal subject',
|
"csubjpass": "clausal subject (passive)",
|
||||||
'csubjpass': 'clausal subject (passive)',
|
"dep": "unclassified dependent",
|
||||||
'dep': 'unclassified dependent',
|
"det": "determiner",
|
||||||
'det': 'determiner',
|
"dobj": "direct object",
|
||||||
'dobj': 'direct object',
|
"expl": "expletive",
|
||||||
'expl': 'expletive',
|
"hmod": "modifier in hyphenation",
|
||||||
'hmod': 'modifier in hyphenation',
|
"hyph": "hyphen",
|
||||||
'hyph': 'hyphen',
|
"infmod": "infinitival modifier",
|
||||||
'infmod': 'infinitival modifier',
|
"intj": "interjection",
|
||||||
'intj': 'interjection',
|
"iobj": "indirect object",
|
||||||
'iobj': 'indirect object',
|
"mark": "marker",
|
||||||
'mark': 'marker',
|
"meta": "meta modifier",
|
||||||
'meta': 'meta modifier',
|
"neg": "negation modifier",
|
||||||
'neg': 'negation modifier',
|
"nmod": "modifier of nominal",
|
||||||
'nmod': 'modifier of nominal',
|
"nn": "noun compound modifier",
|
||||||
'nn': 'noun compound modifier',
|
"npadvmod": "noun phrase as adverbial modifier",
|
||||||
'npadvmod': 'noun phrase as adverbial modifier',
|
"nsubj": "nominal subject",
|
||||||
'nsubj': 'nominal subject',
|
"nsubjpass": "nominal subject (passive)",
|
||||||
'nsubjpass': 'nominal subject (passive)',
|
"num": "number modifier",
|
||||||
'num': 'number modifier',
|
"number": "number compound modifier",
|
||||||
'number': 'number compound modifier',
|
"oprd": "object predicate",
|
||||||
'oprd': 'object predicate',
|
"obj": "object",
|
||||||
'obj': 'object',
|
"obl": "oblique nominal",
|
||||||
'obl': 'oblique nominal',
|
"parataxis": "parataxis",
|
||||||
'parataxis': 'parataxis',
|
"partmod": "participal modifier",
|
||||||
'partmod': 'participal modifier',
|
"pcomp": "complement of preposition",
|
||||||
'pcomp': 'complement of preposition',
|
"pobj": "object of preposition",
|
||||||
'pobj': 'object of preposition',
|
"poss": "possession modifier",
|
||||||
'poss': 'possession modifier',
|
"possessive": "possessive modifier",
|
||||||
'possessive': 'possessive modifier',
|
"preconj": "pre-correlative conjunction",
|
||||||
'preconj': 'pre-correlative conjunction',
|
"prep": "prepositional modifier",
|
||||||
'prep': 'prepositional modifier',
|
"prt": "particle",
|
||||||
'prt': 'particle',
|
"punct": "punctuation",
|
||||||
'punct': 'punctuation',
|
"quantmod": "modifier of quantifier",
|
||||||
'quantmod': 'modifier of quantifier',
|
"rcmod": "relative clause modifier",
|
||||||
'rcmod': 'relative clause modifier',
|
"root": "root",
|
||||||
'root': 'root',
|
"xcomp": "open clausal complement",
|
||||||
'xcomp': 'open clausal complement',
|
|
||||||
|
|
||||||
|
|
||||||
# Dependency labels (German)
|
# Dependency labels (German)
|
||||||
# TIGER Treebank
|
# TIGER Treebank
|
||||||
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
||||||
# currently missing: 'cc' (comparative complement) because of conflict
|
# currently missing: 'cc' (comparative complement) because of conflict
|
||||||
# with English labels
|
# with English labels
|
||||||
|
"ac": "adpositional case marker",
|
||||||
'ac': 'adpositional case marker',
|
"adc": "adjective component",
|
||||||
'adc': 'adjective component',
|
"ag": "genitive attribute",
|
||||||
'ag': 'genitive attribute',
|
"ams": "measure argument of adjective",
|
||||||
'ams': 'measure argument of adjective',
|
"app": "apposition",
|
||||||
'app': 'apposition',
|
"avc": "adverbial phrase component",
|
||||||
'avc': 'adverbial phrase component',
|
"cd": "coordinating conjunction",
|
||||||
'cd': 'coordinating conjunction',
|
"cj": "conjunct",
|
||||||
'cj': 'conjunct',
|
"cm": "comparative conjunction",
|
||||||
'cm': 'comparative conjunction',
|
"cp": "complementizer",
|
||||||
'cp': 'complementizer',
|
"cvc": "collocational verb construction",
|
||||||
'cvc': 'collocational verb construction',
|
"da": "dative",
|
||||||
'da': 'dative',
|
"dh": "discourse-level head",
|
||||||
'dh': 'discourse-level head',
|
"dm": "discourse marker",
|
||||||
'dm': 'discourse marker',
|
"ep": "expletive es",
|
||||||
'ep': 'expletive es',
|
"hd": "head",
|
||||||
'hd': 'head',
|
"ju": "junctor",
|
||||||
'ju': 'junctor',
|
"mnr": "postnominal modifier",
|
||||||
'mnr': 'postnominal modifier',
|
"mo": "modifier",
|
||||||
'mo': 'modifier',
|
"ng": "negation",
|
||||||
'ng': 'negation',
|
"nk": "noun kernel element",
|
||||||
'nk': 'noun kernel element',
|
"nmc": "numerical component",
|
||||||
'nmc': 'numerical component',
|
"oa": "accusative object",
|
||||||
'oa': 'accusative object',
|
"oc": "clausal object",
|
||||||
'oc': 'clausal object',
|
"og": "genitive object",
|
||||||
'og': 'genitive object',
|
"op": "prepositional object",
|
||||||
'op': 'prepositional object',
|
"par": "parenthetical element",
|
||||||
'par': 'parenthetical element',
|
"pd": "predicate",
|
||||||
'pd': 'predicate',
|
"pg": "phrasal genitive",
|
||||||
'pg': 'phrasal genitive',
|
"ph": "placeholder",
|
||||||
'ph': 'placeholder',
|
"pm": "morphological particle",
|
||||||
'pm': 'morphological particle',
|
"pnc": "proper noun component",
|
||||||
'pnc': 'proper noun component',
|
"rc": "relative clause",
|
||||||
'rc': 'relative clause',
|
"re": "repeated element",
|
||||||
're': 'repeated element',
|
"rs": "reported speech",
|
||||||
'rs': 'reported speech',
|
"sb": "subject",
|
||||||
'sb': 'subject',
|
|
||||||
|
|
||||||
|
|
||||||
# Named Entity Recognition
|
# Named Entity Recognition
|
||||||
# OntoNotes 5
|
# OntoNotes 5
|
||||||
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
|
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
|
||||||
|
"PERSON": "People, including fictional",
|
||||||
'PERSON': 'People, including fictional',
|
"NORP": "Nationalities or religious or political groups",
|
||||||
'NORP': 'Nationalities or religious or political groups',
|
"FACILITY": "Buildings, airports, highways, bridges, etc.",
|
||||||
'FACILITY': 'Buildings, airports, highways, bridges, etc.',
|
"FAC": "Buildings, airports, highways, bridges, etc.",
|
||||||
'ORG': 'Companies, agencies, institutions, etc.',
|
"ORG": "Companies, agencies, institutions, etc.",
|
||||||
'GPE': 'Countries, cities, states',
|
"GPE": "Countries, cities, states",
|
||||||
'LOC': 'Non-GPE locations, mountain ranges, bodies of water',
|
"LOC": "Non-GPE locations, mountain ranges, bodies of water",
|
||||||
'PRODUCT': 'Objects, vehicles, foods, etc. (not services)',
|
"PRODUCT": "Objects, vehicles, foods, etc. (not services)",
|
||||||
'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
|
"EVENT": "Named hurricanes, battles, wars, sports events, etc.",
|
||||||
'WORK_OF_ART': 'Titles of books, songs, etc.',
|
"WORK_OF_ART": "Titles of books, songs, etc.",
|
||||||
'LAW': 'Named documents made into laws.',
|
"LAW": "Named documents made into laws.",
|
||||||
'LANGUAGE': 'Any named language',
|
"LANGUAGE": "Any named language",
|
||||||
'DATE': 'Absolute or relative dates or periods',
|
"DATE": "Absolute or relative dates or periods",
|
||||||
'TIME': 'Times smaller than a day',
|
"TIME": "Times smaller than a day",
|
||||||
'PERCENT': 'Percentage, including "%"',
|
"PERCENT": 'Percentage, including "%"',
|
||||||
'MONEY': 'Monetary values, including unit',
|
"MONEY": "Monetary values, including unit",
|
||||||
'QUANTITY': 'Measurements, as of weight or distance',
|
"QUANTITY": "Measurements, as of weight or distance",
|
||||||
'ORDINAL': '"first", "second", etc.',
|
"ORDINAL": '"first", "second", etc.',
|
||||||
'CARDINAL': 'Numerals that do not fall under another type',
|
"CARDINAL": "Numerals that do not fall under another type",
|
||||||
|
|
||||||
|
|
||||||
# Named Entity Recognition
|
# Named Entity Recognition
|
||||||
# Wikipedia
|
# Wikipedia
|
||||||
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
|
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
|
||||||
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
|
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
|
||||||
|
"PER": "Named person or family.",
|
||||||
'PER': 'Named person or family.',
|
"MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
|
||||||
'MISC': ('Miscellaneous entities, e.g. events, nationalities, '
|
|
||||||
'products or works of art'),
|
|
||||||
}
|
}
|
||||||
|
|
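The GLOSSARY above is what backs spacy.explain(), which maps a tag, dependency label or entity type to its description. A short usage sketch:

import spacy

print(spacy.explain("ADJ"))   # adjective
print(spacy.explain("dobj"))  # direct object
print(spacy.explain("GPE"))   # Countries, cities, states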
113
spacy/gold.pyx
|
@ -15,7 +15,7 @@ import json
|
||||||
|
|
||||||
import ujson
|
import ujson
|
||||||
|
|
||||||
from . import _align
|
from . import _align
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .tokens import Doc
|
from .tokens import Doc
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
|
@ -172,7 +172,7 @@ class GoldCorpus(object):
|
||||||
def dev_tuples(self):
|
def dev_tuples(self):
|
||||||
locs = (self.tmp_dir / 'dev').iterdir()
|
locs = (self.tmp_dir / 'dev').iterdir()
|
||||||
yield from self.read_tuples(locs, limit=self.limit)
|
yield from self.read_tuples(locs, limit=self.limit)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def train_tuples(self):
|
def train_tuples(self):
|
||||||
locs = (self.tmp_dir / 'train').iterdir()
|
locs = (self.tmp_dir / 'train').iterdir()
|
||||||
|
@ -271,6 +271,53 @@ def _corrupt(c, noise_level):
|
||||||
return c.lower()
|
return c.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def read_json_object(json_corpus_section):
|
||||||
|
"""Take a list of JSON-formatted documents (e.g. from an already loaded
|
||||||
|
training data file) and yield tuples in the GoldParse format.
|
||||||
|
|
||||||
|
json_corpus_section (list): The data.
|
||||||
|
YIELDS (tuple): The reformatted data.
|
||||||
|
"""
|
||||||
|
for json_doc in json_corpus_section:
|
||||||
|
tuple_doc = json_to_tuple(json_doc)
|
||||||
|
for tuple_paragraph in tuple_doc:
|
||||||
|
yield tuple_paragraph
|
||||||
|
|
||||||
|
|
||||||
|
def json_to_tuple(doc):
|
||||||
|
"""Convert an item in the JSON-formatted training data to the tuple format
|
||||||
|
used by GoldParse.
|
||||||
|
|
||||||
|
doc (dict): One entry in the training data.
|
||||||
|
YIELDS (tuple): The reformatted data.
|
||||||
|
"""
|
||||||
|
paragraphs = []
|
||||||
|
for paragraph in doc['paragraphs']:
|
||||||
|
sents = []
|
||||||
|
for sent in paragraph['sentences']:
|
||||||
|
words = []
|
||||||
|
ids = []
|
||||||
|
tags = []
|
||||||
|
heads = []
|
||||||
|
labels = []
|
||||||
|
ner = []
|
||||||
|
for i, token in enumerate(sent['tokens']):
|
||||||
|
words.append(token['orth'])
|
||||||
|
ids.append(i)
|
||||||
|
tags.append(token.get('tag', '-'))
|
||||||
|
heads.append(token.get('head', 0) + i)
|
||||||
|
labels.append(token.get('dep', ''))
|
||||||
|
# Ensure ROOT label is case-insensitive
|
||||||
|
if labels[-1].lower() == 'root':
|
||||||
|
labels[-1] = 'ROOT'
|
||||||
|
ner.append(token.get('ner', '-'))
|
||||||
|
sents.append([
|
||||||
|
[ids, words, tags, heads, labels, ner],
|
||||||
|
sent.get('brackets', [])])
|
||||||
|
if sents:
|
||||||
|
yield [paragraph.get('raw', None), sents]
|
||||||
|
|
||||||
|
|
||||||
def read_json_file(loc, docs_filter=None, limit=None):
|
def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
loc = util.ensure_path(loc)
|
loc = util.ensure_path(loc)
|
||||||
if loc.is_dir():
|
if loc.is_dir():
|
||||||
|
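A hedged usage sketch for the helpers added above, with a minimal document in spaCy's JSON training format (field names follow json_to_tuple; the import path assumes these stay exposed on spacy.gold):

from spacy.gold import read_json_object

json_doc = {
    "id": 0,
    "paragraphs": [{
        "raw": "I like London.",
        "sentences": [{
            "tokens": [
                {"orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                {"orth": "like", "tag": "VBP", "head": 0, "dep": "ROOT", "ner": "O"},
                {"orth": "London", "tag": "NNP", "head": -1, "dep": "dobj", "ner": "U-GPE"},
                {"orth": ".", "tag": ".", "head": -2, "dep": "punct", "ner": "O"},
            ],
            "brackets": [],
        }],
    }],
}

for raw_text, sents in read_json_object([json_doc]):
    (ids, words, tags, heads, labels, ner), brackets = sents[0]
    print(raw_text)      # "I like London."
    print(words, heads)  # heads have been converted to absolute token indices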
@ -280,31 +327,8 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
for doc in _json_iterate(loc):
|
for doc in _json_iterate(loc):
|
||||||
if docs_filter is not None and not docs_filter(doc):
|
if docs_filter is not None and not docs_filter(doc):
|
||||||
continue
|
continue
|
||||||
paragraphs = []
|
for json_tuple in json_to_tuple(doc):
|
||||||
for paragraph in doc['paragraphs']:
|
yield json_tuple
|
||||||
sents = []
|
|
||||||
for sent in paragraph['sentences']:
|
|
||||||
words = []
|
|
||||||
ids = []
|
|
||||||
tags = []
|
|
||||||
heads = []
|
|
||||||
labels = []
|
|
||||||
ner = []
|
|
||||||
for i, token in enumerate(sent['tokens']):
|
|
||||||
words.append(token['orth'])
|
|
||||||
ids.append(i)
|
|
||||||
tags.append(token.get('tag', '-'))
|
|
||||||
heads.append(token.get('head', 0) + i)
|
|
||||||
labels.append(token.get('dep', ''))
|
|
||||||
# Ensure ROOT label is case-insensitive
|
|
||||||
if labels[-1].lower() == 'root':
|
|
||||||
labels[-1] = 'ROOT'
|
|
||||||
ner.append(token.get('ner', '-'))
|
|
||||||
sents.append([
|
|
||||||
[ids, words, tags, heads, labels, ner],
|
|
||||||
sent.get('brackets', [])])
|
|
||||||
if sents:
|
|
||||||
yield [paragraph.get('raw', None), sents]
|
|
||||||
|
|
||||||
|
|
||||||
def _json_iterate(loc):
|
def _json_iterate(loc):
|
||||||
|
@ -573,32 +597,19 @@ cdef class GoldParse:
|
||||||
self.c.sent_start[i] = 0
|
self.c.sent_start[i] = 0
|
||||||
|
|
||||||
|
|
||||||
def docs_to_json(id, docs):
|
def docs_to_json(docs, underscore=None):
|
||||||
'''Convert a list of Doc objects into the JSON-serializable format used by
|
"""Convert a list of Doc objects into the JSON-serializable format used by
|
||||||
the spacy train command. Each Doc in the list will be interpreted as a
|
the spacy train command.
|
||||||
paragraph.
|
|
||||||
'''
|
docs (iterable / Doc): The Doc object(s) to convert.
|
||||||
|
underscore (list): Optional list of string names of custom doc._.
|
||||||
|
attributes. Attribute values need to be JSON-serializable. Values will
|
||||||
|
be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
|
||||||
|
RETURNS (list): The data in spaCy's JSON format.
|
||||||
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
json_doc = {'id': id, 'paragraphs': []}
|
return [doc.to_json(underscore=underscore) for doc in docs]
|
||||||
for i, doc in enumerate(docs):
|
|
||||||
json_para = {'raw': doc.text, 'sentences': []}
|
|
||||||
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
|
|
||||||
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
|
|
||||||
for j, sent in enumerate(doc.sents):
|
|
||||||
json_sent = {'tokens': [], 'brackets': []}
|
|
||||||
for token in sent:
|
|
||||||
json_token = {"id": token.i, "orth": token.text}
|
|
||||||
if doc.is_tagged:
|
|
||||||
json_token['tag'] = token.tag_
|
|
||||||
if doc.is_parsed:
|
|
||||||
json_token['head'] = token.head.i-token.i
|
|
||||||
json_token['dep'] = token.dep_
|
|
||||||
json_token['ner'] = biluo_tags[token.i]
|
|
||||||
json_sent['tokens'].append(json_token)
|
|
||||||
json_para['sentences'].append(json_sent)
|
|
||||||
json_doc['paragraphs'].append(json_para)
|
|
||||||
return json_doc
|
|
||||||
|
|
||||||
|
|
||||||
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
||||||
|
|
|
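A hedged usage sketch for the new docs_to_json() signature above (the model name is an assumption; any pipeline with a tagger and parser would do):

import spacy
from spacy.gold import docs_to_json

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying a U.K. startup.")

json_data = docs_to_json(doc)  # a single Doc or an iterable of Docs is accepted
print(len(json_data))          # one JSON-serializable dict per Doc, via Doc.to_json()
# docs_to_json(doc, underscore=["my_attr"]) would also serialize a registered doc._.my_attr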
@ -16,16 +16,18 @@ from ...util import update_exc, add_lookups
|
||||||
class ArabicDefaults(Language.Defaults):
|
class ArabicDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: 'ar'
|
lex_attr_getters[LANG] = lambda text: "ar"
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
class Arabic(Language):
|
class Arabic(Language):
|
||||||
lang = 'ar'
|
lang = "ar"
|
||||||
Defaults = ArabicDefaults
|
Defaults = ArabicDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Arabic']
|
__all__ = ["Arabic"]
|
||||||
|
|
|
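The Language subclass above is what spacy.blank("ar") instantiates, pulling in the defaults (stop words, suffixes, tokenizer exceptions). A short sketch using one of the example sentences from this commit:

import spacy

nlp = spacy.blank("ar")  # equivalent to instantiating Arabic()
doc = nlp("هل بالإمكان أن نلتقي غدا؟")
print([token.text for token in doc])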
@ -10,11 +10,11 @@ Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب",
|
"نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب",
|
||||||
"أين تقع دمشق ؟"
|
"أين تقع دمشق ؟",
|
||||||
"كيف حالك ؟",
|
"كيف حالك ؟",
|
||||||
"هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟",
|
"هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟",
|
||||||
"ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟",
|
"ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟",
|
||||||
"هل بالإمكان أن نلتقي غدا؟",
|
"هل بالإمكان أن نلتقي غدا؟",
|
||||||
"هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم",
|
"هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم",
|
||||||
"كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم"
|
"كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم",
|
||||||
]
|
]
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
_num_words = set("""
|
_num_words = set(
|
||||||
|
"""
|
||||||
صفر
|
صفر
|
||||||
واحد
|
واحد
|
||||||
إثنان
|
إثنان
|
||||||
|
@ -52,9 +53,11 @@ _num_words = set("""
|
||||||
مليون
|
مليون
|
||||||
مليار
|
مليار
|
||||||
مليارات
|
مليارات
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
_ordinal_words = set("""
|
_ordinal_words = set(
|
||||||
|
"""
|
||||||
اول
|
اول
|
||||||
أول
|
أول
|
||||||
حاد
|
حاد
|
||||||
|
@ -69,20 +72,21 @@ _ordinal_words = set("""
|
||||||
ثامن
|
ثامن
|
||||||
تاسع
|
تاسع
|
||||||
عاشر
|
عاشر
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
"""
|
"""
|
||||||
check if text resembles a number
|
Check if text resembles a number
|
||||||
"""
|
"""
|
||||||
if text.startswith(('+', '-', '±', '~')):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(",", "").replace(".", "")
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.count('/') == 1:
|
if text.count("/") == 1:
|
||||||
num, denom = text.split('/')
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
if text in _num_words:
|
if text in _num_words:
|
||||||
|
@ -92,6 +96,4 @@ def like_num(text):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||||
LIKE_NUM: like_num
|
|
||||||
}
|
|
||||||
|
|
|
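Expected behaviour of the LIKE_NUM getter above, based on the logic in this hunk (the import path is assumed from the file being edited):

from spacy.lang.ar.lex_attrs import like_num

print(like_num("10,000"))  # True - separators are stripped before isdigit()
print(like_num("3/4"))     # True - simple fractions are recognised
print(like_num("مليون"))   # True - listed Arabic number words count
print(like_num("كتاب"))    # False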
@ -1,15 +1,20 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..punctuation import TOKENIZER_INFIXES
|
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import UNITS, ALPHA_UPPER
|
||||||
|
|
||||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
_suffixes = (
|
||||||
[r'(?<=[0-9])\+',
|
LIST_PUNCT
|
||||||
# Arabic is written from Right-To-Left
|
+ LIST_ELLIPSES
|
||||||
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
+ LIST_QUOTES
|
||||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
+ [
|
||||||
r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER)])
|
r"(?<=[0-9])\+",
|
||||||
|
# Arabic is written from Right-To-Left
|
||||||
|
r"(?<=[0-9])(?:{})".format(CURRENCY),
|
||||||
|
r"(?<=[0-9])(?:{})".format(UNITS),
|
||||||
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
TOKENIZER_SUFFIXES = _suffixes
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
|
|
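A hedged sketch of how a suffix list like the one above is turned into the regex the tokenizer applies, using spaCy's standard helper (the import path for the Arabic rules is assumed from the file being edited):

from spacy.util import compile_suffix_regex
from spacy.lang.ar.punctuation import TOKENIZER_SUFFIXES

suffix_search = compile_suffix_regex(TOKENIZER_SUFFIXES).search
match = suffix_search("123+")
print(match.group())  # "+" is split off after a digit, per the r"(?<=[0-9])\+" rule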
@ -1,7 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
من
|
من
|
||||||
نحو
|
نحو
|
||||||
لعل
|
لعل
|
||||||
|
@ -388,4 +389,5 @@ STOP_WORDS = set("""
|
||||||
وإن
|
وإن
|
||||||
ولو
|
ولو
|
||||||
يا
|
يا
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -1,21 +1,23 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
from ...symbols import ORTH, LEMMA
|
||||||
import re
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
# time
|
|
||||||
|
# Time
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{LEMMA: "قبل الميلاد", ORTH: "ق.م"},
|
{LEMMA: "قبل الميلاد", ORTH: "ق.م"},
|
||||||
{LEMMA: "بعد الميلاد", ORTH: "ب. م"},
|
{LEMMA: "بعد الميلاد", ORTH: "ب. م"},
|
||||||
{LEMMA: "ميلادي", ORTH: ".م"},
|
{LEMMA: "ميلادي", ORTH: ".م"},
|
||||||
{LEMMA: "هجري", ORTH: ".هـ"},
|
{LEMMA: "هجري", ORTH: ".هـ"},
|
||||||
{LEMMA: "توفي", ORTH: ".ت"}]:
|
{LEMMA: "توفي", ORTH: ".ت"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
# scientific abv.
|
# Scientific abv.
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"},
|
{LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"},
|
||||||
{LEMMA: "الشارح", ORTH: "الشـ"},
|
{LEMMA: "الشارح", ORTH: "الشـ"},
|
||||||
|
@ -28,20 +30,20 @@ for exc_data in [
|
||||||
{LEMMA: "أنبأنا", ORTH: "أنا"},
|
{LEMMA: "أنبأنا", ORTH: "أنا"},
|
||||||
{LEMMA: "أخبرنا", ORTH: "نا"},
|
{LEMMA: "أخبرنا", ORTH: "نا"},
|
||||||
{LEMMA: "مصدر سابق", ORTH: "م. س"},
|
{LEMMA: "مصدر سابق", ORTH: "م. س"},
|
||||||
{LEMMA: "مصدر نفسه", ORTH: "م. ن"}]:
|
{LEMMA: "مصدر نفسه", ORTH: "م. ن"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
# other abv.
|
# Other abv.
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{LEMMA: "دكتور", ORTH: "د."},
|
{LEMMA: "دكتور", ORTH: "د."},
|
||||||
{LEMMA: "أستاذ دكتور", ORTH: "أ.د"},
|
{LEMMA: "أستاذ دكتور", ORTH: "أ.د"},
|
||||||
{LEMMA: "أستاذ", ORTH: "أ."},
|
{LEMMA: "أستاذ", ORTH: "أ."},
|
||||||
{LEMMA: "بروفيسور", ORTH: "ب."}]:
|
{LEMMA: "بروفيسور", ORTH: "ب."},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
for exc_data in [
|
for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
|
||||||
{LEMMA: "تلفون", ORTH: "ت."},
|
|
||||||
{LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
|
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -15,7 +15,7 @@ from ...util import update_exc
|
||||||
|
|
||||||
class BengaliDefaults(Language.Defaults):
|
class BengaliDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
lex_attr_getters[LANG] = lambda text: "bn"
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
@ -26,8 +26,8 @@ class BengaliDefaults(Language.Defaults):
|
||||||
|
|
||||||
|
|
||||||
class Bengali(Language):
|
class Bengali(Language):
|
||||||
lang = 'bn'
|
lang = "bn"
|
||||||
Defaults = BengaliDefaults
|
Defaults = BengaliDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Bengali']
|
__all__ = ["Bengali"]
|
||||||
|
|
|
@ -13,11 +13,9 @@ LEMMA_RULES = {
|
||||||
["গাছা", ""],
|
["গাছা", ""],
|
||||||
["গাছি", ""],
|
["গাছি", ""],
|
||||||
["ছড়া", ""],
|
["ছড়া", ""],
|
||||||
|
|
||||||
["কে", ""],
|
["কে", ""],
|
||||||
["ে", ""],
|
["ে", ""],
|
||||||
["তে", ""],
|
["তে", ""],
|
||||||
|
|
||||||
["র", ""],
|
["র", ""],
|
||||||
["রা", ""],
|
["রা", ""],
|
||||||
["রে", ""],
|
["রে", ""],
|
||||||
|
@ -28,7 +26,6 @@ LEMMA_RULES = {
|
||||||
["গুলা", ""],
|
["গুলা", ""],
|
||||||
["গুলো", ""],
|
["গুলো", ""],
|
||||||
["গুলি", ""],
|
["গুলি", ""],
|
||||||
|
|
||||||
["কুল", ""],
|
["কুল", ""],
|
||||||
["গণ", ""],
|
["গণ", ""],
|
||||||
["দল", ""],
|
["দল", ""],
|
||||||
|
@ -45,7 +42,6 @@ LEMMA_RULES = {
|
||||||
["সকল", ""],
|
["সকল", ""],
|
||||||
["মহল", ""],
|
["মহল", ""],
|
||||||
["াবলি", ""], # আবলি
|
["াবলি", ""], # আবলি
|
||||||
|
|
||||||
# Bengali digit representations
|
# Bengali digit representations
|
||||||
["০", "0"],
|
["০", "0"],
|
||||||
["১", "1"],
|
["১", "1"],
|
||||||
|
@ -58,11 +54,5 @@ LEMMA_RULES = {
|
||||||
["৮", "8"],
|
["৮", "8"],
|
||||||
["৯", "9"],
|
["৯", "9"],
|
||||||
],
|
],
|
||||||
|
"punct": [["“", '"'], ["”", '"'], ["\u2018", "'"], ["\u2019", "'"]],
|
||||||
"punct": [
|
|
||||||
["“", "\""],
|
|
||||||
["”", "\""],
|
|
||||||
["\u2018", "'"],
|
|
||||||
["\u2019", "'"]
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,64 +5,253 @@ from ...symbols import LEMMA, PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
MORPH_RULES = {
|
MORPH_RULES = {
|
||||||
"PRP": {
|
"PRP": {
|
||||||
'ঐ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
|
"ঐ": {LEMMA: PRON_LEMMA, "PronType": "Dem"},
|
||||||
'আমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
|
"আমাকে": {
|
||||||
'কি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
LEMMA: PRON_LEMMA,
|
||||||
'সে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'কিসে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
"Person": "One",
|
||||||
'তাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
|
"PronType": "Prs",
|
||||||
'স্বয়ং': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
"Case": "Acc",
|
||||||
'কোনগুলো': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
},
|
||||||
'তুমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
"কি": {
|
||||||
'তুই': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Number": "Sing",
|
||||||
'আমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One ', 'PronType': 'Prs', 'Case': 'Nom'},
|
"Gender": "Neut",
|
||||||
'যিনি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
|
"PronType": "Int",
|
||||||
'আমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Case": "Acc",
|
||||||
'কোন': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
},
|
||||||
'কারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
"সে": {
|
||||||
'তোমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তোকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Number": "Sing",
|
||||||
'খোদ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
"Person": "Three",
|
||||||
'কে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
"PronType": "Prs",
|
||||||
'যারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'},
|
"Case": "Nom",
|
||||||
'যে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
|
},
|
||||||
'তোমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
"কিসে": {
|
||||||
'তোরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তোমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Number": "Sing",
|
||||||
'তোদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Gender": "Neut",
|
||||||
'আপন': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
"PronType": "Int",
|
||||||
'এ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
|
"Case": "Acc",
|
||||||
'নিজ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
},
|
||||||
'কার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
"তাকে": {
|
||||||
'যা': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Rel', 'Case': 'Nom'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'আমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'}
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"স্বয়ং": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"কোনগুলো": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"PronType": "Int",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তুমি": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তুই": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তাদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"আমরা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "One ",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"যিনি": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Rel", "Case": "Nom"},
|
||||||
|
"আমাদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "One",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"কোন": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"কারা": {LEMMA: PRON_LEMMA, "Number": "Plur", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"তোমাকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তোকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"খোদ": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"কে": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"যারা": {LEMMA: PRON_LEMMA, "Number": "Plur", "PronType": "Rel", "Case": "Nom"},
|
||||||
|
"যে": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Rel", "Case": "Nom"},
|
||||||
|
"তোমরা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোরা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোমাদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তোদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"আপন": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"এ": {LEMMA: PRON_LEMMA, "PronType": "Dem"},
|
||||||
|
"নিজ": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"কার": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"যা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"PronType": "Rel",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তারা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"আমি": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "One",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"PRP$": {
|
"PRP$": {
|
||||||
|
"আমার": {
|
||||||
'আমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
LEMMA: PRON_LEMMA,
|
||||||
'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'মোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Person": "One",
|
||||||
'Case': 'Nom'},
|
"PronType": "Prs",
|
||||||
'মোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Poss": "Yes",
|
||||||
'Case': 'Nom'},
|
"Case": "Nom",
|
||||||
'তার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
},
|
||||||
'Case': 'Nom'},
|
"মোর": {
|
||||||
'তোমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
LEMMA: PRON_LEMMA,
|
||||||
'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'আমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Person": "One",
|
||||||
'Case': 'Nom'},
|
"PronType": "Prs",
|
||||||
'তোমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Poss": "Yes",
|
||||||
'Case': 'Nom'},
|
"Case": "Nom",
|
||||||
'তোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
},
|
||||||
'Case': 'Nom'},
|
"মোদের": {
|
||||||
'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
LEMMA: PRON_LEMMA,
|
||||||
'Case': 'Nom'},
|
"Number": "Plur",
|
||||||
'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
"Person": "One",
|
||||||
'তোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
"PronType": "Prs",
|
||||||
'Case': 'Nom'},
|
"Poss": "Yes",
|
||||||
'যাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
"Case": "Nom",
|
||||||
}
|
},
|
||||||
|
"তার": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোমাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"আমাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "One",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোমার": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোর": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"কাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"PronType": "Int",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তোদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"যাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"PronType": "Int",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@@ -2,30 +2,45 @@
from __future__ import unicode_literals

from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
+from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, QUOTES, UNITS


_currency = r"\$|¢|£|€|¥|฿|৳"
-_quotes = QUOTES.replace("'", '')
-_list_punct = LIST_PUNCT + '। ॥'.strip().split()
+_quotes = QUOTES.replace("'", "")
+_list_punct = LIST_PUNCT + "। ॥".strip().split()


-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
+_prefixes = [r"\+"] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS

-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
-             [r'(?<=[0-9])\+',
-              r'(?<=°[FfCcKk])\.',
-              r'(?<=[0-9])(?:{})'.format(_currency),
-              r'(?<=[0-9])(?:{})'.format(UNITS),
-              r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
+_suffixes = (
+    _list_punct
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{})".format(_currency),
+        r"(?<=[0-9])(?:{})".format(UNITS),
+        r"(?<=[{}(?:{})])\.".format(
+            "|".join([ALPHA_LOWER, r"%²\-\)\]\+", QUOTES]), _currency
+        ),
+    ]
+)

-_infixes = (LIST_ELLIPSES + LIST_ICONS +
-            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
-             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9{zero}-{nine}])[+\-\*^=](?=[0-9{zero}-{nine}-])".format(
+            zero="০", nine="৯"
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[{h}](?={ae})".format(a=ALPHA, h=HYPHENS, ae="এ"),
+        r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
+        r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA),
+    ]
+)


TOKENIZER_PREFIXES = _prefixes
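The _prefixes, _suffixes and _infixes lists above are regular-expression fragments that spaCy compiles into the tokenizer's split rules. A minimal sketch of how such lists are typically compiled and queried with spaCy's helpers follows; the tiny rule lists here are simplified stand-ins for the Bengali rules above, not the real ones.

# Sketch: compiling punctuation rule lists with spaCy's regex helpers.
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

prefixes = [r"\+", r"\("]            # stand-in prefix rules
suffixes = [r"\)", r"(?<=[0-9])\+"]  # stand-in suffix rules
infixes = [r"(?<=[a-z]),(?=[a-z])"]  # stand-in infix rule

prefix_search = compile_prefix_regex(prefixes).search
suffix_search = compile_suffix_regex(suffixes).search
infix_finditer = compile_infix_regex(infixes).finditer

print(prefix_search("+100"))                       # leading "+" is split off
print(suffix_search("100+"))                       # trailing "+" after a digit is split off
print([m.group() for m in infix_finditer("a,b")])  # [","]: comma between letters splits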
@@ -2,43 +2,45 @@
from __future__ import unicode_literals


-STOP_WORDS = set("""
+STOP_WORDS = set(
+    """
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও
ইত্যাদি ইহা
উচিত উনি উপর উপরে উত্তর
এ এঁদের এঁরা এই এক একই একজন একটা একটি একবার একে এখন এখনও এখানে এখানেই এটা এসো
এটাই এটি এত এতটাই এতে এদের এবং এবার এমন এমনি এমনকি এর এরা এলো এস এসে
ঐ
ও ওঁদের ওঁর ওঁরা ওই ওকে ওখানে ওদের ওর ওরা
কখনও কত কথা কবে কয়েক কয়েকটি করছে করছেন করতে করবে করবেন করলে কয়েক কয়েকটি করিয়ে করিয়া করায়
করলেন করা করাই করায় করার করি করিতে করিয়া করিয়ে করে করেই করেছিলেন করেছে করেছেন করেন কাউকে
কাছ কাছে কাজ কাজে কারও কারণ কি কিংবা কিছু কিছুই কিন্তু কী কে কেউ কেউই কেন কোন কোনও কোনো কেমনে কোটি
ক্ষেত্রে খুব
গিয়ে গিয়েছে গুলি গেছে গেল গেলে গোটা গিয়ে গিয়েছে
চলে চান চায় চেয়ে চায় চেয়ে চার চালু চেষ্টা
ছাড়া ছাড়াও ছিল ছিলেন ছাড়া ছাড়াও
জন জনকে জনের জন্য জন্যে জানতে জানা জানানো জানায় জানিয়ে জানিয়েছে জানায় জাানিয়ে জানিয়েছে
টি
ঠিক
তখন তত তথা তবু তবে তা তাঁকে তাঁদের তাঁর তাঁরা তাঁহারা তাই তাও তাকে তাতে তাদের তার তারপর তারা তারই তাহলে তাহা তাহাতে তাহার তিনই
তিনি তিনিও তুমি তুলে তেমন তো তোমার তুই তোরা তোর তোমাদের তোদের
থাকবে থাকবেন থাকা থাকায় থাকে থাকেন থেকে থেকেই থেকেও থাকায়
দিকে দিতে দিয়ে দিয়েছে দিয়েছেন দিলেন দিয়ে দু দুটি দুটো দেওয়া দেওয়ার দেখতে দেখা দেখে দেন দেয় দেশের
দ্বারা দিয়েছে দিয়েছেন দেয় দেওয়া দেওয়ার দিন দুই
ধরা ধরে
নয় না নাই নাকি নাগাদ নানা নিজে নিজেই নিজেদের নিজের নিতে নিয়ে নিয়ে নেই নেওয়া নেওয়ার নয় নতুন
পক্ষে পর পরে পরেই পরেও পর্যন্ত পাওয়া পারি পারে পারেন পেয়ে প্রতি প্রভৃতি প্রায় পাওয়া পেয়ে প্রায় পাঁচ প্রথম প্রাথমিক
ফলে ফিরে ফের
বছর বদলে বরং বলতে বলল বললেন বলা বলে বলেছেন বলেন বসে বহু বা বাদে বার বিনা বিভিন্ন বিশেষ বিষয়টি বেশ ব্যবহার ব্যাপারে বক্তব্য বন বেশি
ভাবে ভাবেই
মত মতো মতোই মধ্যভাগে মধ্যে মধ্যেই মধ্যেও মনে মাত্র মাধ্যমে মানুষ মানুষের মোট মোটেই মোদের মোর
যখন যত যতটা যথেষ্ট যদি যদিও যা যাঁর যাঁরা যাওয়া যাওয়ার যাকে যাচ্ছে যাতে যাদের যান যাবে যায় যার যারা যায় যিনি যে যেখানে যেতে যেন
যেমন
রকম রয়েছে রাখা রেখে রয়েছে
লক্ষ
শুধু শুরু
সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং
হইতে হইবে হইয়া হওয়া হওয়ায় হওয়ার হচ্ছে হত হতে হতেই হন হবে হবেন হয় হয়তো হয়নি হয়ে হয়েই হয়েছিল হয়েছে হাজার
হয়েছেন হল হলে হলেই হলেও হলো হিসাবে হিসেবে হৈলে হোক হয় হয়ে হয়েছে হৈতে হইয়া হয়েছিল হয়েছেন হয়নি হয়েই হয়তো হওয়া হওয়ার হওয়ায়
-""".split())
+""".split()
+)
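The change above only re-wraps the set() call for formatting; the stop-word list itself is untouched. As a quick usage sketch (assuming this hunk belongs to the Bengali module, i.e. spacy.lang.bn.stop_words), the set is consulted through plain membership checks:

# Sketch: consulting the stop-word set directly.
from spacy.lang.bn.stop_words import STOP_WORDS

print("অতএব" in STOP_WORDS)  # True: listed in the block above
print(len(STOP_WORDS))       # size of the whitespace-split set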
@@ -6,72 +6,77 @@ from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM


TAG_MAP = {
    ".": {POS: PUNCT, "PunctType": "peri"},
    ",": {POS: PUNCT, "PunctType": "comm"},
    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
-    "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
+    '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    ":": {POS: PUNCT},
    "৳": {POS: SYM, "Other": {"SymType": "currency"}},
    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
    "AFX": {POS: ADJ, "Hyph": "yes"},
    "CC": {POS: CONJ, "ConjType": "coor"},
    "CD": {POS: NUM, "NumType": "card"},
    "DT": {POS: DET},
    "EX": {POS: ADV, "AdvType": "ex"},
    "FW": {POS: X, "Foreign": "yes"},
    "HYPH": {POS: PUNCT, "PunctType": "dash"},
    "IN": {POS: ADP},
    "JJ": {POS: ADJ, "Degree": "pos"},
    "JJR": {POS: ADJ, "Degree": "comp"},
    "JJS": {POS: ADJ, "Degree": "sup"},
    "LS": {POS: PUNCT, "NumType": "ord"},
    "MD": {POS: VERB, "VerbType": "mod"},
    "NIL": {POS: ""},
    "NN": {POS: NOUN, "Number": "sing"},
    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
    "NNS": {POS: NOUN, "Number": "plur"},
    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
    "POS": {POS: PART, "Poss": "yes"},
    "PRP": {POS: PRON, "PronType": "prs"},
    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
    "RB": {POS: ADV, "Degree": "pos"},
    "RBR": {POS: ADV, "Degree": "comp"},
    "RBS": {POS: ADV, "Degree": "sup"},
    "RP": {POS: PART},
-    "SYM": {POS: SYM},
    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
    "UH": {POS: INTJ},
    "VB": {POS: VERB, "VerbForm": "inf"},
    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
-    "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
+    "VBZ": {
+        POS: VERB,
+        "VerbForm": "fin",
+        "Tense": "pres",
+        "Number": "sing",
+        "Person": 3,
+    },
    "WDT": {POS: ADJ, "PronType": "int|rel"},
    "WP": {POS: NOUN, "PronType": "int|rel"},
    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
    "WRB": {POS: ADV, "PronType": "int|rel"},
    "SP": {POS: SPACE},
    "ADV": {POS: ADV},
    "NOUN": {POS: NOUN},
    "ADP": {POS: ADP},
    "PRON": {POS: PRON},
    "SCONJ": {POS: SCONJ},
    "PROPN": {POS: PROPN},
    "DET": {POS: DET},
    "SYM": {POS: SYM},
    "INTJ": {POS: INTJ},
    "PUNCT": {POS: PUNCT},
    "NUM": {POS: NUM},
    "AUX": {POS: AUX},
    "X": {POS: X},
    "CONJ": {POS: CONJ},
    "CCONJ": {POS: CCONJ},
    "ADJ": {POS: ADJ},
    "VERB": {POS: VERB},
    "PART": {POS: PART},
}
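TAG_MAP translates fine-grained, Penn-style tags into a coarse part of speech plus morphological features; the hunk above only reflows the "VBZ" entry and drops the duplicated "SYM" key. A small illustration of the lookup follows, with plain strings standing in for the POS symbol constants and a made-up coarse_pos helper:

# Illustration only; plain strings replace spaCy's POS symbols.
TAG_MAP = {
    "VBZ": {
        "POS": "VERB",
        "VerbForm": "fin",
        "Tense": "pres",
        "Number": "sing",
        "Person": 3,
    },
    "NN": {"POS": "NOUN", "Number": "sing"},
}


def coarse_pos(fine_tag):
    # Map a fine-grained tag to its coarse POS, defaulting to "X" for unknown tags.
    return TAG_MAP.get(fine_tag, {"POS": "X"})["POS"]


print(coarse_pos("VBZ"))  # VERB
print(coarse_pos("XYZ"))  # X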
@@ -19,7 +19,8 @@ for exc_data in [
    {ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
    {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
    {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
-    {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
+    {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"},
+]:
    _exc[exc_data[ORTH]] = [exc_data]
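Tokenizer exceptions like these keep abbreviations such as "কি.মি" together as a single token with an attached lemma instead of letting the punctuation rules split them at the period. The sketch below mirrors the loop above, with plain string keys standing in for the ORTH and LEMMA attribute constants:

# Sketch of the exception table built by the loop above (string keys for illustration).
_exc = {}
for exc_data in [
    {"ORTH": "কি.মি", "LEMMA": "কিলোমিটার"},
    {"ORTH": "মি.লি.", "LEMMA": "মিলিলিটার"},
]:
    # Each surface form maps to a one-element token list, so the tokenizer
    # emits it as a single token carrying the given lemma.
    _exc[exc_data["ORTH"]] = [exc_data]

print(_exc["কি.মি"])  # [{'ORTH': 'কি.মি', 'LEMMA': 'কিলোমিটার'}]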
spacy/lang/ca/__init__.py (new file, 33 lines)
@@ -0,0 +1,33 @@
# coding: utf8
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


class CatalanDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ca"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    lemma_lookup = LOOKUP


class Catalan(Language):
    lang = "ca"
    Defaults = CatalanDefaults


__all__ = ["Catalan"]
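The new Catalan class follows spaCy's standard language-subclass pattern: CatalanDefaults bundles the rule-based language data, and the Catalan subclass of Language wires it in. A short usage sketch follows; no trained model is involved, so only tokenization and the lexical attributes defined by the defaults apply.

# Sketch: instantiating the blank Catalan pipeline added above.
from spacy.lang.ca import Catalan

nlp = Catalan()
doc = nlp("El gat menja peix")           # one of the example sentences added below
print([token.text for token in doc])     # tokenizer output
print([token.is_stop for token in doc])  # stop-word flags come from STOP_WORDS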
spacy/lang/ca/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.ca.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars",
    "Els cotxes autònoms deleguen la responsabilitat de l'assegurança als seus fabricants",
    "San Francisco analitza prohibir els robots de repartiment",
    "Londres és una gran ciutat del Regne Unit",
    "El gat menja peix",
    "Veig a l'home amb el telescopi",
    "L'Aranya menja mosques",
    "El pingüí incuba en el seu niu",
]
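As the docstring's doctest hints, these sentences are meant to be piped through a loaded pipeline. A minimal sketch using the blank Catalan class from the previous file; with no trained components this only exercises tokenization.

# Sketch: running the Catalan example sentences through a blank pipeline.
from spacy.lang.ca import Catalan
from spacy.lang.ca.examples import sentences

nlp = Catalan()
for doc in nlp.pipe(sentences):
    print(len(doc), doc.text)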
Some files were not shown because too many files have changed in this diff.