diff --git a/.github/contributors/GuiGel.md b/.github/contributors/GuiGel.md new file mode 100644 index 000000000..43fb0f757 --- /dev/null +++ b/.github/contributors/GuiGel.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. 
The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Guillaume Gelabert | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-11-15 | +| GitHub username | GuiGel | +| Website (optional) | | diff --git a/.github/contributors/Olamyy.md b/.github/contributors/Olamyy.md new file mode 100644 index 000000000..711144825 --- /dev/null +++ b/.github/contributors/Olamyy.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ x ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Olamilekan Wahab | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 8/11/2019 | +| GitHub username | Olamyy | +| Website (optional) | | diff --git a/.github/contributors/aajanki.md b/.github/contributors/aajanki.md new file mode 100644 index 000000000..de7dc7fa2 --- /dev/null +++ b/.github/contributors/aajanki.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Antti Ajanki | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-11-30 | +| GitHub username | aajanki | +| Website (optional) | | diff --git a/.github/contributors/erip.md b/.github/contributors/erip.md new file mode 100644 index 000000000..56df07338 --- /dev/null +++ b/.github/contributors/erip.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Elijah Rippeth | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-11-16 | +| GitHub username | erip | +| Website (optional) | | diff --git a/.github/contributors/mmaybeno.md b/.github/contributors/mmaybeno.md new file mode 100644 index 000000000..603cd5bba --- /dev/null +++ b/.github/contributors/mmaybeno.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matt Maybeno | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-11-19 | +| GitHub username | mmaybeno | +| Website (optional) | | diff --git a/.github/contributors/mr-bjerre.md b/.github/contributors/mr-bjerre.md new file mode 100644 index 000000000..ad3695bef --- /dev/null +++ b/.github/contributors/mr-bjerre.md @@ -0,0 +1,87 @@ +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Nicolai Bjerre Pedersen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-12-06 | +| GitHub username | mr_bjerre | +| Website (optional) | | diff --git a/.github/contributors/questoph.md b/.github/contributors/questoph.md new file mode 100644 index 000000000..24559c098 --- /dev/null +++ b/.github/contributors/questoph.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Christoph Purschke | +| Company name (if applicable) | University of Luxembourg | +| Title or role (if applicable) | | +| Date | 14/11/2019 | +| GitHub username | questoph | +| Website (optional) | https://purschke.info | diff --git a/README.md b/README.md index 980fc5b0b..74d2d2166 100644 --- a/README.md +++ b/README.md @@ -72,21 +72,21 @@ it. ## Features -- Non-destructive **tokenization** -- **Named entity** recognition -- Support for **50+ languages** -- pretrained [statistical models](https://spacy.io/models) and word vectors -- State-of-the-art speed -- Easy **deep learning** integration -- Part-of-speech tagging -- Labelled dependency parsing -- Syntax-driven sentence segmentation -- Built in **visualizers** for syntax and NER -- Convenient string-to-hash mapping -- Export to numpy data arrays -- Efficient binary serialization -- Easy **model packaging** and deployment -- Robust, rigorously evaluated accuracy +- Non-destructive **tokenization** +- **Named entity** recognition +- Support for **50+ languages** +- pretrained [statistical models](https://spacy.io/models) and word vectors +- State-of-the-art speed +- Easy **deep learning** integration +- Part-of-speech tagging +- Labelled dependency parsing +- Syntax-driven sentence segmentation +- Built in **visualizers** for syntax and NER +- Convenient string-to-hash mapping +- Export to numpy data arrays +- Efficient binary serialization +- Easy **model packaging** and deployment +- Robust, rigorously evaluated accuracy 📖 **For more details, see the [facts, figures and benchmarks](https://spacy.io/usage/facts-figures).** @@ -96,10 +96,10 @@ it. For detailed installation instructions, see the [documentation](https://spacy.io/usage). 
-- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual - Studio) -- **Python version**: Python 2.7, 3.5+ (only 64 bit) -- **Package managers**: [pip] · [conda] (via `conda-forge`) +- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual + Studio) +- **Python version**: Python 2.7, 3.5+ (only 64 bit) +- **Package managers**: [pip] · [conda] (via `conda-forge`) [pip]: https://pypi.org/project/spacy/ [conda]: https://anaconda.org/conda-forge/spacy diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 512c6414c..054365336 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -35,24 +35,12 @@ jobs: dependsOn: 'Validate' strategy: matrix: - # Python 2.7 currently doesn't work because it seems to be a narrow - # unicode build, which causes problems with the regular expressions - - # Python27Linux: - # imageName: 'ubuntu-16.04' - # python.version: '2.7' - # Python27Mac: - # imageName: 'macos-10.13' - # python.version: '2.7' Python35Linux: imageName: 'ubuntu-16.04' python.version: '3.5' Python35Windows: imageName: 'vs2017-win2016' python.version: '3.5' - Python35Mac: - imageName: 'macos-10.13' - python.version: '3.5' Python36Linux: imageName: 'ubuntu-16.04' python.version: '3.6' @@ -62,15 +50,25 @@ jobs: Python36Mac: imageName: 'macos-10.13' python.version: '3.6' - Python37Linux: + # Don't test on 3.7 for now to speed up builds + # Python37Linux: + # imageName: 'ubuntu-16.04' + # python.version: '3.7' + # Python37Windows: + # imageName: 'vs2017-win2016' + # python.version: '3.7' + # Python37Mac: + # imageName: 'macos-10.13' + # python.version: '3.7' + Python38Linux: imageName: 'ubuntu-16.04' - python.version: '3.7' - Python37Windows: + python.version: '3.8' + Python38Windows: imageName: 'vs2017-win2016' - python.version: '3.7' - Python37Mac: + python.version: '3.8' + Python38Mac: imageName: 'macos-10.13' - python.version: '3.7' + python.version: '3.8' maxParallel: 4 pool: vmImage: $(imageName) @@ -81,10 +79,8 @@ jobs: versionSpec: '$(python.version)' architecture: 'x64' - # Downgrading pip is necessary to prevent a wheel version incompatiblity. - # Might be fixed in the future or some other way, so investigate again. 
- script: | - python -m pip install -U pip==18.1 setuptools + python -m pip install -U setuptools pip install -r requirements.txt displayName: 'Install dependencies' diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index 75bf55771..bda22088d 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -8,6 +8,7 @@ import plac from pathlib import Path import re import json +import tqdm import spacy import spacy.util @@ -225,6 +226,13 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): def write_conllu(docs, file_): + if not Token.has_extension("get_conllu_lines"): + Token.set_extension("get_conllu_lines", method=get_token_conllu) + if not Token.has_extension("begins_fused"): + Token.set_extension("begins_fused", default=False) + if not Token.has_extension("inside_fused"): + Token.set_extension("inside_fused", default=False) + merger = Matcher(docs[0].vocab) merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) for i, doc in enumerate(docs): @@ -483,8 +491,9 @@ def main( vectors_dir=None, use_oracle_segments=False, ): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm + Token.set_extension("get_conllu_lines", method=get_token_conllu) + Token.set_extension("begins_fused", default=False) + Token.set_extension("inside_fused", default=False) Token.set_extension("get_conllu_lines", method=get_token_conllu) Token.set_extension("begins_fused", default=False) diff --git a/bin/wiki_entity_linking/entity_linker_evaluation.py b/bin/wiki_entity_linking/entity_linker_evaluation.py index 94bafbf30..273ade0cd 100644 --- a/bin/wiki_entity_linking/entity_linker_evaluation.py +++ b/bin/wiki_entity_linking/entity_linker_evaluation.py @@ -1,6 +1,7 @@ import logging import random +from tqdm import tqdm from collections import defaultdict logger = logging.getLogger(__name__) @@ -119,8 +120,6 @@ def get_eval_results(data, el_pipe=None): Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL. If the docs in the data require further processing with an entity linker, set el_pipe. """ - from tqdm import tqdm - docs = [] golds = [] for d, g in tqdm(data, leave=False): diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index 25e914b32..19df0cf10 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -6,6 +6,7 @@ import bz2 import logging import random import json +from tqdm import tqdm from functools import partial @@ -457,9 +458,6 @@ def read_training(nlp, entity_file_path, dev, limit, kb, labels_discard=None): """ This method provides training examples that correspond to the entity annotations found by the nlp object. For training, it will include both positive and negative examples by using the candidate generator from the kb. 
For testing (kb=None), it will include all positive examples only.""" - - from tqdm import tqdm - if not labels_discard: labels_discard = [] diff --git a/examples/training/conllu.py b/examples/training/conllu.py index 08febda50..bf47be72a 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -7,6 +7,7 @@ import attr from pathlib import Path import re import json +import tqdm import spacy import spacy.util @@ -291,11 +292,6 @@ def get_token_conllu(token, i): return "\n".join(lines) -Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True) -Token.set_extension("begins_fused", default=False, force=True) -Token.set_extension("inside_fused", default=False, force=True) - - ################## # Initialization # ################## @@ -394,8 +390,9 @@ class TreebankPaths(object): limit=("Size limit", "option", "n", int), ) def main(ud_dir, parses_dir, config, corpus, limit=0): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm + Token.set_extension("get_conllu_lines", method=get_token_conllu) + Token.set_extension("begins_fused", default=False) + Token.set_extension("inside_fused", default=False) Token.set_extension("get_conllu_lines", method=get_token_conllu) Token.set_extension("begins_fused", default=False) @@ -426,10 +423,7 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): for batch in batches: pbar.update(sum(len(ex.doc) for ex in batch)) nlp.update( - examples=batch, - sgd=optimizer, - drop=config.dropout, - losses=losses, + examples=batch, sgd=optimizer, drop=config.dropout, losses=losses, ) out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) diff --git a/examples/training/pretrain_kb.py b/examples/training/pretrain_kb.py index a69e97e14..db6442ad4 100644 --- a/examples/training/pretrain_kb.py +++ b/examples/training/pretrain_kb.py @@ -8,8 +8,8 @@ For more details, see the documentation: * Knowledge base: https://spacy.io/api/kb * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking -Compatible with: spaCy v2.2 -Last tested with: v2.2 +Compatible with: spaCy v2.2.3 +Last tested with: v2.2.3 """ from __future__ import unicode_literals, print_function diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index 828479881..64f7002ef 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -14,6 +14,7 @@ pre-train with the development data, but also not *so* terrible: we're not using the development labels, after all --- only the unlabelled text. 
""" import plac +import tqdm import random import spacy import thinc.extra.datasets @@ -106,9 +107,6 @@ def create_pipeline(width, embed_size, vectors_model): def train_tensorizer(nlp, texts, dropout, n_iter): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - tensorizer = nlp.create_pipe("tensorizer") nlp.add_pipe(tensorizer) optimizer = nlp.begin_training() @@ -122,9 +120,6 @@ def train_tensorizer(nlp, texts, dropout, n_iter): def train_textcat(nlp, n_texts, n_iter=10): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - textcat = nlp.get_pipe("textcat") tok2vec_weights = textcat.model.tok2vec.to_bytes() (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index 9d7357b8c..f44c3b9cc 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -8,8 +8,8 @@ For more details, see the documentation: * Training: https://spacy.io/usage/training * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking -Compatible with: spaCy v2.2 -Last tested with: v2.2 +Compatible with: spaCy v2.2.3 +Last tested with: v2.2.3 """ from __future__ import unicode_literals, print_function @@ -22,6 +22,7 @@ from spacy.vocab import Vocab import spacy from spacy.kb import KnowledgeBase +from spacy.pipeline import EntityRuler from spacy.tokens import Span from spacy.util import minibatch, compounding @@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): nlp.vocab.vectors.name = "spacy_pretrained_vectors" print("Created blank 'en' model with vocab from '%s'" % vocab_path) - # create the built-in pipeline components and add them to the pipeline - # nlp.create_pipe works for built-ins that are registered with spaCy + # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy. + nlp.add_pipe(nlp.create_pipe('sentencizer')) + + # Add a custom component to recognize "Russ Cochran" as an entity for the example training data. + # Note that in a realistic application, an actual NER algorithm should be used instead. + ruler = EntityRuler(nlp) + patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + # Create the Entity Linker component and add it to the pipeline. if "entity_linker" not in nlp.pipe_names: - entity_linker = nlp.create_pipe("entity_linker") + # use only the predicted EL score and not the prior probability (for demo purposes) + cfg = {"incl_prior": False} + entity_linker = nlp.create_pipe("entity_linker", cfg) kb = KnowledgeBase(vocab=nlp.vocab) kb.load_bulk(kb_path) print("Loaded Knowledge Base from '%s'" % kb_path) entity_linker.set_kb(kb) nlp.add_pipe(entity_linker, last=True) - else: - entity_linker = nlp.get_pipe("entity_linker") - kb = entity_linker.kb - # make sure the annotated examples correspond to known identifiers in the knowlege base - kb_ids = kb.get_entity_strings() + # Convert the texts to docs to make sure we have doc.ents set for the training examples. + # Also ensure that the annotated examples correspond to known identifiers in the knowlege base. 
+ kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings() + TRAIN_DOCS = [] for text, annotation in TRAIN_DATA: + with nlp.disable_pipes("entity_linker"): + doc = nlp(text) + annotation_clean = annotation for offset, kb_id_dict in annotation["links"].items(): new_dict = {} for kb_id, value in kb_id_dict.items(): @@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): print( "Removed", kb_id, "from training because it is not in the KB." ) - annotation["links"][offset] = new_dict + annotation_clean["links"][offset] = new_dict + TRAIN_DOCS.append((doc, annotation_clean)) # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] @@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): # reset and initialize the weights randomly optimizer = nlp.begin_training() for itn in range(n_iter): - random.shuffle(TRAIN_DATA) + random.shuffle(TRAIN_DOCS) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update( batch, @@ -136,16 +151,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): def _apply_model(nlp): for text, annotation in TRAIN_DATA: - doc = nlp.tokenizer(text) - - # set entities so the evaluation is independent of the NER step - # all the examples contain 'Russ Cochran' as the first two tokens in the sentence - rc_ent = Span(doc, 0, 2, label=PERSON) - doc.ents = [rc_ent] - # apply the entity linker which will now make predictions for the 'Russ Cochran' entities - doc = nlp.get_pipe("entity_linker")(doc) - + doc = nlp(text) print() print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents]) print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc]) diff --git a/examples/vectors_tensorboard.py b/examples/vectors_tensorboard.py index b1160888d..72eda1edc 100644 --- a/examples/vectors_tensorboard.py +++ b/examples/vectors_tensorboard.py @@ -8,6 +8,7 @@ from __future__ import unicode_literals from os import path +import tqdm import math import numpy import plac @@ -35,9 +36,6 @@ from tensorflow.contrib.tensorboard.plugins.projector import ( ), ) def main(vectors_loc, out_loc, name="spaCy_vectors"): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - meta_file = "{}.tsv".format(name) out_meta_file = path.join(out_loc, meta_file) diff --git a/requirements.txt b/requirements.txt index 12f19bb88..1786ee186 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=7.3.0,<7.4.0 +thinc==7.4.0.dev0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 @@ -12,6 +12,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 pathlib==1.0.1; python_version < "3.4" +tqdm>=4.38.0,<5.0.0 # Optional dependencies jsonschema>=2.6.0,<3.1.0 # Development dependencies diff --git a/setup.cfg b/setup.cfg index 940066a9e..a0103c5a2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,13 +38,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=7.3.0,<7.4.0 + thinc==7.4.0.dev0 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=7.3.0,<7.4.0 + thinc==7.4.0.dev0 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=0.1.0,<1.1.0 @@ -73,7 +73,7 @@ 
cuda100 = cupy-cuda100>=5.0.0b4 # Language tokenizers with external dependencies ja = - mecab-python3==0.7 + fugashi>=0.1.3 ko = natto-py==0.9.0 th = diff --git a/spacy/about.py b/spacy/about.py index c6db9700f..a1880fb54 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.2" +__version__ = "2.2.3" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 0699bb5c1..3febd07d1 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -7,8 +7,9 @@ from spacy.gold import Example from ...gold import iob_to_biluo -def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, - ner_map=None, **_): +def conllu2json( + input_data, n_sents=10, use_morphology=False, lang=None, ner_map=None, **_ +): """ Convert conllu files into JSON format for use with train cli. use_morphology parameter enables appending morphology to tags, which is @@ -29,13 +30,19 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, has_ner_tags = False for i, example in enumerate(conll_data): if not checked_for_ner: - has_ner_tags = is_ner(example.token_annotation.entities[0], - MISC_NER_PATTERN) + has_ner_tags = is_ner( + example.token_annotation.entities[0], MISC_NER_PATTERN + ) checked_for_ner = True raw += example.text - sentences.append(generate_sentence(example.token_annotation, - has_ner_tags, MISC_NER_PATTERN, - ner_map=ner_map)) + sentences.append( + generate_sentence( + example.token_annotation, + has_ner_tags, + MISC_NER_PATTERN, + ner_map=ner_map, + ) + ) # Real-sized documents could be extracted using the comments on the # conllu document if len(sentences) % n_sents == 0: @@ -105,8 +112,9 @@ def read_conllx(input_data, use_morphology=False, n=0): if space: raw += " " example = Example(doc=raw) - example.set_token_annotation(ids=ids, words=words, tags=tags, - heads=heads, deps=deps, entities=ents) + example.set_token_annotation( + ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=ents + ) yield example i += 1 if 1 <= n <= i: @@ -143,13 +151,11 @@ def extract_tags(iob, tag_pattern, ner_map=None): return new_iob -def generate_sentence(token_annotation, has_ner_tags, tag_pattern, - ner_map=None): +def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None): sentence = {} tokens = [] if has_ner_tags: - iob = extract_tags(token_annotation.entities, tag_pattern, - ner_map=ner_map) + iob = extract_tags(token_annotation.entities, tag_pattern, ner_map=ner_map) biluo = iob_to_biluo(iob) for i, id in enumerate(token_annotation.ids): token = {} diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index cda21cbcc..3fa0cc890 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import plac import math +from tqdm import tqdm import numpy from ast import literal_eval from pathlib import Path @@ -116,9 +117,6 @@ def open_file(loc): def read_attrs_from_deprecated(freqs_loc, clusters_loc): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - from tqdm import tqdm - if freqs_loc is not None: with msg.loading("Counting frequencies..."): probs, _ = read_freqs(freqs_loc) @@ -201,9 +199,6 @@ def add_vectors(nlp, vectors_loc, 
prune_vectors, name=None): def read_vectors(vectors_loc): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - from tqdm import tqdm - f = open_file(vectors_loc) shape = tuple(int(size) for size in next(f).split()) vectors_data = numpy.zeros(shape=shape, dtype="f") @@ -220,9 +215,6 @@ def read_vectors(vectors_loc): def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - from tqdm import tqdm - counts = PreshCounter() total = 0 with freqs_loc.open() as f: @@ -252,9 +244,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): def read_clusters(clusters_loc): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - from tqdm import tqdm - clusters = {} if ftfy is None: user_warning(Warnings.W004) diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 4995224f3..4ee72fc23 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals, division, print_function import plac +import tqdm from pathlib import Path import srsly import cProfile @@ -46,9 +47,6 @@ def profile(model, inputs=None, n_texts=10000): def parse_texts(nlp, texts): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): pass diff --git a/spacy/cli/train.py b/spacy/cli/train.py index cdcbed0b3..daa90f022 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals, division, print_function import plac import os +import tqdm from pathlib import Path from thinc.neural._classes.model import Model from timeit import default_timer as timer @@ -88,10 +89,6 @@ def train( JSON format. To convert data from other formats, use the `spacy convert` command. """ - - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - util.fix_random_seed() util.set_env_log(verbose) @@ -524,9 +521,6 @@ def _score_for_model(meta): @contextlib.contextmanager def _create_progress_bar(total): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - if int(os.environ.get("LOG_FRIENDLY", 0)): yield else: diff --git a/spacy/errors.py b/spacy/errors.py index 0b6a6775c..ebbd314cd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -53,7 +53,9 @@ class Warnings(object): W009 = ("Custom factory '{name}' provided by entry points of another " "package overwrites built-in factory.") W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length " - "limit anymore, so the max_length argument is now deprecated.") + "limit anymore, so the max_length argument is now deprecated. " + "If you did not specify this parameter, make sure you call the " + "constructor with named arguments instead of positional ones.") W011 = ("It looks like you're calling displacy.serve from within a " "Jupyter notebook or a similar environment. This likely means " "you're already running a local web server, so there's no need to " @@ -72,7 +74,7 @@ class Warnings(object): "instead.") W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization " "methods is and should be replaced with `exclude`. 
This makes it " - "consistent with the other objects serializable.") + "consistent with the other serializable objects.") W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from " "being serialized or deserialized is deprecated. Please use the " "`exclude` argument instead. For example: exclude=['{arg}'].") @@ -81,7 +83,8 @@ class Warnings(object): "Future versions may introduce a `n_process` argument for " "parallel inference via multiprocessing.") W017 = ("Alias '{alias}' already exists in the Knowledge Base.") - W018 = ("Entity '{entity}' already exists in the Knowledge Base.") + W018 = ("Entity '{entity}' already exists in the Knowledge Base - " + "ignoring the duplicate entry.") W019 = ("Changing vectors name from {old} to {new}, to avoid clash with " "previously loaded vectors. See Issue #3853.") W020 = ("Unnamed vectors. This won't allow multiple vectors models to be " @@ -101,6 +104,7 @@ class Warnings(object): "the Knowledge Base.") W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " "previous components in the pipeline declare that they assign it.") + W026 = ("Unable to set all sentence boundaries from dependency parses.") @add_codes @@ -529,17 +533,19 @@ class Errors(object): E185 = ("Received invalid attribute in component attribute declaration: " "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.") E186 = ("'{tok_a}' and '{tok_b}' are different texts.") - E187 = ("Tokenizer special cases are not allowed to modify the text. " + E187 = ("Only unicode strings are supported as labels.") + E188 = ("Could not match the gold entity links to entities in the doc - " + "make sure the gold EL data refers to valid results of the " + "named entity recognizer in the `nlp` pipeline.") + # TODO: fix numbering after merging develop into master + E997 = ("Tokenizer special cases are not allowed to modify the text. " "This would map '{chunk}' to '{orth}' given token attributes " "'{token_attrs}'.") - - # TODO: fix numbering after merging develop into master E998 = ("Can only create GoldParse's from Example's without a Doc, " "if get_gold_parses() is called with a Vocab object.") E999 = ("Encountered an unexpected format for the dictionary holding " "gold annotations: {gold_dict}") - @add_codes class TempErrors(object): T003 = ("Resizing pretrained Tagger models is not currently supported.") diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 00ae7c5e8..d3316c5d0 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1121,7 +1121,7 @@ cdef class GoldParse: return not nonproj.is_nonproj_tree(self.heads) -def docs_to_json(docs, id=0): +def docs_to_json(docs, id=0, ner_missing_tag="O"): """Convert a list of Doc objects into the JSON-serializable format used by the spacy train command. 
@@ -1139,7 +1139,7 @@ def docs_to_json(docs, id=0): json_cat = {"label": cat, "value": val} json_para["cats"].append(json_cat) ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - biluo_tags = biluo_tags_from_offsets(doc, ent_offsets) + biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag) for j, sent in enumerate(doc.sents): json_sent = {"tokens": [], "brackets": []} for token in sent: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 31fd1706e..63eb41b42 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -136,29 +136,34 @@ cdef class KnowledgeBase: if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list): raise ValueError(Errors.E140) - nr_entities = len(entity_list) + nr_entities = len(set(entity_list)) self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) i = 0 cdef KBEntryC entry cdef hash_t entity_hash - while i < nr_entities: - entity_vector = vector_list[i] - if len(entity_vector) != self.entity_vector_length: - raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) - + while i < len(entity_list): + # only process this entity if its unique ID hadn't been added before entity_hash = self.vocab.strings.add(entity_list[i]) - entry.entity_hash = entity_hash - entry.freq = freq_list[i] + if entity_hash in self._entry_index: + user_warning(Warnings.W018.format(entity=entity_list[i])) - vector_index = self.c_add_vector(entity_vector=vector_list[i]) - entry.vector_index = vector_index + else: + entity_vector = vector_list[i] + if len(entity_vector) != self.entity_vector_length: + raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) - entry.feats_row = -1 # Features table currently not implemented + entry.entity_hash = entity_hash + entry.freq = freq_list[i] - self._entries[i+1] = entry - self._entry_index[entity_hash] = i+1 + vector_index = self.c_add_vector(entity_vector=vector_list[i]) + entry.vector_index = vector_index + + entry.feats_row = -1 # Features table currently not implemented + + self._entries[i+1] = entry + self._entry_index[entity_hash] = i+1 i += 1 diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 5ed2a2a8c..2c8823867 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -31,6 +31,10 @@ _latin_u_supplement = r"\u00C0-\u00D6\u00D8-\u00DE" _latin_l_supplement = r"\u00DF-\u00F6\u00F8-\u00FF" _latin_supplement = r"\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF" +_hangul_syllables = r"\uAC00-\uD7AF" +_hangul_jamo = r"\u1100-\u11FF" +_hangul = _hangul_syllables + _hangul_jamo + # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh _latin_u_extendedA = ( r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" @@ -202,7 +206,15 @@ _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower _uncased = ( - _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu + _bengali + + _hebrew + + _persian + + _sinhala + + _hindi + + _kannada + + _tamil + + _telugu + + _hangul ) ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) diff --git a/spacy/lang/el/tag_map.py b/spacy/lang/el/tag_map.py index 073849c23..30816dbe4 100644 --- a/spacy/lang/el/tag_map.py +++ b/spacy/lang/el/tag_map.py @@ -2,7 +2,7 @@ from __future__ import 
unicode_literals from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, PRON +from ...symbols import NOUN, PROPN, PART, INTJ, PRON, AUX TAG_MAP = { @@ -4249,4 +4249,20 @@ TAG_MAP = { "Voice": "Act", "Case": "Nom|Gen|Dat|Acc|Voc", }, + 'ADJ': {POS: ADJ}, + 'ADP': {POS: ADP}, + 'ADV': {POS: ADV}, + 'AtDf': {POS: DET}, + 'AUX': {POS: AUX}, + 'CCONJ': {POS: CCONJ}, + 'DET': {POS: DET}, + 'NOUN': {POS: NOUN}, + 'NUM': {POS: NUM}, + 'PART': {POS: PART}, + 'PRON': {POS: PRON}, + 'PROPN': {POS: PROPN}, + 'SCONJ': {POS: SCONJ}, + 'SYM': {POS: SYM}, + 'VERB': {POS: VERB}, + 'X': {POS: X}, } diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py index e6b93e318..7a7c9d549 100644 --- a/spacy/lang/es/tag_map.py +++ b/spacy/lang/es/tag_map.py @@ -305,6 +305,9 @@ TAG_MAP = { "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", POS: VERB}, "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", POS: VERB}, "X___": {"morph": "_", POS: X}, + "___PunctType=Quot": {POS: PUNCT}, + "___VerbForm=Inf": {POS: VERB}, + "___Number=Sing|Person=2|PronType=Prs": {POS: PRON}, "_SP": {"morph": "_", POS: SPACE}, } # fmt: on diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 6debe999c..45d2f886f 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -13,10 +15,13 @@ from ...util import update_exc, add_lookups class FinnishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "fi" lex_attr_getters[NORM] = add_lookups( Language.Defaults.lex_attr_getters[NORM], BASE_NORMS ) + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS diff --git a/spacy/lang/fi/lex_attrs.py b/spacy/lang/fi/lex_attrs.py index 97c876837..e960b55eb 100644 --- a/spacy/lang/fi/lex_attrs.py +++ b/spacy/lang/fi/lex_attrs.py @@ -18,7 +18,8 @@ _num_words = [ "kymmenen", "yksitoista", "kaksitoista", - "kolmetoista" "neljätoista", + "kolmetoista", + "neljätoista", "viisitoista", "kuusitoista", "seitsemäntoista", diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py new file mode 100644 index 000000000..02eb1b200 --- /dev/null +++ b/spacy/lang/fi/punctuation.py @@ -0,0 +1,33 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_SUFFIXES + + +_quotes = CONCAT_QUOTES.replace("'", "") + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + +_suffixes = [ + suffix + for suffix in TOKENIZER_SUFFIXES + if suffix not in ["'s", "'S", "’s", "’S", r"\'"] 
+] + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py index 4439376c8..4fa931fde 100644 --- a/spacy/lang/it/punctuation.py +++ b/spacy/lang/it/punctuation.py @@ -5,7 +5,7 @@ from ..punctuation import TOKENIZER_INFIXES from ..char_classes import ALPHA -ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") +ELISION = " ' ’ ".strip().replace(" ", "") _infixes = TOKENIZER_INFIXES + [ diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 056a6893b..0538461a3 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -12,21 +12,23 @@ from ...tokens import Doc from ...compat import copy_reg from ...util import DummyTokenizer +# Handling for multiple spaces in a row is somewhat awkward, this simplifies +# the flow by creating a dummy with the same interface. +DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"]) +DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"]) +DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' ')) -ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"]) - - -def try_mecab_import(): - """Mecab is required for Japanese support, so check for it. +def try_fugashi_import(): + """Fugashi is required for Japanese support, so check for it. It it's not available blow up and explain how to fix it.""" try: - import MeCab + import fugashi - return MeCab + return fugashi except ImportError: raise ImportError( - "Japanese support requires MeCab: " - "https://github.com/SamuraiT/mecab-python3" + "Japanese support requires Fugashi: " + "https://github.com/polm/fugashi" ) @@ -39,7 +41,7 @@ def resolve_pos(token): """ # this is only used for consecutive ascii spaces - if token.pos == "空白": + if token.surface == " ": return "空白" # TODO: This is a first take. The rules here are crude approximations. @@ -53,55 +55,45 @@ def resolve_pos(token): return token.pos + ",ADJ" return token.pos +def get_words_and_spaces(tokenizer, text): + """Get the individual tokens that make up the sentence and handle white space. + + Japanese doesn't usually use white space, and MeCab's handling of it for + multiple spaces in a row is somewhat awkward. + """ + + tokens = tokenizer.parseToNodeList(text) -def detailed_tokens(tokenizer, text): - """Format Mecab output into a nice data structure, based on Janome.""" - node = tokenizer.parseToNode(text) - node = node.next # first node is beginning of sentence and empty, skip it words = [] spaces = [] - while node.posid != 0: - surface = node.surface - base = surface # a default value. Updated if available later. - parts = node.feature.split(",") - pos = ",".join(parts[0:4]) - if len(parts) > 7: - # this information is only available for words in the tokenizer - # dictionary - base = parts[7] - words.append(ShortUnitWord(surface, base, pos)) - - # The way MeCab stores spaces is that the rlength of the next token is - # the length of that token plus any preceding whitespace, **in bytes**. - # also note that this is only for half-width / ascii spaces. Full width - # spaces just become tokens. 
- scount = node.next.rlength - node.next.length - spaces.append(bool(scount)) - while scount > 1: - words.append(ShortUnitWord(" ", " ", "空白")) + for token in tokens: + # If there's more than one space, spaces after the first become tokens + for ii in range(len(token.white_space) - 1): + words.append(DummySpace) spaces.append(False) - scount -= 1 - node = node.next + words.append(token) + spaces.append(bool(token.white_space)) return words, spaces - class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.tokenizer = try_mecab_import().Tagger() - self.tokenizer.parseToNode("") # see #2901 + self.tokenizer = try_fugashi_import().Tagger() + self.tokenizer.parseToNodeList("") # see #2901 def __call__(self, text): - dtokens, spaces = detailed_tokens(self.tokenizer, text) + dtokens, spaces = get_words_and_spaces(self.tokenizer, text) words = [x.surface for x in dtokens] doc = Doc(self.vocab, words=words, spaces=spaces) - mecab_tags = [] + unidic_tags = [] for token, dtoken in zip(doc, dtokens): - mecab_tags.append(dtoken.pos) + unidic_tags.append(dtoken.pos) token.tag_ = resolve_pos(dtoken) - token.lemma_ = dtoken.lemma - doc.user_data["mecab_tags"] = mecab_tags + + # if there's no lemma info (it's an unk) just use the surface + token.lemma_ = dtoken.feature.lemma or dtoken.surface + doc.user_data["unidic_tags"] = unidic_tags return doc @@ -131,5 +123,4 @@ def pickle_japanese(instance): copy_reg.pickle(Japanese, pickle_japanese) - __all__ = ["Japanese"] diff --git a/spacy/lang/ko/lex_attrs.py b/spacy/lang/ko/lex_attrs.py new file mode 100644 index 000000000..1904a0ece --- /dev/null +++ b/spacy/lang/ko/lex_attrs.py @@ -0,0 +1,67 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + + +_num_words = [ + "영", + "공", + # Native Korean number system + "하나", + "둘", + "셋", + "넷", + "다섯", + "여섯", + "일곱", + "여덟", + "아홉", + "열", + "스물", + "서른", + "마흔", + "쉰", + "예순", + "일흔", + "여든", + "아흔", + # Sino-Korean number system + "일", + "이", + "삼", + "사", + "오", + "육", + "칠", + "팔", + "구", + "십", + "백", + "천", + "만", + "십만", + "백만", + "천만", + "일억", + "십억", + "백억", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if any(char.lower() in _num_words for char in text): + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index 87058bdea..4fcfaddb4 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS +from .punctuation import TOKENIZER_INFIXES from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS @@ -24,6 +25,7 @@ class LuxembourgishDefaults(Language.Defaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS tag_map = TAG_MAP + infixes = TOKENIZER_INFIXES class Luxembourgish(Language): diff --git a/spacy/lang/lb/norm_exceptions.py b/spacy/lang/lb/norm_exceptions.py index 101102ca4..7063e6863 100644 --- a/spacy/lang/lb/norm_exceptions.py +++ b/spacy/lang/lb/norm_exceptions.py @@ -6,7 +6,7 @@ from __future__ 
import unicode_literals # variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.) # here one could include the most common spelling mistakes -_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"} +_exc = {"dass": "datt", "viläicht": "vläicht"} NORM_EXCEPTIONS = {} diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py new file mode 100644 index 000000000..68531d9d0 --- /dev/null +++ b/spacy/lang/lb/punctuation.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER + +ELISION = " ' ’ ".strip().replace(" ", "") + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION), + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[0-9])-(?=[0-9])", + ] +) + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py index adff9da36..8a35b6fb7 100644 --- a/spacy/lang/lb/tokenizer_exceptions.py +++ b/spacy/lang/lb/tokenizer_exceptions.py @@ -2,33 +2,17 @@ from __future__ import unicode_literals from ...symbols import ORTH, LEMMA, NORM -from ..punctuation import TOKENIZER_PREFIXES # TODO -# tokenize cliticised definite article "d'" as token of its own: d'Kanner > [d'] [Kanner] # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions) -# how to write the tokenisation exeption for the articles d' / D' ? This one is not working. -_prefixes = [ - prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d’", "D’"] -] - - -_exc = { - "d'mannst": [ - {ORTH: "d'", LEMMA: "d'"}, - {ORTH: "mannst", LEMMA: "mann", NORM: "mann"}, - ], - "d'éischt": [ - {ORTH: "d'", LEMMA: "d'"}, - {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"}, - ], -} +_exc = {} # translate / delete what is not necessary -# what does PRON_LEMMA mean? 
for exc_data in [ - {ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"}, + {ORTH: "'t", LEMMA: "et", NORM: "et"}, + {ORTH: "'T", LEMMA: "et", NORM: "et"}, + {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"}, {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"}, {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"}, {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"}, @@ -36,7 +20,7 @@ for exc_data in [ {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"}, {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"}, {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"}, - {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, + {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"} ]: _exc[exc_data[ORTH]] = [exc_data] @@ -64,6 +48,4 @@ for orth in [ ]: _exc[orth] = [{ORTH: orth}] - -TOKENIZER_PREFIXES = _prefixes TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/nb/tag_map.py b/spacy/lang/nb/tag_map.py index 183c40c97..cf4c95840 100644 --- a/spacy/lang/nb/tag_map.py +++ b/spacy/lang/nb/tag_map.py @@ -1,12 +1,12 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X +from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X from ...symbols import VERB, NOUN, PROPN, PART, INTJ, PRON, AUX -# Tags are a combination of POS and morphological features from a yet -# unpublished dataset developed by Schibsted, Nasjonalbiblioteket and LTG. The +# Tags are a combination of POS and morphological features from a +# https://github.com/ltgoslo/norne developed by Schibsted, Nasjonalbiblioteket and LTG. The # data format is .conllu and follows the Universal Dependencies annotation. # (There are some annotation differences compared to this dataset: # https://github.com/UniversalDependencies/UD_Norwegian-Bokmaal @@ -467,4 +467,97 @@ TAG_MAP = { "VERB__VerbForm=Part": {"morph": "VerbForm=Part", POS: VERB}, "VERB___": {"morph": "_", POS: VERB}, "X___": {"morph": "_", POS: X}, + 'CCONJ___': {"morph": "_", POS: CCONJ}, + "ADJ__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADJ}, + "ADJ__Abbr=Yes|Degree=Pos": {"morph": "Abbr=Yes|Degree=Pos", POS: ADJ}, + "ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ}, + "ADJ__Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ}, + "ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part", POS: ADJ}, + "ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part", POS: ADJ}, + "ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: ADJ}, + "ADJ__Number=Sing|VerbForm=Part": {"morph": "Number=Sing|VerbForm=Part", POS: ADJ}, + "ADJ__VerbForm=Part": {"morph": "VerbForm=Part", POS: ADJ}, + "ADP__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADP}, + "ADV__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADV}, + "DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {"morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", POS: DET}, + "DET__Case=Gen|Number=Plur|PronType=Tot": {"morph": "Case=Gen|Number=Plur|PronType=Tot", POS: DET}, + "DET__Definite=Def|PronType=Prs": {"morph": "Definite=Def|PronType=Prs", POS: DET}, + "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", POS: DET}, + 
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", POS: DET}, + "DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", POS: DET}, + "DET__Gender=Fem|Number=Sing|PronType=Art": {"morph": "Gender=Fem|Number=Sing|PronType=Art", POS: DET}, + "DET__Gender=Fem|Number=Sing|PronType=Ind": {"morph": "Gender=Fem|Number=Sing|PronType=Ind", POS: DET}, + "DET__Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|PronType=Prs", POS: DET}, + "DET__Gender=Fem|Number=Sing|PronType=Tot": {"morph": "Gender=Fem|Number=Sing|PronType=Tot", POS: DET}, + "DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET}, + "DET__Gender=Masc|Number=Sing|PronType=Art": {"morph": "Gender=Masc|Number=Sing|PronType=Art", POS: DET}, + "DET__Gender=Masc|Number=Sing|PronType=Ind": {"morph": "Gender=Masc|Number=Sing|PronType=Ind", POS: DET}, + "DET__Gender=Masc|Number=Sing|PronType=Tot": {"morph": "Gender=Masc|Number=Sing|PronType=Tot", POS: DET}, + "DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET}, + "DET__Gender=Neut|Number=Sing|PronType=Art": {"morph": "Gender=Neut|Number=Sing|PronType=Art", POS: DET}, + "DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", POS: DET}, + "DET__Gender=Neut|Number=Sing|PronType=Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Ind", POS: DET}, + "DET__Gender=Neut|Number=Sing|PronType=Tot": {"morph": "Gender=Neut|Number=Sing|PronType=Tot", POS: DET}, + "DET__Number=Plur|Polarity=Neg|PronType=Neg": {"morph": "Number=Plur|Polarity=Neg|PronType=Neg", POS: DET}, + "DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET}, + "DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET}, + "DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET}, + "DET__Number=Plur|PronType=Tot": {"morph": "Number=Plur|PronType=Tot", POS: DET}, + "DET__PronType=Ind": {"morph": "PronType=Ind", POS: DET}, + "DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET}, + "NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN}, + "NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN}, + "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", POS: NOUN}, + "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", POS: NOUN}, + "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", POS: NOUN}, + "NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN}, + "NUM__Case=Gen|Number=Plur|NumType=Card": {"morph": "Case=Gen|Number=Plur|NumType=Card", POS: NUM}, + "NUM__Definite=Def|Number=Sing|NumType=Card": {"morph": "Definite=Def|Number=Sing|NumType=Card", POS: NUM}, + "NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM}, + "NUM__Gender=Fem|Number=Sing|NumType=Card": {"morph": "Gender=Fem|Number=Sing|NumType=Card", POS: NUM}, + "NUM__Gender=Masc|Number=Sing|NumType=Card": {"morph": "Gender=Masc|Number=Sing|NumType=Card", POS: NUM}, + "NUM__Gender=Neut|Number=Sing|NumType=Card": {"morph": "Gender=Neut|Number=Sing|NumType=Card", POS: NUM}, + 
"NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM}, + "NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM}, + "NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM}, + "PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART}, + "PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", POS: PRON}, + "PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {"morph": "Animacy=Hum|Number=Plur|PronType=Rcp", POS: PRON}, + "PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", POS: PRON}, + "PRON__Animacy=Hum|Poss=Yes|PronType=Int": {"morph": "Animacy=Hum|Poss=Yes|PronType=Int", POS: PRON}, + "PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON}, + "PRON__Case=Acc|PronType=Prs|Reflex=Yes": {"morph": "Case=Acc|PronType=Prs|Reflex=Yes", POS: PRON}, + "PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON}, + "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON}, + "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", POS: PRON}, + 
"PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, + "PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, + "PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON}, + "PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, + "PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON}, + "PRON__Number=Plur|Person=3|PronType=Ind,Prs": {"morph": "Number=Plur|Person=3|PronType=Ind,Prs", POS: PRON}, + "PRON__Number=Plur|Person=3|PronType=Prs,Tot": {"morph": "Number=Plur|Person=3|PronType=Prs,Tot", POS: PRON}, + "PRON__Number=Plur|Poss=Yes|PronType=Prs": {"morph": "Number=Plur|Poss=Yes|PronType=Prs", POS: PRON}, + "PRON__Number=Plur|Poss=Yes|PronType=Rcp": {"morph": "Number=Plur|Poss=Yes|PronType=Rcp", POS: PRON}, + "PRON__Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Number=Sing|Polarity=Neg|PronType=Neg", POS: PRON}, + "PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON}, + "PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON}, + "PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN}, + "PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN}, + "VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", POS: VERB}, + "VERB__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: VERB}, } diff --git a/spacy/lang/pt/tag_map.py b/spacy/lang/pt/tag_map.py index 51c2b20b5..cdc7de57e 100644 --- a/spacy/lang/pt/tag_map.py +++ b/spacy/lang/pt/tag_map.py @@ -5039,5 +5039,19 @@ TAG_MAP = { "punc": {POS: PUNCT}, "v-pcp|M|P": {POS: VERB}, "v-pcp|M|S": {POS: VERB}, + "ADJ": {POS: ADJ}, + "AUX": {POS: AUX}, + "CCONJ": {POS: CCONJ}, + "DET": {POS: DET}, + "INTJ": {POS: INTJ}, + "NUM": {POS: NUM}, + "PART": {POS: PART}, + "PRON": {POS: PRON}, + "PUNCT": {POS: PUNCT}, + "SCONJ": {POS: SCONJ}, + "SYM": {POS: SYM}, + "VERB": {POS: VERB}, + "X": {POS: X}, + "adv": {POS: ADV}, "_SP": {POS: SPACE}, } diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py new file mode 100644 index 000000000..f227203cc --- /dev/null +++ b/spacy/lang/yo/__init__.py @@ -0,0 +1,24 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG + + +class YorubaDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "yo" + stop_words = STOP_WORDS + tokenizer_exceptions = BASE_EXCEPTIONS + + +class Yoruba(Language): + lang = "yo" + Defaults = YorubaDefaults + + +__all__ = ["Yoruba"] diff --git a/spacy/lang/yo/examples.py b/spacy/lang/yo/examples.py new file mode 100644 index 000000000..170ddc803 --- /dev/null +++ b/spacy/lang/yo/examples.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.yo.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +# 1. 
https://yo.wikipedia.org/wiki/Wikipedia:%C3%80y%E1%BB%8Dk%C3%A0_p%C3%A0t%C3%A0k%C3%AC +# 2.https://yo.wikipedia.org/wiki/Oj%C3%BAew%C3%A9_%C3%80k%E1%BB%8D%CC%81k%E1%BB%8D%CC%81 +# 3. https://www.bbc.com/yoruba + +sentences = [ + "Ìjọba Tanzania fi Ajìjàgbara Ọmọ Orílẹ̀-èdèe Uganda sí àtìmọ́lé", + "Olúṣẹ́gun Ọbásanjọ́, tí ó jẹ́ Ààrẹ ìjọba ológun àná (láti ọdún 1976 sí 1979), tí ó sì tún ṣe Ààrẹ ìjọba alágbádá tí ìbò gbé wọlé (ní ọdún 1999 sí 2007), kúndùn láti máa bu ẹnu àtẹ́ lu àwọn " + "ètò ìjọba Ààrẹ orílẹ̀-èdè Nàìjíríà tí ó jẹ tẹ̀lé e.", + "Akin Alabi rán ẹnu mọ́ agbárá Adárí Òsìsẹ̀, àwọn ọmọ Nàìjíríà dẹnu bò ó", + "Ta ló leè dúró s'ẹ́gbẹ̀ẹ́ Okunnu láì rẹ́rìín?", + "Dídarapọ̀ mọ́n ìpolongo", + "Bi a se n so, omobinrin ni oruko ni ojo kejo bee naa ni omokunrin ni oruko ni ojo kesan.", + "Oríṣìíríṣìí nǹkan ló le yọrí sí orúkọ tí a sọ ọmọ", + "Gbogbo won ni won ni oriki ti won", +] diff --git a/spacy/lang/yo/lex_attrs.py b/spacy/lang/yo/lex_attrs.py new file mode 100644 index 000000000..a9f1b85f6 --- /dev/null +++ b/spacy/lang/yo/lex_attrs.py @@ -0,0 +1,115 @@ +# coding: utf8 +from __future__ import unicode_literals + +import unicodedata + +from ...attrs import LIKE_NUM + + +_num_words = [ + "ení", + "oókàn", + "ọ̀kanlá", + "ẹ́ẹdọ́gbọ̀n", + "àádọ́fà", + "ẹ̀walélúɡba", + "egbèje", + "ẹgbàárin", + "èjì", + "eéjì", + "èjìlá", + "ọgbọ̀n,", + "ọgọ́fà", + "ọ̀ọ́dúrún", + "ẹgbẹ̀jọ", + "ẹ̀ẹ́dẹ́ɡbàárùn", + "ẹ̀ta", + "ẹẹ́ta", + "ẹ̀talá", + "aárùndílogójì", + "àádóje", + "irinwó", + "ẹgbẹ̀sàn", + "ẹgbàárùn", + "ẹ̀rin", + "ẹẹ́rin", + "ẹ̀rinlá", + "ogójì", + "ogóje", + "ẹ̀ẹ́dẹ́gbẹ̀ta", + "ẹgbàá", + "ẹgbàájọ", + "àrún", + "aárùn", + "ẹ́ẹdógún", + "àádọ́ta", + "àádọ́jọ", + "ẹgbẹ̀ta", + "ẹgboókànlá", + "ẹgbàawǎ", + "ẹ̀fà", + "ẹẹ́fà", + "ẹẹ́rìndílógún", + "ọgọ́ta", + "ọgọ́jọ", + "ọ̀ọ́dẹ́gbẹ̀rin", + "ẹgbẹ́ẹdógún", + "ọkẹ́marun", + "èje", + "etàdílógún", + "àádọ́rin", + "àádọ́sán", + "ẹgbẹ̀rin", + "ẹgbàajì", + "ẹgbẹ̀ẹgbẹ̀rún", + "ẹ̀jọ", + "ẹẹ́jọ", + "eéjìdílógún", + "ọgọ́rin", + "ọgọsàn", + "ẹ̀ẹ́dẹ́gbẹ̀rún", + "ẹgbẹ́ẹdọ́gbọ̀n", + "ọgọ́rùn ọkẹ́", + "ẹ̀sán", + "ẹẹ́sàn", + "oókàndílógún", + "àádọ́rùn", + "ẹ̀wadilúɡba", + "ẹgbẹ̀rún", + "ẹgbàáta", + "ẹ̀wá", + "ẹẹ́wàá", + "ogún", + "ọgọ́rùn", + "igba", + "ẹgbẹ̀fà", + "ẹ̀ẹ́dẹ́ɡbarin", +] + + +def strip_accents_text(text): + """ + Converts the string to NFD, separates & returns only the base characters + :param text: + :return: input string without diacritic adornments on base characters + """ + return "".join( + c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn" + ) + + +def like_num(text): + text = text.replace(",", "").replace(".", "") + num_markers = ["dí", "dọ", "lé", "dín", "di", "din", "le", "do"] + if any(mark in text for mark in num_markers): + return True + text = strip_accents_text(text) + _num_words_stripped = [strip_accents_text(num) for num in _num_words] + if text.isdigit(): + return True + if text in _num_words_stripped or text.lower() in _num_words_stripped: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/yo/stop_words.py b/spacy/lang/yo/stop_words.py new file mode 100644 index 000000000..53d382ad3 --- /dev/null +++ b/spacy/lang/yo/stop_words.py @@ -0,0 +1,12 @@ +# coding: utf8 +from __future__ import unicode_literals + +# stop words as whitespace-separated list. 
+# Source: https://raw.githubusercontent.com/dohliam/more-stoplists/master/yo/yo.txt + +STOP_WORDS = set( + "a an b bá bí bẹ̀rẹ̀ d e f fún fẹ́ g gbogbo i inú j jù jẹ jẹ́ k kan kì kí kò " + "l láti lè lọ m mi mo máa mọ̀ n ni náà ní nígbà nítorí nǹkan o p padà pé " + "púpọ̀ pẹ̀lú r rẹ̀ s sì sí sínú t ti tí u w wà wá wọn wọ́n y yìí à àti àwọn á " + "è é ì í ò òun ó ù ú ń ńlá ǹ ̀ ́ ̣ ṣ ṣe ṣé ṣùgbọ́n ẹ ẹmọ́ ọ ọjọ́ ọ̀pọ̀lọpọ̀".split() +) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 91daea099..8179b4551 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -4,19 +4,95 @@ from __future__ import unicode_literals from ...attrs import LANG from ...language import Language from ...tokens import Doc +from ...util import DummyTokenizer from ..tokenizer_exceptions import BASE_EXCEPTIONS +from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from .tag_map import TAG_MAP +def try_jieba_import(use_jieba): + try: + import jieba + + return jieba + except ImportError: + if use_jieba: + msg = ( + "Jieba not installed. Either set Chinese.use_jieba = False, " + "or install it https://github.com/fxsjy/jieba" + ) + raise ImportError(msg) + + +class ChineseTokenizer(DummyTokenizer): + def __init__(self, cls, nlp=None): + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) + self.use_jieba = cls.use_jieba + self.jieba_seg = try_jieba_import(self.use_jieba) + self.tokenizer = Language.Defaults().create_tokenizer(nlp) + + def __call__(self, text): + # use jieba + if self.use_jieba: + jieba_words = list( + [x for x in self.jieba_seg.cut(text, cut_all=False) if x] + ) + words = [jieba_words[0]] + spaces = [False] + for i in range(1, len(jieba_words)): + word = jieba_words[i] + if word.isspace(): + # second token in adjacent whitespace following a + # non-space token + if spaces[-1]: + words.append(word) + spaces.append(False) + # first space token following non-space token + elif word == " " and not words[-1].isspace(): + spaces[-1] = True + # token is non-space whitespace or any whitespace following + # a whitespace token + else: + # extend previous whitespace token with more whitespace + if words[-1].isspace(): + words[-1] += word + # otherwise it's a new whitespace token + else: + words.append(word) + spaces.append(False) + else: + words.append(word) + spaces.append(False) + return Doc(self.vocab, words=words, spaces=spaces) + + # split into individual characters + words = [] + spaces = [] + for token in self.tokenizer(text): + if token.text.isspace(): + words.append(token.text) + spaces.append(False) + else: + words.extend(list(token.text)) + spaces.extend([False] * len(token.text)) + spaces[-1] = bool(token.whitespace_) + return Doc(self.vocab, words=words, spaces=spaces) + + class ChineseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "zh" - use_jieba = True tokenizer_exceptions = BASE_EXCEPTIONS stop_words = STOP_WORDS tag_map = TAG_MAP writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + use_jieba = True + + @classmethod + def create_tokenizer(cls, nlp=None): + return ChineseTokenizer(cls, nlp) class Chinese(Language): @@ -24,26 +100,7 @@ class Chinese(Language): Defaults = ChineseDefaults # override defaults def make_doc(self, text): - if self.Defaults.use_jieba: - try: - import jieba - except ImportError: - msg = ( - "Jieba not installed. 
Either set Chinese.use_jieba = False, " - "or install it https://github.com/fxsjy/jieba" - ) - raise ImportError(msg) - words = list(jieba.cut(text, cut_all=False)) - words = [x for x in words if x] - return Doc(self.vocab, words=words, spaces=[False] * len(words)) - else: - words = [] - spaces = [] - for token in self.tokenizer(text): - words.extend(list(token.text)) - spaces.extend([False] * len(token.text)) - spaces[-1] = bool(token.whitespace_) - return Doc(self.vocab, words=words, spaces=spaces) + return self.tokenizer(text) __all__ = ["Chinese"] diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py index 8d2f99d01..41e2d2158 100644 --- a/spacy/lang/zh/tag_map.py +++ b/spacy/lang/zh/tag_map.py @@ -1,11 +1,12 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PART, INTJ, PRON +from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X +from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE -# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set. -# We also map the tags to the simpler Google Universal POS tag set. +# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn +# Treebank tag set. We also map the tags to the simpler Universal Dependencies +# v2 tag set. TAG_MAP = { "AS": {POS: PART}, @@ -38,10 +39,11 @@ TAG_MAP = { "OD": {POS: NUM}, "DT": {POS: DET}, "CC": {POS: CCONJ}, - "CS": {POS: CONJ}, + "CS": {POS: SCONJ}, "AD": {POS: ADV}, "JJ": {POS: ADJ}, "P": {POS: ADP}, "PN": {POS: PRON}, "PU": {POS: PUNCT}, + "_SP": {POS: SPACE}, } diff --git a/spacy/language.py b/spacy/language.py index 8ec602ed7..008b5559f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -650,7 +650,7 @@ class Language(object): kwargs = component_cfg.get(name, {}) kwargs.setdefault("batch_size", batch_size) if not hasattr(pipe, "pipe"): - examples = _pipe(pipe, examples, kwargs) + examples = _pipe(examples, pipe, kwargs) else: examples = pipe.pipe(examples, as_example=True, **kwargs) for ex in examples: diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 6f6848102..30ef3dd36 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -677,7 +677,9 @@ def _get_attr_values(spec, string_store): value = string_store.add(value) elif isinstance(value, bool): value = int(value) - elif isinstance(value, (dict, int)): + elif isinstance(value, int): + pass + elif isinstance(value, dict): continue else: raise ValueError(Errors.E153.format(vtype=type(value).__name__)) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index d926b987b..205697637 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -292,13 +292,14 @@ class EntityRuler(object): self.add_patterns(patterns) else: cfg = {} - deserializers = { + deserializers_patterns = { "patterns": lambda p: self.add_patterns( srsly.read_jsonl(p.with_suffix(".jsonl")) - ), - "cfg": lambda p: cfg.update(srsly.read_json(p)), + )} + deserializers_cfg = { + "cfg": lambda p: cfg.update(srsly.read_json(p)) } - from_disk(path, deserializers, {}) + from_disk(path, deserializers_cfg, {}) self.overwrite = cfg.get("overwrite", False) self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) @@ -307,6 +308,7 @@ class EntityRuler(object): self.phrase_matcher = PhraseMatcher( self.nlp.vocab, attr=self.phrase_matcher_attr ) + 
from_disk(path, deserializers_patterns, {}) return self def to_disk(self, path, **kwargs): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 110839acd..b041e2441 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -13,7 +13,6 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module -from spacy.gold import Example from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -24,6 +23,8 @@ from ..vocab cimport Vocab from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj +from ..gold import Example +from ..compat import basestring_ from ..attrs import POS, ID from ..parts_of_speech import X from ..kb import KnowledgeBase @@ -593,6 +594,8 @@ class Tagger(Pipe): return build_tagger_model(n_tags, **cfg) def add_label(self, label, values=None): + if not isinstance(label, basestring_): + raise ValueError(Errors.E187) if label in self.labels: return 0 if self.model not in (True, False, None): @@ -1238,6 +1241,8 @@ class TextCategorizer(Pipe): return float(mean_square_error), d_scores def add_label(self, label): + if not isinstance(label, basestring_): + raise ValueError(Errors.E187) if label in self.labels: return 0 if self.model not in (None, True, False): @@ -1358,7 +1363,7 @@ cdef class EntityRecognizer(Parser): @component( "entity_linker", - requires=["doc.ents", "token.ent_iob", "token.ent_type"], + requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], assigns=["token.ent_kb_id"] ) class EntityLinker(Pipe): @@ -1429,13 +1434,20 @@ class EntityLinker(Pipe): for entity, kb_dict in gold.links.items(): start, end = entity mention = doc.text[start:end] + # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt + if not (start, end) in ents_by_offset: + raise RuntimeError(Errors.E188) ent = ents_by_offset[(start, end)] for kb_id, value in kb_dict.items(): # Currently only training on the positive instances - we assume there is at least 1 per doc/gold if value: - sentence_docs.append(ent.sent.as_doc()) + try: + sentence_docs.append(ent.sent.as_doc()) + except AttributeError: + # Catch the exception when ent.sent is None and provide a user-friendly warning + raise RuntimeError(Errors.E030) sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds) @@ -1523,7 +1535,7 @@ class EntityLinker(Pipe): if len(doc) > 0: # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - because they might not get a KB ID - for sent in doc.ents: + for sent in doc.sents: sent_doc = sent.as_doc() # currently, the context is the same for each entity in a sentence (should be refined) sentence_encoding = self.model([sent_doc])[0] @@ -1704,6 +1716,55 @@ class Sentencizer(Pipe): return example return doc + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in util.minibatch(stream, size=batch_size): + docs = list(docs) + tag_ids = self.predict(docs) + self.set_annotations(docs, tag_ids) + yield from docs + + def predict(self, docs): + """Apply the pipeline's model to a batch of docs, without + modifying them. + """ + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. 
+ guesses = [[] for doc in docs] + return guesses + guesses = [] + for doc in docs: + start = 0 + seen_period = False + doc_guesses = [False] * len(doc) + doc_guesses[0] = True + for i, token in enumerate(doc): + is_in_punct_chars = token.text in self.punct_chars + if seen_period and not token.is_punct and not is_in_punct_chars: + doc_guesses[start] = True + start = token.i + seen_period = False + elif is_in_punct_chars: + seen_period = True + if start < len(doc): + doc_guesses[start] = True + guesses.append(doc_guesses) + return guesses + + def set_annotations(self, docs, batch_tag_ids, tensors=None): + if isinstance(docs, Doc): + docs = [docs] + cdef Doc doc + cdef int idx = 0 + for i, doc in enumerate(docs): + doc_tag_ids = batch_tag_ids[i] + for j, tag_id in enumerate(doc_tag_ids): + # Don't clobber existing sentence boundaries + if doc.c[j].sent_start == 0: + if tag_id: + doc.c[j].sent_start = 1 + else: + doc.c[j].sent_start = -1 + def to_bytes(self, **kwargs): """Serialize the sentencizer to a bytestring. diff --git a/spacy/scorer.py b/spacy/scorer.py index d2878da1a..7fee4865a 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -269,7 +269,9 @@ class Scorer(object): gold_tags = set() gold_sent_starts = set() gold_ents = set(tags_to_entities(orig.entities)) - for id_, tag, head, dep, sent_start in zip(orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts): + for id_, tag, head, dep, sent_start in zip( + orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts + ): gold_tags.add((id_, tag)) if sent_start: gold_sent_starts.add(id_) @@ -308,8 +310,10 @@ class Scorer(object): self.labelled_per_dep[token.dep_.lower()] = PRFScore() if token.dep_.lower() not in cand_deps_per_dep: cand_deps_per_dep[token.dep_.lower()] = set() - cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower())) - if "-" not in orig.entities: + cand_deps_per_dep[token.dep_.lower()].add( + (gold_i, gold_head, token.dep_.lower()) + ) + if "-" not in [token[-1] for token in gold.orig_annot]: # Find all NER labels in gold and doc ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents]) # Set up all labels for per type scoring and prepare gold per type @@ -342,7 +346,9 @@ class Scorer(object): self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) self.labelled.score_set(cand_deps, gold_deps) for dep in self.labelled_per_dep: - self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())) + self.labelled_per_dep[dep].score_set( + cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()) + ) self.unlabelled.score_set( set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps) ) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index dc482f278..b1085c762 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -69,7 +69,8 @@ cdef class ParserBeam(object): cdef StateC* st for state in states: beam = Beam(self.moves.n_moves, width, min_density=density) - beam.initialize(self.moves.init_beam_state, state.c.length, + beam.initialize(self.moves.init_beam_state, + self.moves.del_beam_state, state.c.length, state.c._sent) for i in range(beam.width): st = beam.at(i) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 77bd43ed7..8b6448a46 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -42,11 +42,17 @@ cdef WeightsC get_c_weights(model) except *: cdef precompute_hiddens state2vec = 
model.state2vec output.feat_weights = state2vec.get_feat_weights() output.feat_bias = state2vec.bias.data - cdef np.ndarray vec2scores_W = model.vec2scores.W - cdef np.ndarray vec2scores_b = model.vec2scores.b + cdef np.ndarray vec2scores_W + cdef np.ndarray vec2scores_b + if model.vec2scores is None: + output.hidden_weights = NULL + output.hidden_bias = NULL + else: + vec2scores_W = model.vec2scores.W + vec2scores_b = model.vec2scores.b + output.hidden_weights = vec2scores_W.data + output.hidden_bias = vec2scores_b.data cdef np.ndarray class_mask = model._class_mask - output.hidden_weights = vec2scores_W.data - output.hidden_bias = vec2scores_b.data output.seen_classes = class_mask.data return output @@ -54,7 +60,10 @@ cdef WeightsC get_c_weights(model) except *: cdef SizesC get_c_sizes(model, int batch_size) except *: cdef SizesC output output.states = batch_size - output.classes = model.vec2scores.nO + if model.vec2scores is None: + output.classes = model.state2vec.nO + else: + output.classes = model.vec2scores.nO output.hiddens = model.state2vec.nO output.pieces = model.state2vec.nP output.feats = model.state2vec.nF @@ -105,11 +114,12 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil: cdef void predict_states(ActivationsC* A, StateC** states, const WeightsC* W, SizesC n) nogil: + cdef double one = 1.0 resize_activations(A, n) - memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float)) - memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float)) for i in range(n.states): states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats) + memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float)) + memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float)) sum_state_features(A.unmaxed, W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces) for i in range(n.states): @@ -120,18 +130,20 @@ cdef void predict_states(ActivationsC* A, StateC** states, which = Vec.arg_max(&A.unmaxed[index], n.pieces) A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which] memset(A.scores, 0, n.states * n.classes * sizeof(float)) - cdef double one = 1.0 - # Compute hidden-to-output - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE, - n.states, n.classes, n.hiddens, one, - A.hiddens, n.hiddens, 1, - W.hidden_weights, n.hiddens, 1, - one, - A.scores, n.classes, 1) - # Add bias - for i in range(n.states): - VecVec.add_i(&A.scores[i*n.classes], - W.hidden_bias, 1., n.classes) + if W.hidden_weights == NULL: + memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float)) + else: + # Compute hidden-to-output + blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE, + n.states, n.classes, n.hiddens, one, + A.hiddens, n.hiddens, 1, + W.hidden_weights, n.hiddens, 1, + one, + A.scores, n.classes, 1) + # Add bias + for i in range(n.states): + VecVec.add_i(&A.scores[i*n.classes], + W.hidden_bias, 1., n.classes) # Set unseen classes to minimum value i = 0 min_ = A.scores[0] @@ -219,7 +231,9 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no class ParserModel(Model): def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None): Model.__init__(self) - self._layers = [tok2vec, lower_model, upper_model] + self._layers = [tok2vec, lower_model] + if upper_model is not None: + self._layers.append(upper_model) self.unseen_classes = set() if unseen_classes: for class_ in unseen_classes: @@ -234,6 +248,8 @@ class ParserModel(Model): return step_model, finish_parser_update def resize_output(self, new_output): + if len(self._layers) == 2: 
+ return if new_output == self.upper.nO: return smaller = self.upper @@ -275,12 +291,24 @@ class ParserModel(Model): class ParserStepModel(Model): def __init__(self, docs, layers, unseen_classes=None, drop=0.): self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop) + if layers[1].nP >= 2: + activation = "maxout" + elif len(layers) == 2: + activation = None + else: + activation = "relu" self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1], - drop=drop) - self.vec2scores = layers[-1] - self.cuda_stream = util.get_cuda_stream() + activation=activation, drop=drop) + if len(layers) == 3: + self.vec2scores = layers[-1] + else: + self.vec2scores = None + self.cuda_stream = util.get_cuda_stream(non_blocking=True) self.backprops = [] - self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f') + if self.vec2scores is None: + self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f') + else: + self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f') self._class_mask.fill(1) if unseen_classes is not None: for class_ in unseen_classes: @@ -302,10 +330,15 @@ class ParserStepModel(Model): def begin_update(self, states, drop=0.): token_ids = self.get_token_ids(states) vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0) - mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop) - if mask is not None: - vector *= mask - scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop) + if self.vec2scores is not None: + mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop) + if mask is not None: + vector *= mask + scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop) + else: + scores = NumpyOps().asarray(vector) + get_d_vector = lambda d_scores, sgd=None: d_scores + mask = None # If the class is unseen, make sure its score is minimum scores[:, self._class_mask == 0] = numpy.nanmin(scores) @@ -342,12 +375,12 @@ class ParserStepModel(Model): return ids def make_updates(self, sgd): - # Tells CUDA to block, so our async copies complete. - if self.cuda_stream is not None: - self.cuda_stream.synchronize() # Add a padding vector to the d_tokvecs gradient, so that missing # values don't affect the real gradient. d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1])) + # Tells CUDA to block, so our async copies complete. 
+ if self.cuda_stream is not None: + self.cuda_stream.synchronize() for ids, d_vector, bp_vector in self.backprops: d_state_features = bp_vector((d_vector, ids), sgd=sgd) ids = ids.flatten() @@ -385,9 +418,10 @@ cdef class precompute_hiddens: cdef np.ndarray bias cdef object _cuda_stream cdef object _bp_hiddens + cdef object activation def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, - drop=0.): + activation="maxout", drop=0.): gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop) cdef np.ndarray cached if not isinstance(gpu_cached, numpy.ndarray): @@ -405,6 +439,8 @@ cdef class precompute_hiddens: self.nP = getattr(lower_model, 'nP', 1) self.nO = cached.shape[2] self.ops = lower_model.ops + assert activation in (None, "relu", "maxout") + self.activation = activation self._is_synchronized = False self._cuda_stream = cuda_stream self._cached = cached @@ -417,7 +453,7 @@ cdef class precompute_hiddens: return self._cached.data def __call__(self, X): - return self.begin_update(X)[0] + return self.begin_update(X, drop=None)[0] def begin_update(self, token_ids, drop=0.): cdef np.ndarray state_vector = numpy.zeros( @@ -450,28 +486,35 @@ cdef class precompute_hiddens: else: ops = CupyOps() - if self.nP == 1: - state_vector = state_vector.reshape(state_vector.shape[:-1]) - mask = state_vector >= 0. - state_vector *= mask - else: + if self.activation == "maxout": state_vector, mask = ops.maxout(state_vector) + else: + state_vector = state_vector.reshape(state_vector.shape[:-1]) + if self.activation == "relu": + mask = state_vector >= 0. + state_vector *= mask + else: + mask = None def backprop_nonlinearity(d_best, sgd=None): if isinstance(d_best, numpy.ndarray): ops = NumpyOps() else: ops = CupyOps() - mask_ = ops.asarray(mask) - + if mask is not None: + mask_ = ops.asarray(mask) # This will usually be on GPU d_best = ops.asarray(d_best) # Fix nans (which can occur from unseen classes.) d_best[ops.xp.isnan(d_best)] = 0. 
- if self.nP == 1: + if self.activation == "maxout": + mask_ = ops.asarray(mask) + return ops.backprop_maxout(d_best, mask_, self.nP) + elif self.activation == "relu": + mask_ = ops.asarray(mask) d_best *= mask_ d_best = d_best.reshape((d_best.shape + (1,))) return d_best else: - return ops.backprop_maxout(d_best, mask_, self.nP) + return d_best.reshape((d_best.shape + (1,))) return state_vector, backprop_nonlinearity diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 65c0a3b4d..141d796a4 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -100,10 +100,30 @@ cdef cppclass StateC: free(this.shifted - PADDING) void set_context_tokens(int* ids, int n) nogil: - if n == 2: + if n == 1: + if this.B(0) >= 0: + ids[0] = this.B(0) + else: + ids[0] = -1 + elif n == 2: ids[0] = this.B(0) ids[1] = this.S(0) - if n == 8: + elif n == 3: + if this.B(0) >= 0: + ids[0] = this.B(0) + else: + ids[0] = -1 + # First word of entity, if any + if this.entity_is_open(): + ids[1] = this.E(0) + else: + ids[1] = -1 + # Last word of entity, if within entity + if ids[0] == -1 or ids[1] == -1: + ids[2] = -1 + else: + ids[2] = ids[0] - 1 + elif n == 8: ids[0] = this.B(0) ids[1] = this.B(1) ids[2] = this.S(0) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index d358c1277..45fd1170b 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -324,10 +324,16 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: return st +cdef int _del_state(Pool mem, void* state, void* x) except -1: + cdef StateC* st = state + del st + + cdef class ArcEager(TransitionSystem): def __init__(self, *args, **kwargs): TransitionSystem.__init__(self, *args, **kwargs) self.init_beam_state = _init_state + self.del_beam_state = _del_state @classmethod def get_actions(cls, **kwargs): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 073851d8a..c98baf6fd 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -22,7 +22,7 @@ from thinc.extra.search cimport Beam from thinc.api import chain, clone from thinc.v2v import Model, Maxout, Affine from thinc.misc import LayerNorm -from thinc.neural.ops import CupyOps +from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec import srsly @@ -62,13 +62,17 @@ cdef class Parser: t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3)) bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0)) self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0)) - if depth != 1: + nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature) + if depth not in (0, 1): raise ValueError(TempErrors.T004.format(value=depth)) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2)) token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 96)) hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64)) + if depth == 0: + hidden_width = nr_class + parser_maxout_pieces = 1 embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000)) pretrained_vectors = cfg.get('pretrained_vectors', None) tok2vec = Tok2Vec(token_vector_width, embed_size, @@ -81,16 +85,19 @@ cdef class Parser: tok2vec = chain(tok2vec, flatten) tok2vec.nO = token_vector_width lower = PrecomputableAffine(hidden_width, - nF=cls.nr_feature, nI=token_vector_width, + nF=nr_feature_tokens, nI=token_vector_width, 
nP=parser_maxout_pieces) lower.nP = parser_maxout_pieces - - with Model.use_device('cpu'): - upper = Affine(nr_class, hidden_width, drop_factor=0.0) - upper.W *= 0 + if depth == 1: + with Model.use_device('cpu'): + upper = Affine(nr_class, hidden_width, drop_factor=0.0) + upper.W *= 0 + else: + upper = None cfg = { 'nr_class': nr_class, + 'nr_feature_tokens': nr_feature_tokens, 'hidden_depth': depth, 'token_vector_width': token_vector_width, 'hidden_width': hidden_width, @@ -134,6 +141,7 @@ cdef class Parser: if 'beam_update_prob' not in cfg: cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0) cfg.setdefault('cnn_maxout_pieces', 3) + cfg.setdefault("nr_feature_tokens", self.nr_feature) self.cfg = cfg self.model = model self._multitasks = [] @@ -308,7 +316,7 @@ cdef class Parser: token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), dtype='i', order='C') cdef int* c_ids - cdef int nr_feature = self.nr_feature + cdef int nr_feature = self.cfg["nr_feature_tokens"] cdef int n_states model = self.model(docs) todo = [beam for beam in beams if not beam.is_done] @@ -512,7 +520,7 @@ cdef class Parser: new_golds.append(gold) model, finish_update = self.model.begin_update(docs, drop=drop) states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, self.nr_feature, 10000, states, new_golds, model.state2vec, + self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec, model.vec2scores, width, drop=drop, losses=losses, beam_density=beam_density) for i, d_scores in enumerate(states_d_scores): diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 45d9a787f..a5fe55918 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -33,6 +33,8 @@ ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL +ctypedef int (*del_state_t)(Pool mem, void* state, void* extra_args) except -1 + cdef class TransitionSystem: cdef Pool mem cdef StringStore strings @@ -42,6 +44,7 @@ cdef class TransitionSystem: cdef public attr_t root_label cdef public freqs cdef init_state_t init_beam_state + cdef del_state_t del_beam_state cdef public object labels cdef int initialize_state(self, StateC* state) nogil diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 7876813e0..65097f114 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -30,6 +30,11 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: return st +cdef int _del_state(Pool mem, void* state, void* x) except -1: + cdef StateC* st = state + del st + + cdef class TransitionSystem: def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None): self.mem = Pool() @@ -44,6 +49,7 @@ cdef class TransitionSystem: self.initialize_actions(labels_by_action, min_freq=min_freq) self.root_label = self.strings.add('ROOT') self.init_beam_state = _init_state + self.del_beam_state = _del_state def __reduce__(self): return (self.__class__, (self.strings, self.labels), None, None) @@ -72,7 +78,8 @@ cdef class TransitionSystem: for doc in docs: beam = Beam(self.n_moves, beam_width, min_density=beam_density) - beam.initialize(self.init_beam_state, doc.length, doc.c) + beam.initialize(self.init_beam_state, self.del_beam_state, + doc.length, doc.c) for i in range(beam.width): state = beam.at(i) state.offset = offset diff --git a/spacy/tests/conftest.py 
b/spacy/tests/conftest.py index b0d373c42..1a33221c2 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -125,7 +125,7 @@ def it_tokenizer(): @pytest.fixture(scope="session") def ja_tokenizer(): - pytest.importorskip("MeCab") + pytest.importorskip("fugashi") return get_lang_class("ja").Defaults.create_tokenizer() @@ -218,3 +218,15 @@ def uk_tokenizer(): @pytest.fixture(scope="session") def ur_tokenizer(): return get_lang_class("ur").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def yo_tokenizer(): + return get_lang_class("yo").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def zh_tokenizer(): + pytest.importorskip("jieba") + return get_lang_class("zh").Defaults.create_tokenizer() + diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 6c41a59be..d074fddc6 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -183,3 +183,18 @@ def test_doc_retokenizer_split_lex_attrs(en_vocab): retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs) assert doc[0].is_stop assert not doc[1].is_stop + + +def test_doc_retokenizer_realloc(en_vocab): + """#4604: realloc correctly when new tokens outnumber original tokens""" + text = "Hyperglycemic adverse events following antipsychotic drug administration in the" + doc = Doc(en_vocab, words=text.split()[:-1]) + with doc.retokenize() as retokenizer: + token = doc[0] + heads = [(token, 0)] * len(token) + retokenizer.split(doc[token.i], list(token.text), heads=heads) + doc = Doc(en_vocab, words=text.split()) + with doc.retokenize() as retokenizer: + token = doc[0] + heads = [(token, 0)] * len(token) + retokenizer.split(doc[token.i], list(token.text), heads=heads) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index f813a9743..01bb93c50 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -32,6 +32,24 @@ def doc_not_parsed(en_tokenizer): return doc +@pytest.mark.parametrize( + "i_sent,i,j,text", + [ + (0, 0, len("This is a"), "This is a"), + (1, 0, len("This is another"), "This is another"), + (2, len("And "), len("And ") + len("a third"), "a third"), + (0, 1, 2, None), + ], +) +def test_char_span(doc, i_sent, i, j, text): + sents = list(doc.sents) + span = sents[i_sent].char_span(i, j) + if not text: + assert not span + else: + assert span.text == text + + def test_spans_sent_spans(doc): sents = list(doc.sents) assert sents[0].start == 0 diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py index fdac32a90..7f939011f 100644 --- a/spacy/tests/lang/en/test_customized_tokenizer.py +++ b/spacy/tests/lang/en/test_customized_tokenizer.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import re from spacy.lang.en import English from spacy.tokenizer import Tokenizer from spacy.util import compile_prefix_regex, compile_suffix_regex @@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab): r"[\[\]!&:,()\*—–\/-]", ] infix_re = compile_infix_regex(custom_infixes) + token_match_re = re.compile("a-b") return Tokenizer( en_vocab, English.Defaults.tokenizer_exceptions, prefix_re.search, suffix_re.search, infix_re.finditer, - token_match=None, + token_match=token_match_re.match, ) @@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer): "Megaregion", ".", ] + + +def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer): + sentence = "The 8 
and 10-county definitions a-b not used for the greater Southern California Megaregion." + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "a-b", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ] + + +def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer): + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)" + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ":)", + ] + + +def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer): + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)" + rules = custom_en_tokenizer.rules + del rules[":)"] + custom_en_tokenizer.rules = rules + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ":", + ")", + ] diff --git a/spacy/tests/lang/fi/test_text.py b/spacy/tests/lang/fi/test_text.py new file mode 100644 index 000000000..2dd92597e --- /dev/null +++ b/spacy/tests/lang/fi/test_text.py @@ -0,0 +1,27 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10000", True), + ("10,00", True), + ("-999,0", True), + ("yksi", True), + ("kolmetoista", True), + ("viisikymmentä", True), + ("tuhat", True), + ("1/2", True), + ("hevonen", False), + (",", False), + ], +) +def test_fi_lex_attrs_like_number(fi_tokenizer, text, match): + tokens = fi_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index 66be7bd46..cbbebcf28 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -12,9 +12,23 @@ ABBREVIATION_TESTS = [ ("Paino on n. 
2.2 kg", ["Paino", "on", "n.", "2.2", "kg"]), ] +HYPHENATED_TESTS = [ + ( + "1700-luvulle sijoittuva taide-elokuva", + ["1700-luvulle", "sijoittuva", "taide-elokuva"] + ) +] + @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS) -def test_fi_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens): +def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens): + tokens = fi_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list + + +@pytest.mark.parametrize("text,expected_tokens", HYPHENATED_TESTS) +def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens): tokens = fi_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py index ca38c2c38..57541fc26 100644 --- a/spacy/tests/lang/lb/test_exceptions.py +++ b/spacy/tests/lang/lb/test_exceptions.py @@ -3,8 +3,24 @@ from __future__ import unicode_literals import pytest - @pytest.mark.parametrize("text", ["z.B.", "Jan."]) def test_lb_tokenizer_handles_abbr(lb_tokenizer, text): tokens = lb_tokenizer(text) assert len(tokens) == 1 + +@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"]) +def test_lb_tokenizer_splits_contractions(lb_tokenizer, text): + tokens = lb_tokenizer(text) + assert len(tokens) == 2 + +def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): + text = "Mee 't ass net evident, d'Liewen." + tokens = lb_tokenizer(text) + assert len(tokens) == 9 + assert tokens[1].text == "'t" + assert tokens[1].lemma_ == "et" + +@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")]) +def test_lb_norm_exceptions(lb_tokenizer, text, norm): + tokens = lb_tokenizer(text) + assert tokens[0].norm_ == norm diff --git a/spacy/tests/lang/lb/test_text.py b/spacy/tests/lang/lb/test_text.py index 10cb16a41..2284ff794 100644 --- a/spacy/tests/lang/lb/test_text.py +++ b/spacy/tests/lang/lb/test_text.py @@ -5,18 +5,10 @@ import pytest def test_lb_tokenizer_handles_long_text(lb_tokenizer): - text = """Den Nordwand an d'Sonn - -An der Zäit hunn sech den Nordwand an d’Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum. Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.", - -Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet. Um Enn huet den Nordwand säi Kampf opginn. - -Dunn huet d’Sonn d’Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen. - -Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste wier.""" + text = """Den Nordwand an d'Sonn An der Zäit hunn sech den Nordwand an d'Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum. Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen. Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet. Um Enn huet den Nordwand säi Kampf opginn. Dunn huet d'Sonn d'Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen. 
Do huet den Nordwand missen zouginn, dass d'Sonn vun hinnen zwee de Stäerkste wier.""" tokens = lb_tokenizer(text) - assert len(tokens) == 143 + assert len(tokens) == 142 @pytest.mark.parametrize( @@ -24,6 +16,7 @@ Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste [ ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13), ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15), + ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14) ], ) def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length): diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 4bb5aac70..ff630f0fa 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -87,4 +87,4 @@ def test_lex_attrs_like_url(text, match): ], ) def test_lex_attrs_word_shape(text, shape): - assert word_shape(text) == shape + assert word_shape(text) == shape \ No newline at end of file diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 9b01340e3..5c701fc22 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -11,7 +11,7 @@ from spacy.util import get_lang_class LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", - "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"] + "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo'] # fmt: on diff --git a/spacy/tests/lang/yo/__init__.py b/spacy/tests/lang/yo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/yo/test_text.py b/spacy/tests/lang/yo/test_text.py new file mode 100644 index 000000000..ce6408b67 --- /dev/null +++ b/spacy/tests/lang/yo/test_text.py @@ -0,0 +1,32 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.yo.lex_attrs import like_num + + +def test_yo_tokenizer_handles_long_text(yo_tokenizer): + text = """Àwọn ọmọ ìlú tí wọ́n ń ṣàmúlò ayélujára ti bẹ̀rẹ̀ ìkọkúkọ sórí àwòrán ààrẹ Nkurunziza nínú ìfẹ̀hónúhàn pẹ̀lú àmì ìdámọ̀: Nkurunziza àti Burundi: + Ọmọ ilé ẹ̀kọ́ gíga ní ẹ̀wọ̀n fún kíkọ ìkọkúkọ sí orí àwòrán Ààrẹ . + Bí mo bá ṣe èyí ní Burundi , ó ṣe é ṣe kí a fi mí sí àtìmọ́lé + Ìjọba Burundi fi akẹ́kọ̀ọ́bìnrin sí àtìmọ́lé látàrí ẹ̀sùn ìkọkúkọ sí orí àwòrán ààrẹ. A túwíìtì àwòrán ìkọkúkọ wa ní ìbánikẹ́dùn ìṣẹ̀lẹ̀ náà. + Wọ́n ní kí a dán an wò, kí a kọ nǹkan sí orí àwòrán ààrẹ mo sì ṣe bẹ́ẹ̀. Mo ní ìgbóyà wípé ẹnikẹ́ni kò ní mú mi níbí. 
+ Ìfòfinlíle mú àtakò""" + tokens = yo_tokenizer(text) + assert len(tokens) == 121 + + +@pytest.mark.parametrize( + "text,match", + [("ení", True), ("ogun", True), ("mewadinlogun", True), ("ten", False)], +) +def test_lex_attrs_like_number(yo_tokenizer, text, match): + tokens = yo_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.parametrize("word", ["eji", "ejila", "ogun", "aárùn"]) +def test_yo_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) diff --git a/spacy/tests/lang/zh/__init__.py b/spacy/tests/lang/zh/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py new file mode 100644 index 000000000..235f597a5 --- /dev/null +++ b/spacy/tests/lang/zh/test_text.py @@ -0,0 +1,25 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("999.0", True), + ("一", True), + ("二", True), + ("〇", True), + ("十一", True), + ("狗", False), + (",", False), + ], +) +def test_lex_attrs_like_number(zh_tokenizer, text, match): + tokens = zh_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py new file mode 100644 index 000000000..36d94beb5 --- /dev/null +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +# fmt: off +TOKENIZER_TESTS = [ + ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。", + ['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多', + '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做', + '为', '母语', '。']), +] +# fmt: on + + +@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) +def test_zh_tokenizer(zh_tokenizer, text, expected_tokens): + zh_tokenizer.use_jieba = False + tokens = [token.text for token in zh_tokenizer(text)] + assert tokens == list(text) + + zh_tokenizer.use_jieba = True + tokens = [token.text for token in zh_tokenizer(text)] + assert tokens == expected_tokens + + +def test_extra_spaces(zh_tokenizer): + # note: three spaces after "I" + tokens = zh_tokenizer("I like cheese.") + assert tokens[1].orth_ == " " diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index d05403891..8329391ca 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -259,6 +259,27 @@ def test_block_ner(): assert [token.ent_type_ for token in doc] == expected_types +def test_change_number_features(): + # Test the default number features + nlp = English() + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + ner.add_label("PERSON") + nlp.begin_training() + assert ner.model.lower.nF == ner.nr_feature + # Test we can change it + nlp = English() + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + ner.add_label("PERSON") + nlp.begin_training( + component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}} + ) + assert ner.model.lower.nF == 3 + # Test the model runs + nlp("hello world") + + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index c140cb485..384f14dad 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -148,3 +148,20 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): assert tokens[4].left_edge.i == 0 assert tokens[4].right_edge.i 
== 4 assert tokens[4].head.i == 4 + + +def test_parser_set_sent_starts(en_vocab): + words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n'] + heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1] + deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', ''] + doc = get_doc( + en_vocab, words=words, deps=deps, heads=heads + ) + for i in range(len(words)): + if i == 0 or i == 3: + assert doc[i].is_sent_start == True + else: + assert doc[i].is_sent_start == None + for sent in doc.sents: + for token in sent: + assert token.head in sent diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index d91fdd198..359552c5b 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -5,6 +5,7 @@ import pytest import spacy from spacy.pipeline import Sentencizer from spacy.tokens import Doc +from spacy.lang.en import English def test_sentencizer(en_vocab): @@ -17,6 +18,17 @@ def test_sentencizer(en_vocab): assert len(list(doc.sents)) == 2 +def test_sentencizer_pipe(): + texts = ["Hello! This is a test.", "Hi! This is a test."] + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + for doc in nlp.pipe(texts): + assert doc.is_sentenced + sent_starts = [t.is_sent_start for t in doc] + assert sent_starts == [True, False, True, False, False, False, False] + assert len(list(doc.sents)) == 2 + + @pytest.mark.parametrize( "words,sent_starts,n_sents", [ diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py new file mode 100644 index 000000000..d0331602c --- /dev/null +++ b/spacy/tests/pipeline/test_tagger.py @@ -0,0 +1,14 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.language import Language +from spacy.pipeline import Tagger + + +def test_label_types(): + nlp = Language() + nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.get_pipe("tagger").add_label("A") + with pytest.raises(ValueError): + nlp.get_pipe("tagger").add_label(9) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index e967fffaf..44834c2a8 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -62,3 +62,11 @@ def test_textcat_learns_multilabel(): assert score < 0.5 else: assert score > 0.5 + + +def test_label_types(): + nlp = Language() + nlp.add_pipe(nlp.create_pipe("textcat")) + nlp.get_pipe("textcat").add_label("answer") + with pytest.raises(ValueError): + nlp.get_pipe("textcat").add_label(9) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index b883ae67a..d05759c31 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -177,7 +177,6 @@ def test_issue3328(en_vocab): assert matched_texts == ["Hello", "how", "you", "doing"] -@pytest.mark.xfail def test_issue3331(en_vocab): """Test that duplicate patterns for different rules result in 
multiple matches, one per rule. @@ -328,6 +327,7 @@ def test_issue3449(): assert t3[5].text == "I" +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue3456(): # this crashed because of a padding error in layer.ops.unflatten in thinc nlp = English() diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py index 6de373f11..c060473f5 100644 --- a/spacy/tests/regression/test_issue3880.py +++ b/spacy/tests/regression/test_issue3880.py @@ -2,8 +2,10 @@ from __future__ import unicode_literals from spacy.lang.en import English +import pytest +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue3880(): """Test that `nlp.pipe()` works when an empty string ends the batch. diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py index b0583f717..484d5d280 100644 --- a/spacy/tests/regression/test_issue4348.py +++ b/spacy/tests/regression/test_issue4348.py @@ -3,8 +3,10 @@ from __future__ import unicode_literals from spacy.lang.en import English from spacy.util import minibatch, compounding +import pytest +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue4348(): """Test that training the tagger with empty data, doesn't throw errors""" diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index bf103a389..89332ca2f 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -3,9 +3,9 @@ from __future__ import unicode_literals import srsly from spacy.gold import GoldCorpus - from spacy.lang.en import English -from spacy.tests.util import make_tempdir + +from ..util import make_tempdir def test_issue4402(): diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py index 6a43dfea9..8ec9a0bd1 100644 --- a/spacy/tests/regression/test_issue4590.py +++ b/spacy/tests/regression/test_issue4590.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import pytest from mock import Mock from spacy.matcher import DependencyMatcher from ..util import get_doc @@ -11,8 +10,14 @@ def test_issue4590(en_vocab): """Test that matches param in on_match method are the same as matches run with no on_match method""" pattern = [ {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, ] on_match = Mock() @@ -23,12 +28,11 @@ def test_issue4590(en_vocab): text = "The quick brown fox jumped over the lazy fox" heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"] - + doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) - + matches = matcher(doc) - + on_match_args = on_match.call_args assert on_match_args[0][3] == matches - diff --git a/spacy/tests/regression/test_issue4651.py b/spacy/tests/regression/test_issue4651.py new file mode 100644 index 000000000..eb49f4a38 --- /dev/null +++ b/spacy/tests/regression/test_issue4651.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from spacy.lang.en import English +from spacy.pipeline import 
EntityRuler + +from ..util import make_tempdir + + +def test_issue4651_with_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialize correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + specified. + """ + text = "Spacy is a python library for nlp" + + nlp = English() + ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER") + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) + + nlp_reloaded.add_pipe(ruler_reloaded) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + + assert res == res_reloaded + + +def test_issue4651_without_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialize correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + not specified. + """ + text = "Spacy is a python library for nlp" + + nlp = English() + ruler = EntityRuler(nlp) + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) + + nlp_reloaded.add_pipe(ruler_reloaded) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + + assert res == res_reloaded diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py new file mode 100644 index 000000000..36e9f02c1 --- /dev/null +++ b/spacy/tests/regression/test_issue4674.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from spacy.kb import KnowledgeBase +from spacy.util import ensure_path + +from spacy.lang.en import English +from spacy.tests.util import make_tempdir + + +def test_issue4674(): + """Test that setting entities with overlapping identifiers does not mess up IO""" + nlp = English() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + + vector1 = [0.9, 1.1, 1.01] + vector2 = [1.8, 2.25, 2.01] + kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]) + + assert kb.get_size_entities() == 1 + + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + kb.dump(str(file_path)) + + kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) + kb2.load_bulk(str(file_path)) + + assert kb2.get_size_entities() == 1 + diff --git a/spacy/tests/regression/test_issue4707.py b/spacy/tests/regression/test_issue4707.py new file mode 100644 index 000000000..e710881d7 --- /dev/null +++ b/spacy/tests/regression/test_issue4707.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +from spacy.util import load_model_from_path +from spacy.lang.en import English + +from ..util import make_tempdir + + +def test_issue4707(): + """Tests that disabled component names are also excluded from nlp.from_disk + by 
default when loading a model. + """ + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + nlp.add_pipe(nlp.create_pipe("entity_ruler")) + assert nlp.pipe_names == ["sentencizer", "entity_ruler"] + exclude = ["tokenizer", "sentencizer"] + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir, exclude=exclude) + new_nlp = load_model_from_path(tmpdir, disable=exclude) + assert "sentencizer" not in new_nlp.pipe_names + assert "entity_ruler" in new_nlp.pipe_names diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 87b087760..ef2b1ee89 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -24,6 +24,7 @@ def test_serialize_empty_doc(en_vocab): def test_serialize_doc_roundtrip_bytes(en_vocab): doc = Doc(en_vocab, words=["hello", "world"]) + doc.cats = {"A": 0.5} doc_b = doc.to_bytes() new_doc = Doc(en_vocab).from_bytes(doc_b) assert new_doc.to_bytes() == doc_b @@ -66,12 +67,17 @@ def test_serialize_doc_exclude(en_vocab): def test_serialize_doc_bin(): doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True) texts = ["Some text", "Lots of texts...", "..."] + cats = {"A": 0.5} nlp = English() for doc in nlp.pipe(texts): + doc.cats = cats doc_bin.add(doc) bytes_data = doc_bin.to_bytes() # Deserialize later, e.g. in a new process nlp = spacy.blank("en") doc_bin = DocBin().from_bytes(bytes_data) - list(doc_bin.get_docs(nlp.vocab)) + reloaded_docs = list(doc_bin.get_docs(nlp.vocab)) + for i, doc in enumerate(reloaded_docs): + assert doc.text == texts[i] + assert doc.cats == cats diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 2b0bcc15e..102b87142 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -65,6 +65,20 @@ def test_language_evaluate(nlp): nlp.evaluate([text, gold]) +def test_evaluate_no_pipe(nlp): + """Test that docs are processed correctly within Language.pipe if the + component doesn't expose a .pipe method.""" + + def pipe(doc): + return doc + + text = "hello world" + annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} + nlp = Language(Vocab()) + nlp.add_pipe(pipe) + nlp.evaluate([(text, annots)]) + + def vector_modification_pipe(doc): doc.vector += 1 return doc diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 92a607e5b..888028b6c 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -12,8 +12,22 @@ from .util import get_doc test_las_apple = [ [ "Apple is looking at buying U.K. 
startup for $ 1 billion", - {"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7], - "deps": ['nsubj', 'aux', 'ROOT', 'prep', 'pcomp', 'compound', 'dobj', 'prep', 'quantmod', 'compound', 'pobj']}, + { + "heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7], + "deps": [ + "nsubj", + "aux", + "ROOT", + "prep", + "pcomp", + "compound", + "dobj", + "prep", + "quantmod", + "compound", + "pobj", + ], + }, ] ] @@ -59,7 +73,7 @@ def test_las_per_type(en_vocab): en_vocab, words=input_.split(" "), heads=([h - i for i, h in enumerate(annot["heads"])]), - deps=annot["deps"] + deps=annot["deps"], ) gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) doc[0].dep_ = "compound" diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py new file mode 100644 index 000000000..2d71588cc --- /dev/null +++ b/spacy/tests/tokenizer/test_explain.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.util import get_lang_class + +# Only include languages with no external dependencies +# "is" seems to confuse importlib, so we're also excluding it for now +# excluded: ja, ru, th, uk, vi, zh, is +LANGUAGES = [ + pytest.param("fr", marks=pytest.mark.slow()), + pytest.param("af", marks=pytest.mark.slow()), + pytest.param("ar", marks=pytest.mark.slow()), + pytest.param("bg", marks=pytest.mark.slow()), + "bn", + pytest.param("ca", marks=pytest.mark.slow()), + pytest.param("cs", marks=pytest.mark.slow()), + pytest.param("da", marks=pytest.mark.slow()), + pytest.param("de", marks=pytest.mark.slow()), + "el", + "en", + pytest.param("es", marks=pytest.mark.slow()), + pytest.param("et", marks=pytest.mark.slow()), + pytest.param("fa", marks=pytest.mark.slow()), + pytest.param("fi", marks=pytest.mark.slow()), + "fr", + pytest.param("ga", marks=pytest.mark.slow()), + pytest.param("he", marks=pytest.mark.slow()), + pytest.param("hi", marks=pytest.mark.slow()), + pytest.param("hr", marks=pytest.mark.slow()), + "hu", + pytest.param("id", marks=pytest.mark.slow()), + pytest.param("it", marks=pytest.mark.slow()), + pytest.param("kn", marks=pytest.mark.slow()), + pytest.param("lb", marks=pytest.mark.slow()), + pytest.param("lt", marks=pytest.mark.slow()), + pytest.param("lv", marks=pytest.mark.slow()), + pytest.param("nb", marks=pytest.mark.slow()), + pytest.param("nl", marks=pytest.mark.slow()), + "pl", + pytest.param("pt", marks=pytest.mark.slow()), + pytest.param("ro", marks=pytest.mark.slow()), + pytest.param("si", marks=pytest.mark.slow()), + pytest.param("sk", marks=pytest.mark.slow()), + pytest.param("sl", marks=pytest.mark.slow()), + pytest.param("sq", marks=pytest.mark.slow()), + pytest.param("sr", marks=pytest.mark.slow()), + pytest.param("sv", marks=pytest.mark.slow()), + pytest.param("ta", marks=pytest.mark.slow()), + pytest.param("te", marks=pytest.mark.slow()), + pytest.param("tl", marks=pytest.mark.slow()), + pytest.param("tr", marks=pytest.mark.slow()), + pytest.param("tt", marks=pytest.mark.slow()), + pytest.param("ur", marks=pytest.mark.slow()), +] + + +@pytest.mark.parametrize("lang", LANGUAGES) +def test_tokenizer_explain(lang): + tokenizer = get_lang_class(lang).Defaults.create_tokenizer() + examples = pytest.importorskip("spacy.lang.{}.examples".format(lang)) + for sentence in examples.sentences: + tokens = [t.text for t in tokenizer(sentence) if not t.is_space] + debug_tokens = [t[1] for t in tokenizer.explain(sentence)] + assert tokens == debug_tokens diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py 
index a1017bac8..e2c0e3de8 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -57,10 +57,8 @@ URLS_SHOULD_MATCH = [ pytest.param( "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail() ), - pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()), - pytest.param( - "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() - ), + "http://foo.com/blah_blah_(wikipedia)", + "http://foo.com/blah_blah_(wikipedia)_(again)", pytest.param("http://⌘.ws", marks=pytest.mark.xfail()), pytest.param("http://⌘.ws/", marks=pytest.mark.xfail()), pytest.param("http://☺.damowmow.com/", marks=pytest.mark.xfail()), @@ -107,8 +105,8 @@ URLS_SHOULD_NOT_MATCH = [ "NASDAQ:GOOG", "http://-a.b.co", pytest.param("foo.com", marks=pytest.mark.xfail()), - pytest.param("http://1.1.1.1.1", marks=pytest.mark.xfail()), - pytest.param("http://www.foo.bar./", marks=pytest.mark.xfail()), + "http://1.1.1.1.1", + "http://www.foo.bar./", ] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 13f799f84..c1ac3dd06 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -17,6 +17,8 @@ import re from .tokens.doc cimport Doc from .strings cimport hash_string from .compat import unescape_unicode +from .attrs import intify_attrs +from .symbols import ORTH from .errors import Errors, Warnings, deprecation_warning from . import util @@ -107,6 +109,18 @@ cdef class Tokenizer: if self._property_init_count <= self._property_init_max: self._property_init_count += 1 + property rules: + def __get__(self): + return self._rules + + def __set__(self, rules): + self._rules = {} + self._reset_cache([key for key in self._cache]) + self._reset_specials() + self._cache = PreshMap() + self._specials = PreshMap() + self._load_special_tokenization(rules) + def __reduce__(self): args = (self.vocab, self._rules, @@ -572,7 +586,7 @@ cdef class Tokenizer: attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings] orth = "".join([spec[ORTH] for spec in attrs]) if chunk != orth: - raise ValueError(Errors.E187.format(chunk=chunk, orth=orth, token_attrs=substrings)) + raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings)) def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. @@ -612,6 +626,73 @@ cdef class Tokenizer: self._flush_specials() self._load_special_cases(self._rules) + def explain(self, text): + """A debugging tokenizer that provides information about which + tokenizer rule or pattern was matched for each token. The tokens + produced are identical to `nlp.tokenizer()` except for whitespace + tokens. + + string (unicode): The string to tokenize. 
+ RETURNS (list): A list of (pattern_string, token_string) tuples + + DOCS: https://spacy.io/api/tokenizer#explain + """ + prefix_search = self.prefix_search + suffix_search = self.suffix_search + infix_finditer = self.infix_finditer + token_match = self.token_match + special_cases = {} + for orth, special_tokens in self.rules.items(): + special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] + tokens = [] + for substring in text.split(): + suffixes = [] + while substring: + while prefix_search(substring) or suffix_search(substring): + if substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' + break + if prefix_search(substring): + split = prefix_search(substring).end() + # break if pattern matches the empty string + if split == 0: + break + tokens.append(("PREFIX", substring[:split])) + substring = substring[split:] + if substring in special_cases: + continue + if suffix_search(substring): + split = suffix_search(substring).start() + # break if pattern matches the empty string + if split == len(substring): + break + suffixes.append(("SUFFIX", substring[split:])) + substring = substring[:split] + if substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' + elif token_match(substring): + tokens.append(("TOKEN_MATCH", substring)) + substring = '' + elif list(infix_finditer(substring)): + infixes = infix_finditer(substring) + offset = 0 + for match in infixes: + if substring[offset : match.start()]: + tokens.append(("TOKEN", substring[offset : match.start()])) + if substring[match.start() : match.end()]: + tokens.append(("INFIX", substring[match.start() : match.end()])) + offset = match.end() + if substring[offset:]: + tokens.append(("TOKEN", substring[offset:])) + substring = '' + elif substring: + tokens.append(("TOKEN", substring)) + substring = '' + tokens.extend(reversed(suffixes)) + return tokens + def to_disk(self, path, **kwargs): """Save the current state to a directory. 
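As a quick illustration of the `Tokenizer.explain` helper added in the hunk above, here is a minimal usage sketch. It assumes a blank English pipeline; the exact labels depend on which prefix, suffix, infix and special-case rules fire for the input, so the printed pairs are only indicative.

```python
# Minimal sketch of the new Tokenizer.explain debugging helper.
# Assumes a blank English pipeline; labels depend on the rules that fire.
from spacy.lang.en import English

nlp = English()
for pattern, substring in nlp.tokenizer.explain("Let's go!"):
    print(pattern, repr(substring))
# Indicative output:
#   SPECIAL-1 'Let'
#   SPECIAL-2 "'s"
#   TOKEN 'go'
#   SUFFIX '!'
```

The new `test_tokenizer_explain` test relies on the same property: ignoring whitespace, the substrings returned by `explain` line up one-to-one with the tokens produced by calling the tokenizer directly.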
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 5f890de45..a5d06491a 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -329,7 +329,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): doc.c[i].head += offset # Double doc.c max_length if necessary (until big enough for all new tokens) while doc.length + nb_subtokens - 1 >= doc.max_length: - doc._realloc(doc.length * 2) + doc._realloc(doc.max_length * 2) # Move tokens after the split to create space for the new tokens doc.length = len(doc) + nb_subtokens -1 to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 18cb8a234..b60a6d7b3 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -58,6 +58,7 @@ class DocBin(object): self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] self.tokens = [] self.spaces = [] + self.cats = [] self.user_data = [] self.strings = set() self.store_user_data = store_user_data @@ -82,6 +83,7 @@ class DocBin(object): spaces = spaces.reshape((spaces.shape[0], 1)) self.spaces.append(numpy.asarray(spaces, dtype=bool)) self.strings.update(w.text for w in doc) + self.cats.append(doc.cats) if self.store_user_data: self.user_data.append(srsly.msgpack_dumps(doc.user_data)) @@ -102,6 +104,7 @@ class DocBin(object): words = [vocab.strings[orth] for orth in tokens[:, orth_col]] doc = Doc(vocab, words=words, spaces=spaces) doc = doc.from_array(self.attrs, tokens) + doc.cats = self.cats[i] if self.store_user_data: user_data = srsly.msgpack_loads(self.user_data[i], use_list=False) doc.user_data.update(user_data) @@ -121,6 +124,7 @@ class DocBin(object): self.tokens.extend(other.tokens) self.spaces.extend(other.spaces) self.strings.update(other.strings) + self.cats.extend(other.cats) if self.store_user_data: self.user_data.extend(other.user_data) @@ -140,6 +144,7 @@ class DocBin(object): "spaces": numpy.vstack(self.spaces).tobytes("C"), "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "strings": list(self.strings), + "cats": self.cats, } if self.store_user_data: msg["user_data"] = self.user_data @@ -164,6 +169,7 @@ class DocBin(object): flat_spaces = flat_spaces.reshape((flat_spaces.size, 1)) self.tokens = NumpyOps().unflatten(flat_tokens, lengths) self.spaces = NumpyOps().unflatten(flat_spaces, lengths) + self.cats = msg["cats"] if self.store_user_data and "user_data" in msg: self.user_data = list(msg["user_data"]) for tokens in self.tokens: diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 62665fcc5..7f231887f 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -21,6 +21,9 @@ ctypedef fused LexemeOrToken: cdef int set_children_from_heads(TokenC* tokens, int length) except -1 +cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1 + + cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 6afe89e05..716df1087 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -887,6 +887,7 @@ cdef class Doc: "array_body": lambda: self.to_array(array_head), "sentiment": lambda: self.sentiment, "tensor": lambda: self.tensor, + "cats": lambda: self.cats, } for key in kwargs: if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): @@ -916,6 +917,7 @@ cdef class Doc: "array_body": lambda b: None, "sentiment": lambda b: None, "tensor": lambda b: None, + "cats": 
lambda b: None, "user_data_keys": lambda b: None, "user_data_values": lambda b: None, } @@ -937,6 +939,8 @@ cdef class Doc: self.sentiment = msg["sentiment"] if "tensor" not in exclude and "tensor" in msg: self.tensor = msg["tensor"] + if "cats" not in exclude and "cats" in msg: + self.cats = msg["cats"] start = 0 cdef const LexemeC* lex cdef unicode orth_ @@ -1153,35 +1157,69 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: tokens[i].r_kids = 0 tokens[i].l_edge = i tokens[i].r_edge = i - # Three times, for non-projectivity. See issue #3170. This isn't a very - # satisfying fix, but I think it's sufficient. - for loop_count in range(3): - # Set left edges - for i in range(length): - child = &tokens[i] - head = &tokens[i + child.head] - if child < head and loop_count == 0: - head.l_kids += 1 - if child.l_edge < head.l_edge: - head.l_edge = child.l_edge - if child.r_edge > head.r_edge: - head.r_edge = child.r_edge - # Set right edges - same as above, but iterate in reverse - for i in range(length-1, -1, -1): - child = &tokens[i] - head = &tokens[i + child.head] - if child > head and loop_count == 0: - head.r_kids += 1 - if child.r_edge > head.r_edge: - head.r_edge = child.r_edge - if child.l_edge < head.l_edge: - head.l_edge = child.l_edge + cdef int loop_count = 0 + cdef bint heads_within_sents = False + # Try up to 10 iterations of adjusting lr_kids and lr_edges in order to + # handle non-projective dependency parses, stopping when all heads are + # within their respective sentence boundaries. We have documented cases + # that need at least 4 iterations, so this is to be on the safe side + # without risking getting stuck in an infinite loop if something is + # terribly malformed. + while not heads_within_sents: + heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) + if loop_count > 10: + user_warning(Warnings.W026) + loop_count += 1 # Set sentence starts for i in range(length): if tokens[i].head == 0 and tokens[i].dep != 0: tokens[tokens[i].l_edge].sent_start = True +cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1: + # May be called multiple times due to non-projectivity. See issues #3170 + # and #4688. 
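+ # Each pass below updates the l/r edges (and, on the first pass, the
+ # kid counts), derives the sentence spans implied by the current edges,
+ # and checks that every token's head falls inside its own span;
+ # returning False tells set_children_from_heads to run another pass.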
+ # Set left edges + cdef TokenC* head + cdef TokenC* child + cdef int i, j + for i in range(length): + child = &tokens[i] + head = &tokens[i + child.head] + if child < head and loop_count == 0: + head.l_kids += 1 + if child.l_edge < head.l_edge: + head.l_edge = child.l_edge + if child.r_edge > head.r_edge: + head.r_edge = child.r_edge + # Set right edges - same as above, but iterate in reverse + for i in range(length-1, -1, -1): + child = &tokens[i] + head = &tokens[i + child.head] + if child > head and loop_count == 0: + head.r_kids += 1 + if child.r_edge > head.r_edge: + head.r_edge = child.r_edge + if child.l_edge < head.l_edge: + head.l_edge = child.l_edge + # Get sentence start positions according to current state + sent_starts = set() + for i in range(length): + if tokens[i].head == 0 and tokens[i].dep != 0: + sent_starts.add(tokens[i].l_edge) + cdef int curr_sent_start = 0 + cdef int curr_sent_end = 0 + # Check whether any heads are not within the current sentence + for i in range(length): + if (i > 0 and i in sent_starts) or i == length - 1: + curr_sent_end = i + for j in range(curr_sent_start, curr_sent_end): + if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1: + return False + curr_sent_start = i + return True + + cdef int _get_tokens_lca(Token token_j, Token token_k): """Given two tokens, returns the index of the lowest common ancestor (LCA) among the two. If they have no common ancestor, -1 is returned. diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 9e99392a9..957e853ca 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -584,6 +584,22 @@ cdef class Span: else: return self.doc[root] + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None): + """Create a `Span` object from the slice `span.text[start : end]`. + + start (int): The index of the first character of the span. + end (int): The index of the first character after the span. + label (uint64 or string): A label to attach to the Span, e.g. for + named entities. + kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. + vector (ndarray[ndim=1, dtype='float32']): A meaning representation of + the span. + RETURNS (Span): The newly constructed object. + """ + start_idx += self.start_char + end_idx += self.start_char + return self.doc.char_span(start_idx, end_idx) + @property def conjuncts(self): """Tokens that are conjoined to the span's root. diff --git a/spacy/util.py b/spacy/util.py index 21c5ea427..693136bc1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -208,7 +208,7 @@ def load_model_from_path(model_path, meta=False, **overrides): factory = factories.get(name, name) component = nlp.create_pipe(factory, config=config) nlp.add_pipe(component, name=name) - return nlp.from_disk(model_path) + return nlp.from_disk(model_path, exclude=disable) def load_model_from_init_py(init_file, **overrides): @@ -301,13 +301,13 @@ def get_component_name(component): return repr(component) -def get_cuda_stream(require=False): +def get_cuda_stream(require=False, non_blocking=True): if CudaStream is None: return None elif isinstance(Model.ops, NumpyOps): return None else: - return CudaStream() + return CudaStream(non_blocking=non_blocking) def get_async(stream, numpy_array): diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 44dddb30c..6b26bf123 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -265,17 +265,12 @@ cdef class Vectors: rows = [self.key2row.get(key, -1.) 
for key in keys] return xp.asarray(rows, dtype="i") else: - targets = set() + row2key = {row: key for key, row in self.key2row.items()} if row is not None: - targets.add(row) + return row2key[row] else: - targets.update(rows) - results = [] - for key, row in self.key2row.items(): - if row in targets: - results.append(key) - targets.remove(row) - return xp.asarray(results, dtype="uint64") + results = [row2key[row] for row in rows] + return xp.asarray(results, dtype="uint64") def add(self, key, *, vector=None, row=None): """Add a key to the table. Keys can be mapped to an existing vector diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 4a21537cb..3cf0095ee 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -3,7 +3,6 @@ from __future__ import unicode_literals from libc.string cimport memcpy -import numpy import srsly from collections import OrderedDict from thinc.neural.util import get_array_module @@ -361,7 +360,8 @@ cdef class Vocab: minn = len(word) if maxn is None: maxn = len(word) - vectors = numpy.zeros((self.vectors_length,), dtype="f") + xp = get_array_module(self.vectors.data) + vectors = xp.zeros((self.vectors_length,), dtype="f") # Fasttext's ngram computation taken from # https://github.com/facebookresearch/fastText ngrams_size = 0; @@ -381,7 +381,7 @@ cdef class Vocab: j = j + 1 if (n >= minn and not (n == 1 and (i == 0 or j == len(word)))): if self.strings[ngram] in self.vectors.key2row: - vectors = numpy.add(self.vectors[self.strings[ngram]],vectors) + vectors = xp.add(self.vectors[self.strings[ngram]], vectors) ngrams_size += 1 n = n + 1 if ngrams_size > 0: diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index a37921f3c..94bedd889 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -166,14 +166,13 @@ All output files generated by this command are compatible with ### Converter options - - | ID | Description | | ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `auto` | Automatically pick converter based on file extension and file content (default). | | `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | | `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | +| `jsonl` | NER data formatted as JSONL with one dict per line and a `"text"` and `"spans"` key. This is also the format exported by the [Prodigy](https://prodi.gy) annotation tool. See [sample data](https://raw.githubusercontent.com/explosion/projects/master/ner-fashion-brands/fashion_brands_training.jsonl). 
| ## Debug data {#debug-data new="2.2"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index ad684f51e..4f948e425 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -7,7 +7,7 @@ source: spacy/tokens/doc.pyx A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to -compressed binary strings. The `Doc` object holds an array of `TokenC]` structs. +compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs. The Python-level `Token` and [`Span`](/api/span) objects are views of this array, i.e. they don't own the data themselves. diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index 398b71708..feb167a9d 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -122,44 +122,44 @@ The L2 norm of the lexeme's vector representation. ## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | The lexeme's vocabulary. | -| `text` | unicode | Verbatim text content. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | -| `rank` | int | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. | -| `flags` | int | Container of the lexeme's binary flags. | -| `norm` | int | The lexemes's norm, i.e. a normalized form of the lexeme text. | -| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. | -| `lower` | int | Lowercase form of the word. | -| `lower_` | unicode | Lowercase form of the word. | -| `shape` | int | Transform of the word's string, to show orthographic features. | -| `shape_` | unicode | Transform of the word's string, to show orthographic features. | -| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. | -| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. | -| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. | -| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. | -| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. | -| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. | -| `is_lower` | bool | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. | -| `is_upper` | bool | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. | -| `is_title` | bool | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. | -| `is_punct` | bool | Is the lexeme punctuation? | -| `is_left_punct` | bool | Is the lexeme a left punctuation mark, e.g. `(`? | -| `is_right_punct` | bool | Is the lexeme a right punctuation mark, e.g. `)`? | -| `is_space` | bool | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. | -| `is_bracket` | bool | Is the lexeme a bracket? | -| `is_quote` | bool | Is the lexeme a quotation mark? 
| -| `is_currency` 2.0.8 | bool | Is the lexeme a currency symbol? | -| `like_url` | bool | Does the lexeme resemble a URL? | -| `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the lexeme resemble an email address? | -| `is_oov` | bool | Is the lexeme out-of-vocabulary? | -| `is_stop` | bool | Is the lexeme part of a "stop list"? | -| `lang` | int | Language of the parent vocabulary. | -| `lang_` | unicode | Language of the parent vocabulary. | -| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | -| `cluster` | int | Brown cluster ID. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. | +| Name | Type | Description | +| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | The lexeme's vocabulary. | +| `text` | unicode | Verbatim text content. | +| `orth` | int | ID of the verbatim text content. | +| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | +| `rank` | int | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. | +| `flags` | int | Container of the lexeme's binary flags. | +| `norm` | int | The lexemes's norm, i.e. a normalized form of the lexeme text. | +| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. | +| `lower` | int | Lowercase form of the word. | +| `lower_` | unicode | Lowercase form of the word. | +| `shape` | int | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. | +| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. | +| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. | +| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. | +| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. | +| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. | +| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. | +| `is_lower` | bool | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. | +| `is_upper` | bool | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. | +| `is_title` | bool | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. | +| `is_punct` | bool | Is the lexeme punctuation? | +| `is_left_punct` | bool | Is the lexeme a left punctuation mark, e.g. `(`? 
| +| `is_right_punct` | bool | Is the lexeme a right punctuation mark, e.g. `)`? | +| `is_space` | bool | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. | +| `is_bracket` | bool | Is the lexeme a bracket? | +| `is_quote` | bool | Is the lexeme a quotation mark? | +| `is_currency` 2.0.8 | bool | Is the lexeme a currency symbol? | +| `like_url` | bool | Does the lexeme resemble a URL? | +| `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. | +| `like_email` | bool | Does the lexeme resemble an email address? | +| `is_oov` | bool | Is the lexeme out-of-vocabulary? | +| `is_stop` | bool | Is the lexeme part of a "stop list"? | +| `lang` | int | Language of the parent vocabulary. | +| `lang_` | unicode | Language of the parent vocabulary. | +| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | +| `cluster` | int | Brown cluster ID. | +| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. | diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index c7311a401..90ecd3416 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -38,6 +38,7 @@ be shown. | Name | Type | Description | | --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | +| `max_length` | int | Deprecated argument - the `PhraseMatcher` does not have a phrase length limit anymore. | | `attr` 2.1 | int / unicode | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | | `validate` 2.1 | bool | Validate patterns added to the matcher. | | **RETURNS** | `PhraseMatcher` | The newly constructed object. | diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 35348217b..b1824573c 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -58,4 +58,5 @@ Update the evaluation scores from a single [`Doc`](/api/doc) / | `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. | | `textcat_score` 2.2 | float | F-score on positive label for binary exclusive, macro-averaged F-score for 3+ exclusive, macro-averaged AUC ROC score for multilabel (`-1` if undefined). | | `textcats_per_cat` 2.2 | dict | Scores per textcat label, keyed by label. | +| `las_per_type` 2.2.3 | dict | Labelled dependency scores, keyed by label. | | `scores` | dict | All scores, keyed by type. | diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 237cd6a8a..c9b935f22 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -60,7 +60,7 @@ the component has been added to the pipeline using > sentencizer = nlp.create_pipe("sentencizer") > nlp.add_pipe(sentencizer) > doc = nlp("This is a sentence. This is another sentence.") -> assert list(doc.sents) == 2 +> assert len(list(doc.sents)) == 2 > ``` | Name | Type | Description | diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 8d7ee5928..68402d1b4 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -408,71 +408,71 @@ The L2 norm of the token's vector representation. 
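To make the `vector_norm` description above concrete, here is a minimal sketch (an editorial addition, not part of the diff). It assumes a model with word vectors, such as `en_core_web_md`, is installed:

```python
import numpy
import spacy

nlp = spacy.load("en_core_web_md")  # assumption: any vectors-enabled model works
token = nlp("apple")[0]
# Token.vector_norm is the L2 norm of Token.vector
assert abs(token.vector_norm - numpy.linalg.norm(token.vector)) < 1e-4
```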
## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | -| `text` | unicode | Verbatim text content. | -| `text_with_ws` | unicode | Text content, with trailing space character if present. | -| `whitespace_` | unicode | Trailing space character if present. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | -| `head` | `Token` | The syntactic parent, or "governor", of this token. | -| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | -| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | -| `i` | int | The index of the token within the parent document. | -| `ent_type` | int | Named entity type. | -| `ent_type_` | unicode | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | -| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | -| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_kb_id_` 2.2 | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `lemma` | int | Base form of the token, with no inflectional suffixes. | -| `lemma_` | unicode | Base form of the token, with no inflectional suffixes. | -| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `lower` | int | Lowercase form of the token. | -| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | -| `shape` | int | Transform of the tokens's string, to show orthographic features. For example, "Xxxx" or "dd". | -| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. For example, "Xxxx" or "dd". | -| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | -| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. 
| -| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. | -| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | -| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | -| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | -| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | -| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | -| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | -| `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `(`? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `)`? | -| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | -| `is_bracket` | bool | Is the token a bracket? | -| `is_quote` | bool | Is the token a quotation mark? | -| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | -| `like_url` | bool | Does the token resemble a URL? | -| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Is the token out-of-vocabulary? | -| `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech. | -| `pos_` | unicode | Coarse-grained part-of-speech. | -| `tag` | int | Fine-grained part-of-speech. | -| `tag_` | unicode | Fine-grained part-of-speech. | -| `dep` | int | Syntactic dependency relation. | -| `dep_` | unicode | Syntactic dependency relation. | -| `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | unicode | Language of the parent document's vocabulary. | -| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | -| `idx` | int | The character offset of the token within the parent document. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | -| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `cluster` | int | Brown cluster ID. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | +| Name | Type | Description | +| -------------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | +| `text` | unicode | Verbatim text content. | +| `text_with_ws` | unicode | Text content, with trailing space character if present. | +| `whitespace_` | unicode | Trailing space character if present. | +| `orth` | int | ID of the verbatim text content. 
| +| `orth_` | unicode | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | +| `head` | `Token` | The syntactic parent, or "governor", of this token. | +| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | +| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | +| `i` | int | The index of the token within the parent document. | +| `ent_type` | int | Named entity type. | +| `ent_type_` | unicode | Named entity type. | +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_kb_id_` 2.2 | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `lemma` | int | Base form of the token, with no inflectional suffixes. | +| `lemma_` | unicode | Base form of the token, with no inflectional suffixes. | +| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `lower` | int | Lowercase form of the token. | +| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | +| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | +| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | +| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | +| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. | +| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | +| `is_ascii` | bool | Does the token consist of ASCII characters? 
Equivalent to `all(ord(c) < 128 for c in token.text)`. | +| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | +| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | +| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | +| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | +| `is_punct` | bool | Is the token punctuation? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | +| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | +| `is_bracket` | bool | Is the token a bracket? | +| `is_quote` | bool | Is the token a quotation mark? | +| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | +| `like_url` | bool | Does the token resemble a URL? | +| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | +| `like_email` | bool | Does the token resemble an email address? | +| `is_oov` | bool | Is the token out-of-vocabulary? | +| `is_stop` | bool | Is the token part of a "stop list"? | +| `pos` | int | Coarse-grained part-of-speech. | +| `pos_` | unicode | Coarse-grained part-of-speech. | +| `tag` | int | Fine-grained part-of-speech. | +| `tag_` | unicode | Fine-grained part-of-speech. | +| `dep` | int | Syntactic dependency relation. | +| `dep_` | unicode | Syntactic dependency relation. | +| `lang` | int | Language of the parent document's vocabulary. | +| `lang_` | unicode | Language of the parent document's vocabulary. | +| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | +| `idx` | int | The character offset of the token within the parent document. | +| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | +| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `cluster` | int | Brown cluster ID. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index d6ab73f14..7462af739 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,15 +34,15 @@ the > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` -| Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A boolean function matching strings to be recognized as tokens. | -| **RETURNS** | `Tokenizer` | The newly constructed object. 
| +| Name | Type | Description | +| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -128,6 +128,25 @@ and examples. | `string` | unicode | The string to specially tokenize. | | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. | +## Tokenizer.explain {#explain tag="method"} + +Tokenize a string with a slow debugging tokenizer that provides information +about which tokenizer rule or pattern was matched for each token. The tokens +produced are identical to `Tokenizer.__call__` except for whitespace tokens. + +> #### Example +> +> ```python +> tok_exp = nlp.tokenizer.explain("(don't)") +> assert [t[0] for t in tok_exp] == ["PREFIX", "SPECIAL-1", "SPECIAL-2", "SUFFIX"] +> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"] +> ``` + +| Name | Type | Description | +| ------------| -------- | --------------------------------------------------- | +| `string` | unicode | The string to tokenize with the debugging tokenizer | +| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | + ## Tokenizer.to_disk {#to_disk tag="method"} Serialize the tokenizer to disk. @@ -198,12 +217,14 @@ it. ## Attributes {#attributes} -| Name | Type | Description | -| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | -| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | -| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | +| Name | Type | Description | +| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | +| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | +| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. 
| +| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an `re.MatchObject` or `None. | +| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 1d2bd6d63..31bc3c549 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -1,31 +1,29 @@ --- title: Models -teaser: Downloadable statistical models for spaCy to predict linguistic features +teaser: Downloadable pretrained models for spaCy menu: - ['Quickstart', 'quickstart'] - ['Model Architecture', 'architecture'] - ['Conventions', 'conventions'] --- -spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity -recognition**. The models have been designed and implemented from scratch -specifically for spaCy, to give you an unmatched balance of speed, size and -accuracy. A novel bloom embedding strategy with subword features is used to -support huge vocabularies in tiny tables. Convolutional layers with residual -connections, layer normalization and maxout non-linearity are used, giving much -better efficiency than the standard BiLSTM solution. For more details, see the -notes on the [model architecture](#architecture). +The models directory includes two types of pretrained models: -The parser and NER use an imitation learning objective to deliver **accuracy -in-line with the latest research systems**, even when evaluated from raw text. -With these innovations, spaCy v2.0's models are **10× smaller**, **20% more -accurate**, and **even cheaper to run** than the previous generation. +1. **Core models:** General-purpose pretrained models to predict named entities, + part-of-speech tags and syntactic dependencies. Can be used out-of-the-box + and fine-tuned on more specific data. +2. **Starter models:** Transfer learning starter packs with pretrained weights + you can initialize your models with to achieve better accuracy. They can + include word vectors (which will be used as features during training) or + other pretrained representations like BERT. These models don't include + components for specific tasks like NER or text classification and are + intended to be used as base models when training your own models. ### Quickstart {hidden="true"} import QuickstartModels from 'widgets/quickstart-models.js' - + @@ -36,10 +34,20 @@ For more details on how to use models with spaCy, see the ## Model architecture {#architecture} -spaCy's statistical models have been custom-designed to give a high-performance -mix of speed and accuracy. The current architecture hasn't been published yet, -but in the meantime we prepared a video that explains how the models work, with -particular focus on NER. +spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity +recognition**. The models have been designed and implemented from scratch +specifically for spaCy, to give you an unmatched balance of speed, size and +accuracy. A novel bloom embedding strategy with subword features is used to +support huge vocabularies in tiny tables. Convolutional layers with residual +connections, layer normalization and maxout non-linearity are used, giving much +better efficiency than the standard BiLSTM solution. + +The parser and NER use an imitation learning objective to deliver **accuracy +in-line with the latest research systems**, even when evaluated from raw text. 
+With these innovations, spaCy v2.0's models are **10× smaller**, **20% more +accurate**, and **even cheaper to run** than the previous generation. The +current architecture hasn't been published yet, but in the meantime we prepared +a video that explains how the models work, with particular focus on NER. diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md index 0e8784187..0dfee8636 100644 --- a/website/docs/usage/101/_named-entities.md +++ b/website/docs/usage/101/_named-entities.md @@ -1,9 +1,10 @@ A named entity is a "real-world object" that's assigned a name – for example, a -person, a country, a product or a book title. spaCy can **recognize** -[various types](/api/annotation#named-entities) of named entities in a document, -by asking the model for a **prediction**. Because models are statistical and -strongly depend on the examples they were trained on, this doesn't always work -_perfectly_ and might need some tuning later, depending on your use case. +person, a country, a product or a book title. spaCy can **recognize +[various types](/api/annotation#named-entities)** of named entities in a +document, by asking the model for a **prediction**. Because models are +statistical and strongly depend on the examples they were trained on, this +doesn't always work _perfectly_ and might need some tuning later, depending on +your use case. Named entities are available as the `ents` property of a `Doc`: diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index 73c35950f..9ff55f815 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -68,8 +68,8 @@ representation consists of 300 dimensions of `0`, which means it's practically nonexistent. If your application will benefit from a **large vocabulary** with more vectors, you should consider using one of the larger models or loading in a full vector package, for example, -[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), which includes over **1 -million unique vectors**. +[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), which includes +over **1 million unique vectors**. spaCy is able to compare two objects, and make a prediction of **how similar they are**. Predicting similarity is useful for building recommendation systems diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 039534fb7..3af7d9fd1 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -435,22 +435,22 @@ import spacy from spacy.tokens import Span nlp = spacy.load("en_core_web_sm") -doc = nlp("FB is hiring a new Vice President of global policy") +doc = nlp("fb is hiring a new vice president of global policy") ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] print('Before', ents) -# the model didn't recognise "FB" as an entity :( +# the model didn't recognise "fb" as an entity :( fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity doc.ents = list(doc.ents) + [fb_ent] ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] print('After', ents) -# [('FB', 0, 2, 'ORG')] 🎉 +# [('fb', 0, 2, 'ORG')] 🎉 ``` Keep in mind that you need to create a `Span` with the start and end index of the **token**, not the start and end index of the entity in the document. 
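To make the distinction concrete, here is a minimal sketch (an editorial addition, not part of the diff), assuming `en_core_web_sm` is installed, that builds the same entity once from token indices and once from character offsets:

```python
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
doc = nlp("fb is hiring a new vice president of global policy")
fb_from_tokens = Span(doc, 0, 1, label="ORG")     # token indices (0, 1)
fb_from_chars = doc.char_span(0, 2, label="ORG")  # character offsets (0, 2)
assert fb_from_tokens.text == fb_from_chars.text == "fb"
```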
In -this case, "FB" is token `(0, 1)` – but at the document level, the entity will +this case, "fb" is token `(0, 1)` – but at the document level, the entity will have the start and end indices `(0, 2)`. #### Setting entity annotations from array {#setting-from-array} @@ -715,7 +715,7 @@ assert "gimme" not in [w.text for w in nlp('("...gimme...?")')] The special case rules have precedence over the punctuation splitting: ```python -nlp.tokenizer.add_special_case("...gimme...?", [{ORTH: "...gimme...?"}]) +nlp.tokenizer.add_special_case("...gimme...?", [{"ORTH": "...gimme...?"}]) assert len(nlp("...gimme...?")) == 1 ``` @@ -725,40 +725,52 @@ spaCy introduces a novel tokenization algorithm, that gives a better balance between performance, ease of definition, and ease of alignment into the original string. -After consuming a prefix or infix, we consult the special cases again. We want +After consuming a prefix or suffix, we consult the special cases again. We want the special cases to handle things like "don't" in English, and we want the same rule to work for "(don't)!". We do this by splitting off the open bracket, then -the exclamation, then the close bracket, and finally matching the special-case. +the exclamation, then the close bracket, and finally matching the special case. Here's an implementation of the algorithm in Python, optimized for readability rather than performance: ```python -def tokenizer_pseudo_code(text, special_cases, - find_prefix, find_suffix, find_infixes): +def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, + infix_finditer, token_match): tokens = [] - for substring in text.split(' '): + for substring in text.split(): suffixes = [] while substring: + while prefix_search(substring) or suffix_search(substring): + if substring in special_cases: + tokens.extend(special_cases[substring]) + substring = '' + break + if prefix_search(substring): + split = prefix_search(substring).end() + tokens.append(substring[:split]) + substring = substring[split:] + if substring in special_cases: + continue + if suffix_search(substring): + split = suffix_search(substring).start() + suffixes.append(substring[split:]) + substring = substring[:split] if substring in special_cases: tokens.extend(special_cases[substring]) substring = '' - elif find_prefix(substring) is not None: - split = find_prefix(substring) - tokens.append(substring[:split]) - substring = substring[split:] - elif find_suffix(substring) is not None: - split = find_suffix(substring) - suffixes.append(substring[-split:]) - substring = substring[:-split] - elif find_infixes(substring): - infixes = find_infixes(substring) + elif token_match(substring): + tokens.append(substring) + substring = '' + elif list(infix_finditer(substring)): + infixes = infix_finditer(substring) offset = 0 for match in infixes: tokens.append(substring[offset : match.start()]) tokens.append(substring[match.start() : match.end()]) offset = match.end() - substring = substring[offset:] - else: + if substring[offset:]: + tokens.append(substring[offset:]) + substring = '' + elif substring: tokens.append(substring) substring = '' tokens.extend(reversed(suffixes)) @@ -767,16 +779,45 @@ def tokenizer_pseudo_code(text, special_cases, The algorithm can be summarized as follows: -1. Iterate over space-separated substrings +1. Iterate over whitespace-separated substrings. 2. Check whether we have an explicitly defined rule for this substring. If we do, use it. -3. Otherwise, try to consume a prefix. -4. 
If we consumed a prefix, go back to the beginning of the loop, so that - special-cases always get priority. -5. If we didn't consume a prefix, try to consume a suffix. -6. If we can't consume a prefix or suffix, look for "infixes" — stuff like - hyphens etc. -7. Once we can't consume any more of the string, handle it as a single token. +3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, + so that special cases always get priority. +4. If we didn't consume a prefix, try to consume a suffix and then go back to + #2. +5. If we can't consume a prefix or a suffix, look for a special case. +6. Next, look for a token match. +7. Look for "infixes" — stuff like hyphens etc. and split the substring into + tokens on all infixes. +8. Once we can't consume any more of the string, handle it as a single token. + +#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} + +A working implementation of the pseudo-code above is available for debugging as +[`nlp.tokenizer.explain(text)`](/api/tokenizer#explain). It returns a list of +tuples showing which tokenizer rule or pattern was matched for each token. The +tokens produced are identical to `nlp.tokenizer()` except for whitespace tokens: + +```python +### {executable="true"} +from spacy.lang.en import English + +nlp = English() +text = '''"Let's go!"''' +doc = nlp(text) +tok_exp = nlp.tokenizer.explain(text) +assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp] +for t in tok_exp: + print(t[1], "\\t", t[0]) + +# " PREFIX +# Let SPECIAL-1 +# 's SPECIAL-2 +# go TOKEN +# ! SUFFIX +# " SUFFIX +``` ### Customizing spaCy's Tokenizer class {#native-tokenizers} @@ -792,8 +833,9 @@ domain. There are five things you would need to define: 4. A function `infixes_finditer`, to handle non-whitespace separators, such as hyphens etc. 5. An optional boolean function `token_match` matching strings that should never - be split, overriding the previous rules. Useful for things like URLs or - numbers. + be split, overriding the infix rules. Useful for things like URLs or numbers. + Note that prefixes and suffixes will be split off before `token_match` is + applied. You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its @@ -805,21 +847,23 @@ import re import spacy from spacy.tokenizer import Tokenizer +special_cases = {":)": [{"ORTH": ":)"}]} prefix_re = re.compile(r'''^[\[\("']''') suffix_re = re.compile(r'''[\]\)"']$''') infix_re = re.compile(r'''[-~]''') simple_url_re = re.compile(r'''^https?://''') def custom_tokenizer(nlp): - return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, + return Tokenizer(nlp.vocab, rules=special_cases, + prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, token_match=simple_url_re.match) nlp = spacy.load("en_core_web_sm") nlp.tokenizer = custom_tokenizer(nlp) -doc = nlp("hello-world.") -print([t.text for t in doc]) +doc = nlp("hello-world. :)") +print([t.text for t in doc]) # ['hello', '-', 'world.', ':)'] ``` If you need to subclass the tokenizer instead, the relevant methods to @@ -838,15 +882,16 @@ only be applied at the **end of a token**, so your expression should end with a -#### Adding to existing rule sets {#native-tokenizer-additions} +#### Modifying existing rule sets {#native-tokenizer-additions} In many situations, you don't necessarily need entirely custom rules. 
Sometimes you just want to add another character to the prefixes, suffixes or infixes. The default prefix, suffix and infix rules are available via the `nlp` object's -`Defaults` and the [`Tokenizer.suffix_search`](/api/tokenizer#attributes) -attribute is writable, so you can overwrite it with a compiled regular -expression object using of the modified default rules. spaCy ships with utility -functions to help you compile the regular expressions – for example, +`Defaults` and the `Tokenizer` attributes such as +[`Tokenizer.suffix_search`](/api/tokenizer#attributes) are writable, so you can +overwrite them with compiled regular expression objects using modified default +rules. spaCy ships with utility functions to help you compile the regular +expressions – for example, [`compile_suffix_regex`](/api/top-level#util.compile_suffix_regex): ```python @@ -855,8 +900,15 @@ suffix_regex = spacy.util.compile_suffix_regex(suffixes) nlp.tokenizer.suffix_search = suffix_regex.search ``` -For an overview of the default regular expressions, see -[`lang/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/punctuation.py). +Similarly, you can remove a character from the default suffixes: + +```python +suffixes = list(nlp.Defaults.suffixes) +suffixes.remove("\\\\[") +suffix_regex = spacy.util.compile_suffix_regex(suffixes) +nlp.tokenizer.suffix_search = suffix_regex.search +``` + The `Tokenizer.suffix_search` attribute should be a function which takes a unicode string and returns a **regex match object** or `None`. Usually we use the `.search` attribute of a compiled regex object, but you can use some other @@ -866,12 +918,61 @@ function that behaves the same way. If you're using a statistical model, writing to the `nlp.Defaults` or `English.Defaults` directly won't work, since the regular expressions are read -from the model and will be compiled when you load it. You'll only see the effect -if you call [`spacy.blank`](/api/top-level#spacy.blank) or -`Defaults.create_tokenizer()`. +from the model and will be compiled when you load it. If you modify +`nlp.Defaults`, you'll only see the effect if you call +[`spacy.blank`](/api/top-level#spacy.blank) or `Defaults.create_tokenizer()`. If +you want to modify the tokenizer loaded from a statistical model, you should +modify `nlp.tokenizer` directly. +The prefix, infix and suffix rule sets include not only individual characters +but also detailed regular expressions that take the surrounding context into +account. For example, there is a regular expression that treats a hyphen between +letters as an infix. 
If you do not want the tokenizer to split on hyphens +between letters, you can modify the existing infix definition from +[`lang/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/punctuation.py): + +```python +### {executable="true"} +import spacy +from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER +from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS +from spacy.util import compile_infix_regex + +# default tokenizer +nlp = spacy.load("en_core_web_sm") +doc = nlp("mother-in-law") +print([t.text for t in doc]) # ['mother', '-', 'in', '-', 'law'] + +# modify tokenizer infix patterns +infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\\-\\*^](?=[0-9-])", + r"(?<=[{al}{q}])\\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + # EDIT: commented out regex that splits on hyphens between letters: + #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + +infix_re = compile_infix_regex(infixes) +nlp.tokenizer.infix_finditer = infix_re.finditer +doc = nlp("mother-in-law") +print([t.text for t in doc]) # ['mother-in-law'] +``` + +For an overview of the default regular expressions, see +[`lang/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/punctuation.py) +and language-specific definitions such as +[`lang/de/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/de/punctuation.py) +for German. + ### Hooking an arbitrary tokenizer into the pipeline {#custom-tokenizer} The tokenizer is the first component of the processing pipeline and the only one @@ -999,10 +1100,10 @@ can sometimes tokenize things differently – for example, `"I'm"` → In situations like that, you often want to align the tokenization so that you can merge annotations from different sources together, or take vectors predicted by a -[pretrained BERT model](https://github.com/huggingface/pytorch-transformers) -and apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align) -helper returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the -number of misaligned tokens, the one-to-one mappings of token indices in both +[pretrained BERT model](https://github.com/huggingface/pytorch-transformers) and +apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align) helper +returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the number +of misaligned tokens, the one-to-one mappings of token indices in both directions and the indices where multiple tokens align to one single token. > #### ✏️ Things to try diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 663ac5e5a..3aa2e417c 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -638,7 +638,7 @@ punctuation – depending on the The `IS_DIGIT` flag is not very helpful here, because it doesn't tell us anything about the length. 
However, you can use the `SHAPE` flag, with each `d` -representing a digit: +representing a digit (up to 4 digits / characters): ```python [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "dddd"}, @@ -654,7 +654,7 @@ match the most common formats of ```python [{"ORTH": "+"}, {"ORTH": "49"}, {"ORTH": "(", "OP": "?"}, {"SHAPE": "dddd"}, - {"ORTH": ")", "OP": "?"}, {"SHAPE": "dddddd"}] + {"ORTH": ")", "OP": "?"}, {"SHAPE": "dddd", "LENGTH": 6}] ``` Depending on the formats your application needs to match, creating an extensive @@ -986,37 +986,6 @@ doc = nlp("Apple is opening its first big office in San Francisco.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` -### Adding IDs to patterns {#entityruler-ent-ids new="2.2.2"} - -The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each -pattern. Using the `id` attribute allows multiple patterns to be associated with -the same entity. - -```python -### {executable="true"} -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - -nlp = English() -ruler = EntityRuler(nlp) -patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}, - {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"}, - {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}] -ruler.add_patterns(patterns) -nlp.add_pipe(ruler) - -doc1 = nlp("Apple is opening its first big office in San Francisco.") -print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents]) - -doc2 = nlp("Apple is opening its first big office in San Fran.") -print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents]) -``` - -If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) -patterns, the `ent_id_` property of the matched entity is set to the `id` given -in the patterns. So in the example above it's easy to identify that "San -Francisco" and "San Fran" are both the same entity. - The entity ruler is designed to integrate with spaCy's existing statistical models and enhance the named entity recognizer. If it's added **before the `"ner"` component**, the entity recognizer will respect the existing entity @@ -1051,6 +1020,37 @@ The `EntityRuler` can validate patterns against a JSON schema with the option ruler = EntityRuler(nlp, validate=True) ``` +### Adding IDs to patterns {#entityruler-ent-ids new="2.2.2"} + +The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each +pattern. Using the `id` attribute allows multiple patterns to be associated with +the same entity. + +```python +### {executable="true"} +from spacy.lang.en import English +from spacy.pipeline import EntityRuler + +nlp = English() +ruler = EntityRuler(nlp) +patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}, + {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"}, + {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}] +ruler.add_patterns(patterns) +nlp.add_pipe(ruler) + +doc1 = nlp("Apple is opening its first big office in San Francisco.") +print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents]) + +doc2 = nlp("Apple is opening its first big office in San Fran.") +print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents]) +``` + +If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) +patterns, the `ent_id_` property of the matched entity is set to the `id` given +in the patterns. 
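As a sketch of what the snippet above produces (an editorial addition, not part of the diff; it assumes the pipeline contains only the `EntityRuler`, exactly as in the example):

```python
# Continuing doc1 / doc2 from the EntityRuler example above:
assert [(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents] == [
    ("Apple", "ORG", "apple"),
    ("San Francisco", "GPE", "san-francisco"),
]
assert [(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents] == [
    ("Apple", "ORG", "apple"),
    ("San Fran", "GPE", "san-francisco"),
]
```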
So in the example above it's easy to identify that "San +Francisco" and "San Fran" are both the same entity. + ### Using pattern files {#entityruler-files} The [`to_disk`](/api/entityruler#to_disk) and diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index da56f2397..5a3a95a53 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -714,8 +714,8 @@ print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector) ``` For the best results, you should run this example using the -[`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model (currently not -available in the live demo). +[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) model (currently +not available in the live demo). diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-similarity.md index 53648f66e..0bb79779e 100644 --- a/website/docs/usage/vectors-similarity.md +++ b/website/docs/usage/vectors-similarity.md @@ -95,8 +95,9 @@ pruning the vectors will be taken care of automatically if you set the `--prune-vectors` flag. You can also do it manually in the following steps: 1. Start with a **word vectors model** that covers a huge vocabulary. For - instance, the [`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model - provides 300-dimensional GloVe vectors for over 1 million terms of English. + instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) + model provides 300-dimensional GloVe vectors for over 1 million terms of + English. 2. If your vocabulary has values set for the `Lexeme.prob` attribute, the lexemes will be sorted by descending probability to determine which vectors to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`. @@ -203,7 +204,7 @@ nlp.vocab.vectors.from_glove("/path/to/vectors") If your instance of `Language` already contains vectors, they will be overwritten. To create your own GloVe vectors model package like spaCy's -[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), you can call +[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call [`nlp.to_disk`](/api/language#to_disk), and then package the model using the [`package`](/api/cli#package) command. @@ -219,7 +220,7 @@ tokens. You can customize these behaviors by modifying the `doc.user_hooks`, For more details on **adding hooks** and **overwriting** the built-in `Doc`, `Span` and `Token` methods, see the usage guide on -[user hooks](/usage/processing-pipelines#user-hooks). +[user hooks](/usage/processing-pipelines#custom-components-user-hooks). diff --git a/website/gatsby-node.js b/website/gatsby-node.js index 4aaf5f45e..fe9f22888 100644 --- a/website/gatsby-node.js +++ b/website/gatsby-node.js @@ -33,6 +33,7 @@ exports.createPages = ({ graphql, actions }) => { code name models + starters example has_examples } @@ -210,6 +211,8 @@ exports.createPages = ({ graphql, actions }) => { const langs = result.data.site.siteMetadata.languages const modelLangs = langs.filter(({ models }) => models && models.length) + const starterLangs = langs.filter(({ starters }) => starters && starters.length) + modelLangs.forEach(({ code, name, models, example, has_examples }, i) => { const slug = `/models/${code}` const next = i < modelLangs.length - 1 ? modelLangs[i + 1] : null @@ -229,6 +232,28 @@ exports.createPages = ({ graphql, actions }) => { }, }) }) + + starterLangs.forEach(({ code, name, starters }, i) => { + const slug = `/models/${code}-starters` + const next = i < starterLangs.length - 1 ? 
starterLangs[i + 1] : null + createPage({ + path: slug, + component: DEFAULT_TEMPLATE, + context: { + id: `${code}-starters`, + slug: slug, + isIndex: false, + title: name, + section: 'models', + sectionTitle: sections.models.title, + theme: sections.models.theme, + next: next + ? { title: next.name, slug: `/models/${next.code}-starters` } + : null, + meta: { models: starters, isStarters: true }, + }, + }) + }) }) ) }) diff --git a/website/meta/languages.json b/website/meta/languages.json index dbb300fbf..c22ddad69 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -3,10 +3,8 @@ { "code": "en", "name": "English", - "models": [ - "en_core_web_sm", - "en_core_web_md", - "en_core_web_lg", + "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"], + "starters": [ "en_vectors_web_lg", "en_trf_bertbaseuncased_lg", "en_trf_robertabase_lg", @@ -19,7 +17,8 @@ { "code": "de", "name": "German", - "models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"], + "models": ["de_core_news_sm", "de_core_news_md"], + "starters": ["de_trf_bertbasecased_lg"], "example": "Dies ist ein Satz.", "has_examples": true }, @@ -155,7 +154,8 @@ "name": "Japanese", "dependencies": [ { "name": "Unidic", "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" }, - { "name": "Mecab", "url": "https://github.com/taku910/mecab" } + { "name": "Mecab", "url": "https://github.com/taku910/mecab" }, + { "name": "fugashi", "url": "https://github.com/polm/fugashi" } ], "example": "これは文章です。", "has_examples": true diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 68d46605f..3fafc52b0 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -41,7 +41,11 @@ "items": [{ "text": "Overview", "url": "/models" }] }, { - "label": "Language Models", + "label": "Core Models", + "items": [] + }, + { + "label": "Starter Models", "items": [] } ] diff --git a/website/meta/universe.json b/website/meta/universe.json index 40ebfaaa7..67da8c828 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1261,6 +1261,21 @@ }, "category": ["podcasts"] }, + { + "type": "education", + "id": "practical-ai-podcast", + "title": "Practical AI: Modern NLP with spaCy", + "slogan": "December 2019", + "description": "\"SpaCy is awesome for NLP! It’s easy to use, has widespread adoption, is open source, and integrates the latest language models. Ines Montani and Matthew Honnibal (core developers of spaCy and co-founders of Explosion) join us to discuss the history of the project, its capabilities, and the latest trends in NLP. We also dig into the practicalities of taking NLP workflows to production. 
You don’t want to miss this episode!\"", + "thumb": "https://i.imgur.com/jn8Bcdw.png", + "url": "https://changelog.com/practicalai/68", + "author": "Daniel Whitenack & Chris Benson", + "author_links": { + "website": "https://changelog.com/practicalai", + "twitter": "https://twitter.com/PracticalAIFM" + }, + "category": ["podcasts"] + }, { "id": "adam_qas", "title": "ADAM: Question Answering System", @@ -1679,13 +1694,14 @@ "slogan": "Information extraction from English and German texts based on predicate logic", "github": "msg-systems/holmes-extractor", "url": "https://github.com/msg-systems/holmes-extractor", - "description": "Holmes is a Python 3 library that supports a number of use cases involving information extraction from English and German texts, including chatbot, structural search, topic matching and supervised document classification.", + "description": "Holmes is a Python 3 library that supports a number of use cases involving information extraction from English and German texts, including chatbot, structural extraction, topic matching and supervised document classification. There is a [website demonstrating intelligent search based on topic matching](https://holmes-demo.xt.msg.team).", "pip": "holmes-extractor", "category": ["conversational", "standalone"], "tags": ["chatbots", "text-processing"], + "thumb": "https://raw.githubusercontent.com/msg-systems/holmes-extractor/master/docs/holmes_thumbnail.png", "code_example": [ "import holmes_extractor as holmes", - "holmes_manager = holmes.Manager(model='en_coref_lg')", + "holmes_manager = holmes.Manager(model='en_core_web_lg')", "holmes_manager.register_search_phrase('A big dog chases a cat')", "holmes_manager.start_chatbot_mode_console()" ], diff --git a/website/src/templates/docs.js b/website/src/templates/docs.js index 130506264..840dcbf1f 100644 --- a/website/src/templates/docs.js +++ b/website/src/templates/docs.js @@ -50,6 +50,17 @@ const Docs = ({ pageContext, children }) => ( id: model, })), })) + sidebar.items[2].items = languages + .filter(({ starters }) => starters && starters.length) + .map(lang => ({ + text: lang.name, + url: `/models/${lang.code}-starters`, + isActive: id === `${lang.code}-starters`, + menu: lang.starters.map(model => ({ + text: model, + id: model, + })), + })) } const sourcePath = source ? github(source) : null const currentSource = getCurrentSource(slug, isIndex) @@ -133,6 +144,7 @@ const query = graphql` code name models + starters } sidebars { section diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 3ac5e6ebf..845fec65d 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -331,7 +331,7 @@ const Models = ({ pageContext, repo, children }) => { const [initialized, setInitialized] = useState(false) const [compatibility, setCompatibility] = useState({}) const { id, title, meta } = pageContext - const { models } = meta + const { models, isStarters } = meta const baseUrl = `https://raw.githubusercontent.com/${repo}/master` useEffect(() => { @@ -345,9 +345,27 @@ const Models = ({ pageContext, repo, children }) => { } }, [initialized, baseUrl]) + const modelTitle = title + const modelTeaser = `Available pretrained statistical models for ${title}` + + const starterTitle = `${title} starters` + const starterTeaser = `Available transfer learning starter packs for ${title}` + return ( <> - + <Title + title={isStarters ? starterTitle : modelTitle} + teaser={isStarters ? 
starterTeaser : modelTeaser} + /> + {isStarters && ( + <Section> + <p> + Starter packs are pretrained weights you can initialize your models with to + achieve better accuracy. They can include word vectors (which will be used + as features during training) or other pretrained representations like BERT. + </p> + </Section> + )} <StaticQuery query={query} render={({ site }) => @@ -360,7 +378,6 @@ const Models = ({ pageContext, repo, children }) => { compatibility={compatibility} baseUrl={baseUrl} repo={repo} - hasExamples={meta.hasExamples} licenses={arrayToObj(site.siteMetadata.licenses, 'id')} /> )) diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 2b7dc10c1..2dc5d40dc 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -56,7 +56,11 @@ function getCounts(langs = []) { return { langs: langs.length, modelLangs: langs.filter(({ models }) => models && !!models.length).length, + starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length, models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0), + starters: langs + .map(({ starters }) => (starters ? starters.length : 0)) + .reduce((a, b) => a + b, 0), } } @@ -270,6 +274,7 @@ const landingQuery = graphql` repo languages { models + starters } logosUsers { id
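
The blurb added to `models.js` above describes starter packs such as `en_vectors_web_lg` as pretrained weights (word vectors or representations like BERT) that you initialize a model with. As a minimal sketch of how such a starter is typically used, assuming spaCy v2.x and that the `en_vectors_web_lg` package has been installed separately (the package itself, and the exact download/train commands shown in the comments, are not part of this diff):

```python
import spacy

# Assumes spaCy v2.x and that the starter package was installed beforehand,
# e.g. via:  python -m spacy download en_vectors_web_lg
# The starter ships word vectors only – no tagger, parser or NER components.
nlp = spacy.load("en_vectors_web_lg")

doc = nlp("San Francisco and San Fran")
print(doc.vector.shape)         # (300,) – 300-dimensional GloVe vectors
print(nlp.vocab.vectors.shape)  # (n_keys, 300) – over 1 million terms per the docs

# During training, the vectors can be used as features by pointing the v2
# train CLI at the starter package (illustrative invocation):
#   python -m spacy train en /output train.json dev.json --vectors en_vectors_web_lg
```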