diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index c915d48bf..f34603065 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,8 +87,8 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [x] I am signing on behalf of myself as an individual and no other person - or entity, including my employer, has or will have rights with respect my + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my contributions. * [ ] I am signing on behalf of my employer or a legal entity and I have the @@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Shuvanon Razik | +| Name | | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | 3/12/2017 | -| GitHub username | shuvanon | +| Date | | +| GitHub username | | | Website (optional) | | diff --git a/.github/contributors/honnibal.md b/.github/contributors/honnibal.md new file mode 100644 index 000000000..3a700b7dd --- /dev/null +++ b/.github/contributors/honnibal.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matthew Honnibal | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017-10-18 | +| GitHub username | honnibal | +| Website (optional) | https://explosion.ai | diff --git a/.github/contributors/ines.md b/.github/contributors/ines.md new file mode 100644 index 000000000..5cd57b07e --- /dev/null +++ b/.github/contributors/ines.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ines Montani | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017/10/18 | +| GitHub username | ines | +| Website (optional) | https://explosion.ai | diff --git a/.github/contributors/mdcclv.md b/.github/contributors/mdcclv.md new file mode 100644 index 000000000..14ebfae26 --- /dev/null +++ b/.github/contributors/mdcclv.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------------------- | +| Name | Orion Montoya | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 04-10-2017 | +| GitHub username | mdcclv | +| Website (optional) | http://www.mdcclv.com/ | diff --git a/.github/contributors/polm.md b/.github/contributors/polm.md new file mode 100644 index 000000000..a2aa0cb65 --- /dev/null +++ b/.github/contributors/polm.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Paul McCann | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-14 | +| GitHub username | polm | +| Website (optional) | http://dampfkraft.com| diff --git a/.github/contributors/shuvanon.md b/.github/contributors/shuvanon.md new file mode 100644 index 000000000..82d02d8d2 --- /dev/null +++ b/.github/contributors/shuvanon.md @@ -0,0 +1,108 @@ + + +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Shuvanon Razik | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 3/12/2017 | +| GitHub username | shuvanon | +| Website (optional) | | diff --git a/.github/contributors/yuukos.md b/.github/contributors/yuukos.md new file mode 100644 index 000000000..aecafeecb --- /dev/null +++ b/.github/contributors/yuukos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alexey Kim | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 13-12-2017 | +| GitHub username | yuukos | +| Website (optional) | | diff --git a/.gitignore b/.gitignore index 2209f5b4a..ecd8ed39f 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ Profile.prof .python-version __pycache__/ *.py[cod] +.env*/ .env/ .env2/ .env3/ @@ -101,3 +102,7 @@ Desktop.ini # Other *.tgz + + +# JetBrains PyCharm +.idea/ \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8a9ab517b..7cc47296c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,7 +70,7 @@ The [spaCy developer resources](https://github.com/explosion/spacy-dev-resources ### Contributor agreement -If you've made a substantial contribution to spaCy, you should fill in the [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that your contribution can be used across the project. If you agree to be bound by the terms of the agreement, fill in the [template]((.github/CONTRIBUTOR_AGREEMENT.md)) and include it with your pull request, or sumit it separately to [`.github/contributors/`](/.github/contributors). The name of the file should be your GitHub username, with the extension `.md`. For example, the user +If you've made a substantial contribution to spaCy, you should fill in the [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that your contribution can be used across the project. If you agree to be bound by the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md) and include it with your pull request, or sumit it separately to [`.github/contributors/`](/.github/contributors). The name of the file should be your GitHub username, with the extension `.md`. For example, the user example_user would create the file `.github/contributors/example_user.md`. diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ea6096a52..edd1ed30d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,8 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! 
* Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexey Kim, [@yuukos](https://github.com/yuukos) +* Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) * Aniruddha Adhikary [@aniruddha-adhikary](https://github.com/aniruddha-adhikary) @@ -25,6 +27,9 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Ines Montani, [@ines](https://github.com/ines) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) +* Jim Geovedi, [@geovedi](https://github.com/geovedi) +* Jim Regan, [@jimregan](https://github.com/jimregan) +* Jeffrey Gerard, [@IamJeffG](https://github.com/IamJeffG) * Jordan Suchow, [@suchow](https://github.com/suchow) * Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) @@ -39,6 +44,8 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Michael Wallin, [@wallinm1](https://github.com/wallinm1) * Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) +* Orion Montoya, [@mdcclv](https://github.com/mdcclv) +* Paul O'Leary McCann, [@polm](https://github.com/polm) * Pokey Rule, [@pokey](https://github.com/pokey) * Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202) * Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort) @@ -46,12 +53,18 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Sam Bozek, [@sambozek](https://github.com/sambozek) * Sasho Savkov, [@savkov](https://github.com/savkov) * Shuvanon Razik, [@shuvanon](https://github.com/shuvanon) +* Swier, [@swierh](https://github.com/swierh) * Thomas Tanon, [@Tpt](https://github.com/Tpt) * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues) +* Vimos Tan, [@Vimos](https://github.com/Vimos) * Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov) * Wah Loon Keng, [@kengz](https://github.com/kengz) +* Wannaphong Phatthiyaphaibun, [@wannaphongcom](https://github.com/wannaphongcom) * Willem van Hage, [@wrvhage](https://github.com/wrvhage) * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker) +* Yam, [@hscspring](https://github.com/hscspring) * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) * Yasuaki Uechi, [@uetchy](https://github.com/uetchy) +* Yu-chun Huang, [@galaxyh](https://github.com/galaxyh) * Yubing Dong, [@tomtung](https://github.com/tomtung) +* Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter) diff --git a/README.rst b/README.rst index 0fd807388..83e8fa8db 100644 --- a/README.rst +++ b/README.rst @@ -9,9 +9,9 @@ Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software, released under the MIT license. -⭐️ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes here. `_ +⭐️ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes. `_ -💫 **Version 1.8 out now!** `Read the release notes here. `_ +💫 **Version 1.9 out now!** `Read the release notes here. `_ .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy @@ -63,11 +63,12 @@ MIT license. 
💬 Where to ask questions ========================== +Please understand that we won't be able to provide individual support via email. We also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. + ====================== === **Bug reports** `GitHub issue tracker`_ **Usage questions** `StackOverflow`_, `Gitter chat`_, `Reddit user group`_ **General discussion** `Gitter chat`_, `Reddit user group`_ -**Commercial support** contact@explosion.ai ====================== === .. _GitHub issue tracker: https://github.com/explosion/spaCy/issues @@ -325,6 +326,7 @@ and ``--model`` are optional and enable additional tests: =========== ============== =========== Version Date Description =========== ============== =========== +`v1.9.0`_ ``2017-07-22`` Spanish model, alpha support for Norwegian & Japanese, and bug fixes `v1.8.2`_ ``2017-04-26`` French model and small improvements `v1.8.1`_ ``2017-04-23`` Saving, loading and training bug fixes `v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading @@ -358,6 +360,7 @@ Version Date Description `v0.93`_ ``2015-09-22`` Bug fixes to word vectors =========== ============== =========== +.. _v1.9.0: https://github.com/explosion/spaCy/releases/tag/v1.9.0 .. _v1.8.2: https://github.com/explosion/spaCy/releases/tag/v1.8.2 .. _v1.8.1: https://github.com/explosion/spaCy/releases/tag/v1.8.1 .. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 diff --git a/examples/chainer_sentiment.py b/examples/chainer_sentiment.py deleted file mode 100644 index 747ef508a..000000000 --- a/examples/chainer_sentiment.py +++ /dev/null @@ -1,322 +0,0 @@ -'''WIP --- Doesn't work well yet''' -import plac -import random -import six - -import cProfile -import pstats - -import pathlib -import cPickle as pickle -from itertools import izip - -import spacy - -import cytoolz -import cupy as xp -import cupy.cuda -import chainer.cuda - -import chainer.links as L -import chainer.functions as F -from chainer import Chain, Variable, report -import chainer.training -import chainer.optimizers -from chainer.training import extensions -from chainer.iterators import SerialIterator -from chainer.datasets import TupleDataset - - -class SentimentAnalyser(object): - @classmethod - def load(cls, path, nlp, max_length=100): - raise NotImplementedError - #with (path / 'config.json').open() as file_: - # model = model_from_json(file_.read()) - #with (path / 'model').open('rb') as file_: - # lstm_weights = pickle.load(file_) - #embeddings = get_embeddings(nlp.vocab) - #model.set_weights([embeddings] + lstm_weights) - #return cls(model, max_length=max_length) - - def __init__(self, model, max_length=100): - self._model = model - self.max_length = max_length - - def __call__(self, doc): - X = get_features([doc], self.max_length) - y = self._model.predict(X) - self.set_sentiment(doc, y) - - def pipe(self, docs, batch_size=1000, n_threads=2): - for minibatch in cytoolz.partition_all(batch_size, docs): - minibatch = list(minibatch) - sentences = [] - for doc in minibatch: - sentences.extend(doc.sents) - Xs = get_features(sentences, self.max_length) - ys = self._model.predict(Xs) - for sent, label in zip(sentences, ys): - sent.doc.sentiment += label - 0.5 - for doc in minibatch: - yield doc - - def set_sentiment(self, doc, y): - doc.sentiment = float(y[0]) - # Sentiment has a native slot for a single float. 
- # For arbitrary data storage, there's: - # doc.user_data['my_data'] = y - - -class Classifier(Chain): - def __init__(self, predictor): - super(Classifier, self).__init__(predictor=predictor) - - def __call__(self, x, t): - y = self.predictor(x) - loss = F.softmax_cross_entropy(y, t) - accuracy = F.accuracy(y, t) - report({'loss': loss, 'accuracy': accuracy}, self) - return loss - - -class SentimentModel(Chain): - def __init__(self, nlp, shape, **settings): - Chain.__init__(self, - embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'], - set_vectors=lambda arr: set_vectors(arr, nlp.vocab)), - encode=_Encode(shape['nr_hidden'], shape['nr_hidden']), - attend=_Attend(shape['nr_hidden'], shape['nr_hidden']), - predict=_Predict(shape['nr_hidden'], shape['nr_class'])) - self.to_gpu(0) - - def __call__(self, sentence): - return self.predict( - self.attend( - self.encode( - self.embed(sentence)))) - - -class _Embed(Chain): - def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None): - Chain.__init__(self, - embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors), - project=L.Linear(None, nr_out, nobias=True)) - self.embed.W.volatile = False - - def __call__(self, sentence): - return [self.project(self.embed(ts)) for ts in F.transpose(sentence)] - - -class _Encode(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - fwd=L.LSTM(nr_in, nr_out), - bwd=L.LSTM(nr_in, nr_out), - mix=L.Bilinear(nr_out, nr_out, nr_out)) - - def __call__(self, sentence): - self.fwd.reset_state() - fwds = map(self.fwd, sentence) - self.bwd.reset_state() - bwds = reversed(map(self.bwd, reversed(sentence))) - return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)] - - -class _Attend(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self) - - def __call__(self, sentence): - sent = sum(sentence) - return sent - - -class _Predict(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - l1=L.Linear(nr_in, nr_in), - l2=L.Linear(nr_in, nr_out)) - - def __call__(self, vector): - vector = self.l1(vector) - vector = F.elu(vector) - vector = self.l2(vector) - return vector - - -class SentenceDataset(TupleDataset): - def __init__(self, nlp, texts, labels, max_length): - self.max_length = max_length - sents, labels = self._get_labelled_sentences( - nlp.pipe(texts, batch_size=5000, n_threads=3), - labels) - TupleDataset.__init__(self, - get_features(sents, max_length), - labels) - - def __getitem__(self, index): - batches = [dataset[index] for dataset in self._datasets] - if isinstance(index, slice): - length = len(batches[0]) - returns = [tuple([batch[i] for batch in batches]) - for i in six.moves.range(length)] - return returns - else: - return tuple(batches) - - def _get_labelled_sentences(self, docs, doc_labels): - labels = [] - sentences = [] - for doc, y in izip(docs, doc_labels): - for sent in doc.sents: - sentences.append(sent) - labels.append(y) - return sentences, xp.asarray(labels, dtype='i') - - -class DocDataset(TupleDataset): - def __init__(self, nlp, texts, labels): - self.max_length = max_length - DatasetMixin.__init__(self, - get_features( - nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length), - labels) - -def read_data(data_dir, limit=0): - examples = [] - for subdir, label in (('pos', 1), ('neg', 0)): - for filename in (data_dir / subdir).iterdir(): - with filename.open() as file_: - text = file_.read() - examples.append((text, label)) - random.shuffle(examples) - if limit >= 1: - examples = examples[:limit] - return zip(*examples) # Unzips 
into two lists - - -def get_features(docs, max_length): - docs = list(docs) - Xs = xp.zeros((len(docs), max_length), dtype='i') - for i, doc in enumerate(docs): - j = 0 - for token in doc: - if token.has_vector and not token.is_punct and not token.is_space: - Xs[i, j] = token.norm - j += 1 - if j >= max_length: - break - return Xs - - -def set_vectors(vectors, vocab): - for lex in vocab: - if lex.has_vector and (lex.rank+1) < vectors.shape[0]: - lex.norm = lex.rank+1 - vectors[lex.rank + 1] = lex.vector - else: - lex.norm = 0 - return vectors - - -def train(train_texts, train_labels, dev_texts, dev_labels, - lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5, - by_sentence=True): - nlp = spacy.load('en', entity=False) - if 'nr_vector' not in lstm_shape: - lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector) - if 'nr_dim' not in lstm_shape: - lstm_shape['nr_dim'] = nlp.vocab.vectors_length - print("Make model") - model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings)) - print("Parsing texts...") - if by_sentence: - train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length']) - dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length']) - else: - train_data = DocDataset(nlp, train_texts, train_labels) - dev_data = DocDataset(nlp, dev_texts, dev_labels) - train_iter = SerialIterator(train_data, batch_size=batch_size, - shuffle=True, repeat=True) - dev_iter = SerialIterator(dev_data, batch_size=batch_size, - shuffle=False, repeat=False) - optimizer = chainer.optimizers.Adam() - optimizer.setup(model) - updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0) - trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result') - - trainer.extend(extensions.Evaluator(dev_iter, model, device=0)) - trainer.extend(extensions.LogReport()) - trainer.extend(extensions.PrintReport([ - 'epoch', 'main/accuracy', 'validation/main/accuracy'])) - trainer.extend(extensions.ProgressBar()) - - trainer.run() - - -def evaluate(model_dir, texts, labels, max_length=100): - def create_pipeline(nlp): - ''' - This could be a lambda, but named functions are easier to read in Python. 
- ''' - return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp, - max_length=max_length)] - - nlp = spacy.load('en') - nlp.pipeline = create_pipeline(nlp) - - correct = 0 - i = 0 - for doc in nlp.pipe(texts, batch_size=1000, n_threads=4): - correct += bool(doc.sentiment >= 0.5) == bool(labels[i]) - i += 1 - return float(correct) / i - - -@plac.annotations( - train_dir=("Location of training file or directory"), - dev_dir=("Location of development file or directory"), - model_dir=("Location of output model directory",), - is_runtime=("Demonstrate run-time usage", "flag", "r", bool), - nr_hidden=("Number of hidden units", "option", "H", int), - max_length=("Maximum sentence length", "option", "L", int), - dropout=("Dropout", "option", "d", float), - learn_rate=("Learn rate", "option", "e", float), - nb_epoch=("Number of training epochs", "option", "i", int), - batch_size=("Size of minibatches for training LSTM", "option", "b", int), - nr_examples=("Limit to N examples", "option", "n", int) -) -def main(model_dir, train_dir, dev_dir, - is_runtime=False, - nr_hidden=64, max_length=100, # Shape - dropout=0.5, learn_rate=0.001, # General NN config - nb_epoch=5, batch_size=32, nr_examples=-1): # Training params - model_dir = pathlib.Path(model_dir) - train_dir = pathlib.Path(train_dir) - dev_dir = pathlib.Path(dev_dir) - if is_runtime: - dev_texts, dev_labels = read_data(dev_dir) - acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length) - print(acc) - else: - print("Read data") - train_texts, train_labels = read_data(train_dir, limit=nr_examples) - dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples) - print("Using GPU 0") - #chainer.cuda.get_device(0).use() - train_labels = xp.asarray(train_labels, dtype='i') - dev_labels = xp.asarray(dev_labels, dtype='i') - lstm = train(train_texts, train_labels, dev_texts, dev_labels, - {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2, - 'nr_vector': 5000}, - {'dropout': 0.5, 'lr': learn_rate}, - {}, - nb_epoch=nb_epoch, batch_size=batch_size) - - -if __name__ == '__main__': - #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - #s = pstats.Stats("Profile.prof") - #s.strip_dirs().sort_stats("time").print_stats() - plac.call(main) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4eae11c75..666a3bad4 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -24,8 +24,8 @@ For more details, see the documentation: * Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner * Saving and loading models: https://spacy.io/docs/usage/saving-loading -Developed for: spaCy 1.7.6 -Last tested for: spaCy 1.7.6 +Developed for: spaCy 1.9.0 +Last tested for: spaCy 1.9.0 """ from __future__ import unicode_literals, print_function @@ -52,6 +52,7 @@ def train_ner(nlp, train_data, output_dir): random.shuffle(train_data) loss = 0. 
for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) gold = GoldParse(doc, entities=entity_offsets) # By default, the GoldParse class assumes that the entities # described by offset are complete, and all other words should @@ -63,7 +64,6 @@ def train_ner(nlp, train_data, output_dir): #for i in range(len(gold.ner)): #if not gold.ner[i].endswith('ANIMAL'): # gold.ner[i] = '-' - doc = nlp.make_doc(raw_text) nlp.tagger(doc) # As of 1.9, spaCy's parser now lets you supply a dropout probability # This might help the model generalize better from only a few diff --git a/examples/training/train_tagger_standalone_ud.py b/examples/training/train_tagger_standalone_ud.py new file mode 100644 index 000000000..ce1ab50d6 --- /dev/null +++ b/examples/training/train_tagger_standalone_ud.py @@ -0,0 +1,164 @@ +''' +This example shows training of the POS tagger without the Language class, +showing the APIs of the atomic components. + +This example was adapted from the gist here: + +https://gist.github.com/kamac/a7bc139f62488839a8118214a4d932f2 + +Issue discussing the gist: + +https://github.com/explosion/spaCy/issues/1179 + +The example was written for spaCy 1.8.2. +''' +from __future__ import unicode_literals +from __future__ import print_function + +import plac +import codecs +import spacy.symbols as symbols +import spacy +from pathlib import Path + +from spacy.vocab import Vocab +from spacy.tagger import Tagger +from spacy.tokens import Doc +from spacy.gold import GoldParse +from spacy.language import Language +from spacy import orth +from spacy import attrs + +import random + +TAG_MAP = { + 'ADJ': {symbols.POS: symbols.ADJ}, + 'ADP': {symbols.POS: symbols.ADP}, + 'PUNCT': {symbols.POS: symbols.PUNCT}, + 'ADV': {symbols.POS: symbols.ADV}, + 'AUX': {symbols.POS: symbols.AUX}, + 'SYM': {symbols.POS: symbols.SYM}, + 'INTJ': {symbols.POS: symbols.INTJ}, + 'CCONJ': {symbols.POS: symbols.CCONJ}, + 'X': {symbols.POS: symbols.X}, + 'NOUN': {symbols.POS: symbols.NOUN}, + 'DET': {symbols.POS: symbols.DET}, + 'PROPN': {symbols.POS: symbols.PROPN}, + 'NUM': {symbols.POS: symbols.NUM}, + 'VERB': {symbols.POS: symbols.VERB}, + 'PART': {symbols.POS: symbols.PART}, + 'PRON': {symbols.POS: symbols.PRON}, + 'SCONJ': {symbols.POS: symbols.SCONJ}, +} + +LEX_ATTR_GETTERS = { + attrs.LOWER: lambda string: string.lower(), + attrs.NORM: lambda string: string, + attrs.SHAPE: orth.word_shape, + attrs.PREFIX: lambda string: string[0], + attrs.SUFFIX: lambda string: string[-3:], + attrs.CLUSTER: lambda string: 0, + attrs.IS_ALPHA: orth.is_alpha, + attrs.IS_ASCII: orth.is_ascii, + attrs.IS_DIGIT: lambda string: string.isdigit(), + attrs.IS_LOWER: orth.is_lower, + attrs.IS_PUNCT: orth.is_punct, + attrs.IS_SPACE: lambda string: string.isspace(), + attrs.IS_TITLE: orth.is_title, + attrs.IS_UPPER: orth.is_upper, + attrs.IS_BRACKET: orth.is_bracket, + attrs.IS_QUOTE: orth.is_quote, + attrs.IS_LEFT_PUNCT: orth.is_left_punct, + attrs.IS_RIGHT_PUNCT: orth.is_right_punct, + attrs.LIKE_URL: orth.like_url, + attrs.LIKE_NUM: orth.like_number, + attrs.LIKE_EMAIL: orth.like_email, + attrs.IS_STOP: lambda string: False, + attrs.IS_OOV: lambda string: True +} + + +def read_ud_data(path): + data = [] + last_number = -1 + sentence_words = [] + sentence_tags = [] + with codecs.open(path, encoding="utf-8") as f: + while True: + line = f.readline() + if not line: + break + + if line[0].isdigit(): + d = line.split() + if not "-" in d[0]: + number = int(line[0]) + if number < last_number: + data.append((sentence_words, sentence_tags),) 
+ sentence_words = [] + sentence_tags = [] + sentence_words.append(d[2]) + sentence_tags.append(d[3]) + last_number = number + if len(sentence_words) > 0: + data.append((sentence_words, sentence_tags,)) + return data + +def ensure_dir(path): + if not path.exists(): + path.mkdir() + + +def main(train_loc, dev_loc, output_dir=None): + if output_dir is not None: + output_dir = Path(output_dir) + ensure_dir(output_dir) + ensure_dir(output_dir / "pos") + ensure_dir(output_dir / "vocab") + + train_data = read_ud_data(train_loc) + vocab = Vocab(tag_map=TAG_MAP, lex_attr_getters=LEX_ATTR_GETTERS) + # Populate vocab + for words, _ in train_data: + for word in words: + _ = vocab[word] + + model = spacy.tagger.TaggerModel(spacy.tagger.Tagger.feature_templates) + tagger = Tagger(vocab, model) + print(tagger.tag_names) + for i in range(30): + print("training model (iteration " + str(i) + ")...") + score = 0. + num_samples = 0. + for words, tags in train_data: + doc = Doc(vocab, words=words) + gold = GoldParse(doc, tags=tags) + cost = tagger.update(doc, gold) + for i, word in enumerate(doc): + num_samples += 1 + if word.tag_ == tags[i]: + score += 1 + print('Train acc', score/num_samples) + random.shuffle(train_data) + tagger.model.end_training() + + score = 0.0 + test_data = read_ud_data(dev_loc) + num_samples = 0 + for words, tags in test_data: + doc = Doc(vocab, words) + tagger(doc) + for i, word in enumerate(doc): + num_samples += 1 + if word.tag_ == tags[i]: + score += 1 + print("score: " + str(score / num_samples * 100.0)) + + if output_dir is not None: + tagger.model.dump(str(output_dir / 'pos' / 'model')) + with (output_dir / 'vocab' / 'strings.json').open('w') as file_: + tagger.vocab.strings.dump(file_) + + +if __name__ == '__main__': + plac.call(main) diff --git a/requirements.txt b/requirements.txt index 8194dee58..9d6f34133 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,10 +7,11 @@ thinc>=6.5.0,<6.6.0 murmurhash>=0.26,<0.27 plac<1.0.0,>=0.9.6 six +html5lib==1.0b8 ujson>=1.35 dill>=0.2,<0.3 -requests>=2.13.0,<3.0.0 -regex==2017.4.5 +requests>=2.11.0,<3.0.0 +regex>=2017.4.1,<2017.12.1 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 pip>=9.0.0,<10.0.0 diff --git a/setup.py b/setup.py index 89aaf8eba..1b127962b 100755 --- a/setup.py +++ b/setup.py @@ -203,7 +203,7 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.4.5', + 'regex>=2017.4.1,<2017.12.1', 'ftfy>=4.4.2,<5.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', diff --git a/spacy/__init__.py b/spacy/__init__.py index 2308ce7e4..1e5faf504 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -5,13 +5,15 @@ from . import util from .deprecated import resolve_model_name from .cli.info import info from .glossary import explain +from .about import __version__ -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja +from . 
import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja,th, ru _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, - fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese) + fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese, + th.Thai, ru.Russian) for _lang in _languages: diff --git a/spacy/about.py b/spacy/about.py index 8c0e0afd3..d34c6f948 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.8.2' +__version__ = '1.9.0' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index d9aa01734..29447314a 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -276,7 +276,10 @@ for verb_data in [ {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, {ORTH: "was", LEMMA: "be"}, - {ORTH: "were", LEMMA: "be"} + {ORTH: "were", LEMMA: "be"}, + {ORTH: "have"}, + {ORTH: "has", LEMMA: "have"}, + {ORTH: "dare"} ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() diff --git a/spacy/fr/stop_words.py b/spacy/fr/stop_words.py index d9b820537..71f124d6c 100644 --- a/spacy/fr/stop_words.py +++ b/spacy/fr/stop_words.py @@ -86,3 +86,28 @@ votre vous vous-mêmes vu vé vôtre vôtres zut """.split()) + + + +# Number words + +NUM_WORDS = set(""" +zero un deux trois quatre cinq six sept huit neuf dix +onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf +vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante +cent mille mil million milliard billion quadrillion quintillion +sextillion septillion octillion nonillion decillion +""".split()) + +# Ordinal words + +ORDINAL_WORDS = set(""" +premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième +onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième +vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième +centième millième millionnième milliardième billionnième quadrillionnième quintillionnième +sextillionnième septillionnième octillionnième nonillionnième decillionnième +""".split()) + + + diff --git a/spacy/glossary.py b/spacy/glossary.py index 4df5264a6..ed1c22c21 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -60,7 +60,7 @@ GLOSSARY = { 'JJR': 'adjective, comparative', 'JJS': 'adjective, superlative', 'LS': 'list item marker', - 'MD': 'verb, modal auxillary', + 'MD': 'verb, modal auxiliary', 'NIL': 'missing tag', 'NN': 'noun, singular or mass', 'NNP': 'noun, proper singular', @@ -91,7 +91,7 @@ GLOSSARY = { 'NFP': 'superfluous punctuation', 'GW': 'additional word in multi-word expression', 'XX': 'unknown', - 'BES': 'auxillary "be"', + 'BES': 'auxiliary "be"', 'HVS': 'forms of "have"', diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 07e40ada6..26e39a593 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -3,21 +3,122 @@ from __future__ import unicode_literals, print_function from os import path -from ..language import Language +from ..language import Language, BaseDefaults +from ..tokenizer import Tokenizer +from ..tagger import Tagger from ..attrs import 
LANG from ..tokens import Doc from .language_data import * +import re +from collections import namedtuple + +ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) + +DETAILS_KEY = 'mecab_details' + +def try_mecab_import(): + """Mecab is required for Japanese support, so check for it. + + It it's not available blow up and explain how to fix it.""" + try: + import MeCab + return MeCab + except ImportError: + raise ImportError("Japanese support requires MeCab: " + "https://github.com/SamuraiT/mecab-python3") + +class JapaneseTokenizer(object): + def __init__(self, cls, nlp=None): + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) + MeCab = try_mecab_import() + self.tokenizer = MeCab.Tagger() + + def __call__(self, text): + dtokens = detailed_tokens(self.tokenizer, text) + words = [x.surface for x in dtokens] + doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + # stash details tokens for tagger to use + doc.user_data[DETAILS_KEY] = dtokens + return doc + +def resolve_pos(token): + """If necessary, add a field to the POS tag for UD mapping. + + Under Universal Dependencies, sometimes the same Unidic POS tag can + be mapped differently depending on the literal token or its context + in the sentence. This function adds information to the POS tag to + resolve ambiguous mappings. + """ + + # NOTE: This is a first take. The rules here are crude approximations. + # For many of these, full dependencies are needed to properly resolve + # PoS mappings. + + if token.part_of_speech == '連体詞,*,*,*': + if re.match('^[こそあど此其彼]の', token.surface): + return token.part_of_speech + ',DET' + if re.match('^[こそあど此其彼]', token.surface): + return token.part_of_speech + ',PRON' + else: + return token.part_of_speech + ',ADJ' + return token.part_of_speech + +def detailed_tokens(tokenizer, text): + """Format Mecab output into a nice data structure, based on Janome.""" + + node = tokenizer.parseToNode(text) + node = node.next # first node is beginning of sentence and empty, skip it + words = [] + while node.posid != 0: + surface = node.surface + base = surface + parts = node.feature.split(',') + pos = ','.join(parts[0:4]) + + if len(parts) > 6: + # this information is only available for words in the tokenizer dictionary + reading = parts[6] + base = parts[7] + + words.append( ShortUnitWord(surface, base, pos) ) + node = node.next + return words + +class JapaneseTagger(object): + def __init__(self, vocab): + MeCab = try_mecab_import() + self.tagger = Tagger(vocab) + self.tokenizer = MeCab.Tagger() + + def __call__(self, tokens): + # two parts to this: + # 1. get raw JP tags + # 2. 
add features to tags as necessary for UD + + dtokens = tokens.user_data[DETAILS_KEY] + rawtags = list(map(resolve_pos, dtokens)) + self.tagger.tag_from_strings(tokens, rawtags) + +class JapaneseDefaults(BaseDefaults): + tag_map = TAG_MAP + + @classmethod + def create_tokenizer(cls, nlp=None): + return JapaneseTokenizer(cls, nlp) + + @classmethod + def create_tagger(cls, tokenizer): + return JapaneseTagger(tokenizer.vocab) class Japanese(Language): lang = 'ja' + Defaults = JapaneseDefaults + def make_doc(self, text): - try: - from janome.tokenizer import Tokenizer - except ImportError: - raise ImportError("The Japanese tokenizer requires the Janome library: " - "https://github.com/mocobeta/janome") - words = [x.surface for x in Tokenizer().tokenize(text)] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) + jdoc = self.tokenizer(text) + tagger = JapaneseDefaults.create_tagger(self.tokenizer) + tagger(jdoc) + return jdoc diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index f5b6b5040..191865ed2 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -3,22 +3,86 @@ from __future__ import unicode_literals from ..symbols import * - TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB} + # Explanation of Unidic tags: + # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf + + # Universal Dependencies Mapping: + # http://universaldependencies.org/ja/overview/morphology.html + # http://universaldependencies.org/ja/pos/all.html + + "記号,一般,*,*":{POS: PUNCT}, # this includes characters used to represent sounds like ドレミ + "記号,文字,*,*":{POS: PUNCT}, # this is for Greek and Latin characters used as sumbols, as in math + + "感動詞,フィラー,*,*": {POS: INTJ}, + "感動詞,一般,*,*": {POS: INTJ}, + + # this is specifically for unicode full-width space + "空白,*,*,*": {POS: X}, + + "形状詞,一般,*,*":{POS: ADJ}, + "形状詞,タリ,*,*":{POS: ADJ}, + "形状詞,助動詞語幹,*,*":{POS: ADJ}, + "形容詞,一般,*,*":{POS: ADJ}, + "形容詞,非自立可能,*,*":{POS: AUX}, # XXX ADJ if alone, AUX otherwise + + "助詞,格助詞,*,*":{POS: ADP}, + "助詞,係助詞,*,*":{POS: ADP}, + "助詞,終助詞,*,*":{POS: PART}, + "助詞,準体助詞,*,*":{POS: SCONJ}, # の as in 走るのが速い + "助詞,接続助詞,*,*":{POS: SCONJ}, # verb ending て + "助詞,副助詞,*,*":{POS: PART}, # ばかり, つつ after a verb + "助動詞,*,*,*":{POS: AUX}, + "接続詞,*,*,*":{POS: SCONJ}, # XXX: might need refinement + + "接頭辞,*,*,*":{POS: NOUN}, + "接尾辞,形状詞的,*,*":{POS: ADJ}, # がち, チック + "接尾辞,形容詞的,*,*":{POS: ADJ}, # -らしい + "接尾辞,動詞的,*,*":{POS: NOUN}, # -じみ + "接尾辞,名詞的,サ変可能,*":{POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* + "接尾辞,名詞的,一般,*":{POS: NOUN}, + "接尾辞,名詞的,助数詞,*":{POS: NOUN}, + "接尾辞,名詞的,副詞可能,*":{POS: NOUN}, # -後, -過ぎ + + "代名詞,*,*,*":{POS: PRON}, + "動詞,一般,*,*":{POS: VERB}, + "動詞,非自立可能,*,*":{POS: VERB}, # XXX VERB if alone, AUX otherwise + "動詞,非自立可能,*,*,AUX":{POS: AUX}, + "動詞,非自立可能,*,*,VERB":{POS: VERB}, + "副詞,*,*,*":{POS: ADV}, + + "補助記号,AA,一般,*":{POS: SYM}, # text art + "補助記号,AA,顔文字,*":{POS: SYM}, # kaomoji + "補助記号,一般,*,*":{POS: SYM}, + "補助記号,括弧開,*,*":{POS: PUNCT}, # open bracket + "補助記号,括弧閉,*,*":{POS: PUNCT}, # close bracket + "補助記号,句点,*,*":{POS: PUNCT}, # period or other EOS marker + "補助記号,読点,*,*":{POS: PUNCT}, # comma + + "名詞,固有名詞,一般,*":{POS: PROPN}, # general proper noun + "名詞,固有名詞,人名,一般":{POS: PROPN}, # person's name + 
"名詞,固有名詞,人名,姓":{POS: PROPN}, # surname + "名詞,固有名詞,人名,名":{POS: PROPN}, # first name + "名詞,固有名詞,地名,一般":{POS: PROPN}, # place name + "名詞,固有名詞,地名,国":{POS: PROPN}, # country name + + "名詞,助動詞語幹,*,*":{POS: AUX}, + "名詞,数詞,*,*":{POS: NUM}, # includes Chinese numerals + + "名詞,普通名詞,サ変可能,*":{POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + "名詞,普通名詞,サ変可能,*,NOUN":{POS: NOUN}, + "名詞,普通名詞,サ変可能,*,VERB":{POS: VERB}, + + "名詞,普通名詞,サ変形状詞可能,*":{POS: NOUN}, # ex: 下手 + "名詞,普通名詞,一般,*":{POS: NOUN}, + "名詞,普通名詞,形状詞可能,*":{POS: NOUN}, # XXX: sometimes ADJ in UDv2 + "名詞,普通名詞,形状詞可能,*,NOUN":{POS: NOUN}, + "名詞,普通名詞,形状詞可能,*,ADJ":{POS: ADJ}, + "名詞,普通名詞,助数詞可能,*":{POS: NOUN}, # counter / unit + "名詞,普通名詞,副詞可能,*":{POS: NOUN}, + + "連体詞,*,*,*":{POS: ADJ}, # XXX this has exceptions based on literal token + "連体詞,*,*,*,ADJ":{POS: ADJ}, + "連体詞,*,*,*,PRON":{POS: PRON}, + "連体詞,*,*,*,DET":{POS: DET}, } diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index f23b15bbc..3b5307496 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -19,22 +19,24 @@ _CURRENCY = r""" _QUOTES = r""" ' '' " ” “ `` ` ‘ ´ ‚ , „ » « +「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 """ _PUNCT = r""" … , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & +。 ? ! , 、 ; : ~ · """ _HYPHENS = r""" -- – — -- --- +- – — -- --- —— ~ """ LIST_ELLIPSES = [ r'\.\.+', - "…" + "… ……" ] diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py index ead6dd1c6..65dab9b0d 100644 --- a/spacy/language_data/tag_map.py +++ b/spacy/language_data/tag_map.py @@ -22,5 +22,6 @@ TAG_MAP = { "CCONJ": {POS: CCONJ}, # U20 "ADJ": {POS: ADJ}, "VERB": {POS: VERB}, - "PART": {POS: PART} + "PART": {POS: PART}, + 'SP': {POS: SPACE} } diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py index b84adb2c4..9d5187d83 100644 --- a/spacy/language_data/tokenizer_exceptions.py +++ b/spacy/language_data/tokenizer_exceptions.py @@ -32,11 +32,11 @@ _URL_PATTERN = ( r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"|" # host name - r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)" + r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)" # domain name - r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*" + r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*" # TLD identifier - r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" + r"(?:\.(?:[a-z]{2,}))" r")" # port number r"(?::\d{2,5})?" 
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d7541c56b..1112bcee3 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -78,15 +78,16 @@ def lemmatize(string, index, exceptions, rules): # forms.append(string) forms.extend(exceptions.get(string, [])) oov_forms = [] - for old, new in rules: - if string.endswith(old): - form = string[:len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) if not forms: forms.extend(oov_forms) if not forms: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 05d8bddc6..dc0440486 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -159,6 +159,10 @@ cdef class Lexeme: def __get__(self): return self.c.id + property lex_id: + def __get__(self): + return self.c.id + property repvec: def __get__(self): raise AttributeError("lex.repvec has been renamed to lex.vector") @@ -173,6 +177,11 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.orth] + property text: + def __get__(self): + return self.vocab.strings[self.c.orth] + + property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py index f9899d8d1..b3ca1aef9 100644 --- a/spacy/nl/language_data.py +++ b/spacy/nl/language_data.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .. import language_data as base from ..language_data import update_exc, strings_to_exc -from .stop_words import STOP_WORDS +from .word_sets import STOP_WORDS, NUM_WORDS STOP_WORDS = set(STOP_WORDS) diff --git a/spacy/nl/stop_words.py b/spacy/nl/word_sets.py similarity index 52% rename from spacy/nl/stop_words.py rename to spacy/nl/word_sets.py index 22f1d714c..d19515262 100644 --- a/spacy/nl/stop_words.py +++ b/spacy/nl/word_sets.py @@ -41,3 +41,22 @@ want waren was wat we wel werd wezen wie wij wil worden zal ze zei zelf zich zij zijn zo zonder zou """.split()) + + +# Number words + +NUM_WORDS = set(""" +nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien +veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd +duizend miljoen miljard biljoen biljard triljoen triljard +""".split()) + + +# Ordinal words + +ORDINAL_WORDS = set(""" +eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde +twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste +zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste +miljardste biljoenste biljardste triljoenste triljardste +""".split()) diff --git a/spacy/ru/__init__.py b/spacy/ru/__init__.py new file mode 100644 index 000000000..8789cd6e5 --- /dev/null +++ b/spacy/ru/__init__.py @@ -0,0 +1,78 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from ..language import Language +from ..attrs import LANG +from ..tokens import Doc +from .language_data import * + + +class RussianTokenizer(object): + _morph = None + + def __init__(self, spacy_tokenizer, cls, nlp=None): + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian tokenizer requires the pymorphy2 library: " + "try to fix it with " + "pip install pymorphy2==0.8") + + 
RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer) + + self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) + self._spacy_tokenizer = spacy_tokenizer + + def __call__(self, text): + get_norm = RussianTokenizer._get_norm + has_space = RussianTokenizer._has_space + + words_with_space_flags = [(get_norm(token), has_space(token, text)) + for token in self._spacy_tokenizer(text)] + + words, spaces = map(lambda s: list(s), zip(*words_with_space_flags)) + + return Doc(self.vocab, words, spaces) + + @staticmethod + def _get_word(token): + return token.lemma_ if len(token.lemma_) > 0 else token.text + + @staticmethod + def _has_space(token, text): + pos_after_token = token.idx + len(token.text) + return pos_after_token < len(text) and text[pos_after_token] == ' ' + + @classmethod + def _get_norm(cls, token): + return cls._normalize(cls._get_word(token)) + + @classmethod + def _normalize(cls, word): + return cls._morph.parse(word)[0].normal_form + + @classmethod + def _create_morph(cls, morph_analyzer_class): + if not cls._morph: + cls._morph = morph_analyzer_class() + return cls._morph + + +class RussianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ru' + + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + + @classmethod + def create_tokenizer(cls, nlp=None): + tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp) + return RussianTokenizer(tokenizer, cls, nlp) + + +class Russian(Language): + lang = 'ru' + + Defaults = RussianDefaults diff --git a/spacy/ru/language_data.py b/spacy/ru/language_data.py new file mode 100644 index 000000000..d33d388fd --- /dev/null +++ b/spacy/ru/language_data.py @@ -0,0 +1,18 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from .. 
import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) + + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) + + +__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"] diff --git a/spacy/ru/stop_words.py b/spacy/ru/stop_words.py new file mode 100644 index 000000000..2d89b7726 --- /dev/null +++ b/spacy/ru/stop_words.py @@ -0,0 +1,54 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +а + +будем будет будете будешь буду будут будучи будь будьте бы был была были было +быть + +в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею +всея всю вся вы + +да для до + +его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею + +же + +за + +и из или им ими имъ их + +к как кем ко когда кого ком кому комья которая которого которое которой котором +которому которою которую которые который которым которыми которых кто + +меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего +моей моем моём моему моею можем может можете можешь мои мой моим моими моих +мочь мою моя мы + +на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим +нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но + +о об один одна одни одним одними одних одно одного одной одном одному одною +одну он она оне они оно от + +по при + +с сам сама сами самим самими самих само самого самом самому саму свое своё +своего своей своем своём своему своею свои свой своим своими своих свою своя +себе себя собой собою + +та так такая такие таким такими таких такого такое такой таком такому такою +такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому +тот тою ту ты + +у уже + +чего чем чём чему что чтобы + +эта эти этим этими этих это этого этой этом этому этот этою эту + +я +""".split()) diff --git a/spacy/ru/tokenizer_exceptions.py b/spacy/ru/tokenizer_exceptions.py new file mode 100644 index 000000000..f444f3df6 --- /dev/null +++ b/spacy/ru/tokenizer_exceptions.py @@ -0,0 +1,30 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TOKENIZER_EXCEPTIONS = { + "Пн.": [ + {ORTH: "Пн.", LEMMA: "Понедельник"} + ], + "Вт.": [ + {ORTH: "Вт.", LEMMA: "Вторник"} + ], + "Ср.": [ + {ORTH: "Ср.", LEMMA: "Среда"} + ], + "Чт.": [ + {ORTH: "Чт.", LEMMA: "Четверг"} + ], + "Пт.": [ + {ORTH: "Пт.", LEMMA: "Пятница"} + ], + "Сб.": [ + {ORTH: "Сб.", LEMMA: "Суббота"} + ], + "Вс.": [ + {ORTH: "Вс.", LEMMA: "Воскресенье"} + ], +} + diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index b0d1c78ca..14dba5f9b 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -9,7 +9,7 @@ def english_noun_chunks(obj): Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos', 'attr', 'ROOT'] doc = obj.doc # Ensure works on both Doc and Span. 
np_deps = [doc.vocab.strings[label] for label in labels] @@ -117,4 +117,5 @@ def es_noun_chunks(obj): token = next_token(token) -CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks} +CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks, + None: english_noun_chunks, '': english_noun_chunks} diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index b9de1e114..48edb6d22 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -147,6 +147,9 @@ cdef class Parser: # TODO: remove this shim when we don't have to support older data if 'labels' in cfg and 'actions' not in cfg: cfg['actions'] = cfg.pop('labels') + # Convert string keys to int + if cfg.get('actions'): + cfg['actions'] = {int(action_name): labels for action_name, labels in cfg['actions'].items()} # TODO: remove this shim when we don't have to support older data for action_name, labels in dict(cfg.get('actions', {})).items(): # We need this to be sorted diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b8ada1d9a..de0facf49 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -5,6 +5,7 @@ from ..en import English from ..de import German from ..es import Spanish from ..it import Italian +from ..ja import Japanese from ..fr import French from ..pt import Portuguese from ..nl import Dutch @@ -14,7 +15,8 @@ from ..fi import Finnish from ..bn import Bengali from ..he import Hebrew from ..nb import Norwegian - +from ..th import Thai +from ..ru import Russian from ..tokens import Doc from ..strings import StringStore @@ -26,7 +28,7 @@ from pathlib import Path import os import pytest - +# These languages get run through generic tokenizer tests LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, Swedish, Hungarian, Finnish, Bengali, Norwegian] @@ -51,6 +53,7 @@ def en_vocab(): def en_parser(): return English.Defaults.create_parser() + @pytest.fixture def es_tokenizer(): return Spanish.Defaults.create_tokenizer() @@ -76,6 +79,18 @@ def fi_tokenizer(): return Finnish.Defaults.create_tokenizer() +@pytest.fixture +def ja_tokenizer(): + pytest.importorskip("MeCab") + return Japanese.Defaults.create_tokenizer() + + +@pytest.fixture +def japanese(): + pytest.importorskip("MeCab") + return Japanese() + + @pytest.fixture def sv_tokenizer(): return Swedish.Defaults.create_tokenizer() @@ -90,10 +105,30 @@ def bn_tokenizer(): def he_tokenizer(): return Hebrew.Defaults.create_tokenizer() + @pytest.fixture def nb_tokenizer(): return Norwegian.Defaults.create_tokenizer() + +@pytest.fixture +def th_tokenizer(): + pythainlp = pytest.importorskip("pythainlp") + return Thai.Defaults.create_tokenizer() + + +@pytest.fixture +def ru_tokenizer(): + pytest.importorskip("pymorphy2") + return Russian.Defaults.create_tokenizer() + + +@pytest.fixture +def russian(): + pytest.importorskip("pymorphy2") + return Russian() + + @pytest.fixture def stringstore(): return StringStore() @@ -101,7 +136,7 @@ def stringstore(): @pytest.fixture def en_entityrecognizer(): - return English.Defaults.create_entity() + return English.Defaults.create_entity() @pytest.fixture @@ -113,6 +148,7 @@ def lemmatizer(): def text_file(): return StringIO() + @pytest.fixture def text_file_b(): return BytesIO() @@ -132,11 +168,11 @@ def DE(): def pytest_addoption(parser): parser.addoption("--models", action="store_true", - help="include tests that require full models") + help="include tests that require full models") parser.addoption("--vectors", 
action="store_true", - help="include word vectors tests") + help="include word vectors tests") parser.addoption("--slow", action="store_true", - help="include slow tests") + help="include slow tests") def pytest_runtest_setup(item): diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 1bc534ecd..d1a6316d5 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -216,6 +216,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): doc = en_tokenizer(text) assert doc.has_vector +def test_lowest_common_ancestor(en_tokenizer): + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc.get_lca_matrix() + assert(lca[1, 1] == 1) + assert(lca[0, 1] == 2) + assert(lca[1, 2] == 2) def test_parse_tree(en_tokenizer): """Tests doc.print_tree() method.""" diff --git a/spacy/tests/ja/__init__.py b/spacy/tests/ja/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py new file mode 100644 index 000000000..85f653836 --- /dev/null +++ b/spacy/tests/ja/test_tagger.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +TAGGER_TESTS = [ + ('あれならそこにあるよ', + (('代名詞,*,*,*', 'PRON'), + ('助動詞,*,*,*', 'AUX'), + ('代名詞,*,*,*', 'PRON'), + ('助詞,格助詞,*,*', 'ADP'), + ('動詞,非自立可能,*,*', 'VERB'), + ('助詞,終助詞,*,*', 'PART'))), + ('このファイルには小さなテストが入っているよ', + (('連体詞,*,*,*,DET', 'DET'), + ('名詞,普通名詞,サ変可能,*', 'NOUN'), + ('助詞,格助詞,*,*', 'ADP'), + ('助詞,係助詞,*,*', 'ADP'), + ('連体詞,*,*,*,ADJ', 'ADJ'), + ('名詞,普通名詞,サ変可能,*', 'NOUN'), + ('助詞,格助詞,*,*', 'ADP'), + ('動詞,一般,*,*', 'VERB'), + ('助詞,接続助詞,*,*', 'SCONJ'), + ('動詞,非自立可能,*,*', 'VERB'), + ('助詞,終助詞,*,*', 'PART'))), + ('プププランドに行きたい', + (('名詞,普通名詞,一般,*', 'NOUN'), + ('助詞,格助詞,*,*', 'ADP'), + ('動詞,非自立可能,*,*', 'VERB'), + ('助動詞,*,*,*', 'AUX'))) +] + +@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS) +def test_japanese_tagger(japanese, text, expected_tags): + tokens = japanese.make_doc(text) + assert len(tokens) == len(expected_tags) + for token, res in zip(tokens, expected_tags): + assert token.tag_ == res[0] and token.pos_ == res[1] diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py new file mode 100644 index 000000000..17411aee2 --- /dev/null +++ b/spacy/tests/ja/test_tokenizer.py @@ -0,0 +1,17 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +TOKENIZER_TESTS = [ + ("日本語だよ", ['日本', '語', 'だ', 'よ']), + ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']), + ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']), + ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']), + ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens): + tokens = [token.text for token in ja_tokenizer(text)] + assert tokens == expected_tokens diff --git a/spacy/tests/parser/test_noun_chunks.py b/spacy/tests/parser/test_noun_chunks.py index 5e8c7659a..ddebca8b8 100644 --- a/spacy/tests/parser/test_noun_chunks.py +++ b/spacy/tests/parser/test_noun_chunks.py @@ -47,6 +47,36 @@ def test_parser_noun_chunks_pp_chunks(en_tokenizer): assert chunks[1].text_with_ws == "another phrase " +def test_parser_noun_chunks_appositional_modifiers(en_tokenizer): + text = "Sam, my brother, arrived to the house." 
+ heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4] + tags = ['NNP', ',', 'PRP$', 'NN', ',', 'VBD', 'IN', 'DT', 'NN', '.'] + deps = ['nsubj', 'punct', 'poss', 'appos', 'punct', 'ROOT', 'prep', 'det', 'pobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "Sam " + assert chunks[1].text_with_ws == "my brother " + assert chunks[2].text_with_ws == "the house " + + +def test_parser_noun_chunks_dative(en_tokenizer): + text = "She gave Bob a raise." + heads = [1, 0, -1, 1, -3, -4] + tags = ['PRP', 'VBD', 'NNP', 'DT', 'NN', '.'] + deps = ['nsubj', 'ROOT', 'dative', 'det', 'dobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "She " + assert chunks[1].text_with_ws == "Bob " + assert chunks[2].text_with_ws == "a raise " + + def test_parser_noun_chunks_standard_de(de_tokenizer): text = "Eine Tasse steht auf dem Tisch." heads = [1, 1, 0, -1, 1, -2, -4] diff --git a/spacy/tests/regression/test_issue1031.py b/spacy/tests/regression/test_issue1031.py new file mode 100644 index 000000000..1ac14eb7b --- /dev/null +++ b/spacy/tests/regression/test_issue1031.py @@ -0,0 +1,13 @@ +from ...vocab import Vocab + +def test_lexeme_text(): + vocab = Vocab() + lex = vocab[u'the'] + assert lex.text == u'the' + + +def test_lexeme_lex_id(): + vocab = Vocab() + lex1 = vocab[u'the'] + lex2 = vocab[u'be'] + assert lex1.lex_id != lex2.lex_id diff --git a/spacy/tests/regression/test_issue1061.py b/spacy/tests/regression/test_issue1061.py new file mode 100644 index 000000000..821ca2bfc --- /dev/null +++ b/spacy/tests/regression/test_issue1061.py @@ -0,0 +1,27 @@ +from __future__ import unicode_literals + +from ...symbols import ORTH + +from ...vocab import Vocab +from ...en import English + + +def test_issue1061(): + '''Test special-case works after tokenizing. Was caching problem.''' + text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.' + tokenizer = English.Defaults.create_tokenizer() + doc = tokenizer(text) + assert 'MATH' in [w.text for w in doc] + assert '_MATH_' not in [w.text for w in doc] + + tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) + doc = tokenizer(text) + assert '_MATH_' in [w.text for w in doc] + assert 'MATH' not in [w.text for w in doc] + + # For sanity, check it works when pipeline is clean. 
+ tokenizer = English.Defaults.create_tokenizer() + tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}]) + doc = tokenizer(text) + assert '_MATH_' in [w.text for w in doc] + assert 'MATH' not in [w.text for w in doc] diff --git a/spacy/tests/regression/test_issue1207.py b/spacy/tests/regression/test_issue1207.py new file mode 100644 index 000000000..a71faebcb --- /dev/null +++ b/spacy/tests/regression/test_issue1207.py @@ -0,0 +1,25 @@ +from __future__ import unicode_literals +from ..util import get_doc +from ...vocab import Vocab +from ...en import English + + +def test_span_noun_chunks(): + vocab = Vocab(lang='en', tag_map=English.Defaults.tag_map) + words = "Employees are recruiting talented staffers from overseas .".split() + heads = [1, 1, 0, 1, -2, -1, -5] + deps = ['nsubj', 'aux', 'ROOT', 'nmod', 'dobj', 'adv', 'pobj'] + tags = ['NNS', 'VBP', 'VBG', 'JJ', 'NNS', 'IN', 'NN', '.'] + doc = get_doc(vocab, words=words, heads=heads, deps=deps, tags=tags) + doc.is_parsed = True + + noun_chunks = [np.text for np in doc.noun_chunks] + assert noun_chunks == ['Employees', 'talented staffers', 'overseas'] + + span = doc[0:4] + noun_chunks = [np.text for np in span.noun_chunks] + assert noun_chunks == ['Employees'] + + for sent in doc.sents: + noun_chunks = [np.text for np in sent.noun_chunks] + assert noun_chunks == ['Employees', 'talented staffers', 'overseas'] diff --git a/spacy/tests/regression/test_issue1281.py b/spacy/tests/regression/test_issue1281.py new file mode 100644 index 000000000..17307b1d6 --- /dev/null +++ b/spacy/tests/regression/test_issue1281.py @@ -0,0 +1,13 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', [ + "She hasn't done the housework.", + "I haven't done it before.", + "you daren't do that"]) +def test_issue1281(en_tokenizer, text): + tokens = en_tokenizer(text) + assert tokens[2].text == "n't" diff --git a/spacy/tests/regression/test_issue1387.py b/spacy/tests/regression/test_issue1387.py new file mode 100644 index 000000000..c5f01d145 --- /dev/null +++ b/spacy/tests/regression/test_issue1387.py @@ -0,0 +1,22 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...symbols import POS, VERB, VerbForm_part +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer +from ..util import get_doc + +import pytest + +def test_issue1387(): + tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} + index = {"verb": ("cope","cop")} + exc = {"verb": {"coping": ("cope",)}} + rules = {"verb": [["ing", ""]]} + lemmatizer = Lemmatizer(index, exc, rules) + vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + doc = get_doc(vocab, ["coping"]) + doc[0].tag_ = 'VBG' + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" + diff --git a/spacy/tests/regression/test_issue693.py b/spacy/tests/regression/test_issue693.py index e4d907716..5deeb3215 100644 --- a/spacy/tests/regression/test_issue693.py +++ b/spacy/tests/regression/test_issue693.py @@ -14,7 +14,5 @@ def test_issue693(EN): doc2 = EN(text2) chunks1 = [chunk for chunk in doc1.noun_chunks] chunks2 = [chunk for chunk in doc2.noun_chunks] - for word in doc1: - print(word.text, word.dep_, word.head.text) assert len(chunks1) == 2 assert len(chunks2) == 2 diff --git a/spacy/tests/regression/test_issue995.py b/spacy/tests/regression/test_issue995.py index 633e96fb5..108b434a2 100644 --- a/spacy/tests/regression/test_issue995.py +++ b/spacy/tests/regression/test_issue995.py @@ -15,7 +15,6 @@ def test_issue955(doc): '''Test that we don't 
have any nested noun chunks''' seen_tokens = set() for np in doc.noun_chunks: - print(np.text, np.root.text, np.root.dep_, np.root.tag_) for word in np: key = (word.i, word.text) assert key not in seen_tokens diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index d22fa52ae..29aefe5c7 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -54,6 +54,17 @@ def test_spans_span_sent(doc): assert doc[6:7].sent.root.left_edge.text == 'This' +def test_spans_lca_matrix(en_tokenizer): + """Test span's lca matrix generation""" + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc[:2].get_lca_matrix() + assert(lca[0, 0] == 0) + assert(lca[0, 1] == -1) + assert(lca[1, 0] == -1) + assert(lca[1, 1] == 1) + + def test_spans_default_sentiment(en_tokenizer): """Test span.sentiment property's default averaging behaviour""" text = "good stuff bad stuff" diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 5db0d0b2c..91ed7d2f1 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -47,3 +47,20 @@ def test_tagger_lemmatizer_lemma_assignment(EN): assert all(t.lemma_ == '' for t in doc) EN.tagger(doc) assert all(t.lemma_ != '' for t in doc) + + +from ...symbols import POS, VERB, VerbForm_part +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer +from ..util import get_doc +def test_tagger_lemmatizer_exceptions(): + index = {"verb": ("cope","cop")} + exc = {"verb": {"coping": ("cope",)}} + rules = {"verb": [["ing", ""]]} + tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} + lemmatizer = Lemmatizer(index, exc, rules) + vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + doc = get_doc(vocab, ["coping"]) + doc[0].tag_ = 'VBG' + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" diff --git a/spacy/tests/th/test_tokenizer.py b/spacy/tests/th/test_tokenizer.py new file mode 100644 index 000000000..851c6f067 --- /dev/null +++ b/spacy/tests/th/test_tokenizer.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +TOKENIZER_TESTS = [ + ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_thai_tokenizer(th_tokenizer, text, expected_tokens): + tokens = [token.text for token in th_tokenizer(text)] + assert tokens == expected_tokens diff --git a/spacy/tests/tokenizer/test_customized_tokenizer.py b/spacy/tests/tokenizer/test_customized_tokenizer.py new file mode 100644 index 000000000..855f3386c --- /dev/null +++ b/spacy/tests/tokenizer/test_customized_tokenizer.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...en import English +from ...tokenizer import Tokenizer +from ... 
import util + +import pytest + +@pytest.fixture +def tokenizer(en_vocab): + prefix_re = util.compile_prefix_regex(English.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(English.Defaults.suffixes) + custom_infixes = ['\.\.\.+', + '(?<=[0-9])-(?=[0-9])', + # '(?<=[0-9]+),(?=[0-9]+)', + '[0-9]+(,[0-9]+)+', + u'[\[\]!&:,()\*—–\/-]'] + + infix_re = util.compile_infix_regex(custom_infixes) + return Tokenizer(en_vocab, + English.Defaults.tokenizer_exceptions, + prefix_re.search, + suffix_re.search, + infix_re.finditer, + token_match=None) + +def test_customized_tokenizer_handles_infixes(tokenizer): + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion." + context = [word.text for word in tokenizer(sentence)] + assert context == [u'The', u'8', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', + u'for', + u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] + + # the trailing '-' may cause Assertion Error + sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion." + context = [word.text for word in tokenizer(sentence)] + assert context == [u'The', u'8', u'-', u'and', u'10', u'-', u'county', u'definitions', u'are', u'not', u'used', + u'for', + u'the', u'greater', u'Southern', u'California', u'Megaregion', u'.'] diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 959067110..3bb6521f1 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -33,13 +33,10 @@ URLS_SHOULD_MATCH = [ "http://userid:password@example.com/", "http://142.42.1.1/", "http://142.42.1.1:8080/", - "http://⌘.ws", - "http://⌘.ws/", "http://foo.com/blah_(wikipedia)#cite-1", "http://foo.com/blah_(wikipedia)_blah#cite-1", "http://foo.com/unicode_(✪)_in_parens", "http://foo.com/(something)?after=parens", - "http://☺.damowmow.com/", "http://code.google.com/events/#&product=browser", "http://j.mp", "ftp://foo.bar/baz", @@ -49,14 +46,17 @@ URLS_SHOULD_MATCH = [ "http://a.b-c.de", "http://223.255.255.254", "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 - "http://✪df.ws/123", - "http://➡.ws/䨹", - "http://مثال.إختبار", - "http://例子.测试", - "http://उदाहरण.परीक्षा", pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"), pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"), + pytest.mark.xfail("http://⌘.ws"), + pytest.mark.xfail("http://⌘.ws/"), + pytest.mark.xfail("http://☺.damowmow.com/"), + pytest.mark.xfail("http://✪df.ws/123"), + pytest.mark.xfail("http://➡.ws/䨹"), + pytest.mark.xfail("http://مثال.إختبار"), + pytest.mark.xfail("http://例子.测试"), + pytest.mark.xfail("http://उदाहरण.परीक्षा"), ] URLS_SHOULD_NOT_MATCH = [ @@ -83,7 +83,6 @@ URLS_SHOULD_NOT_MATCH = [ "http://foo.bar/foo(bar)baz quux", "ftps://foo.bar/", "http://-error-.invalid/", - "http://-a.b.co", "http://a.b-.co", "http://0.0.0.0", "http://10.1.1.0", @@ -99,6 +98,7 @@ URLS_SHOULD_NOT_MATCH = [ pytest.mark.xfail("foo.com"), pytest.mark.xfail("http://1.1.1.1.1"), pytest.mark.xfail("http://www.foo.bar./"), + pytest.mark.xfail("http://-a.b.co"), ] diff --git a/spacy/th/__init__.py b/spacy/th/__init__.py new file mode 100644 index 000000000..0ed5268c6 --- /dev/null +++ b/spacy/th/__init__.py @@ -0,0 +1,28 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .language_data import * +from ..language import 
Language, BaseDefaults +from ..attrs import LANG +from ..tokenizer import Tokenizer +from ..tokens import Doc +class ThaiDefaults(BaseDefaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'th' + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + tag_map = TAG_MAP + stop_words = set(STOP_WORDS) + + +class Thai(Language): + lang = 'th' + Defaults = ThaiDefaults + def make_doc(self, text): + try: + from pythainlp.tokenize import word_tokenize + except ImportError: + raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " + "https://github.com/wannaphongcom/pythainlp/") + words = [x for x in list(word_tokenize(text,"newmm"))] + return Doc(self.vocab, words=words, spaces=[False]*len(words)) \ No newline at end of file diff --git a/spacy/th/language_data.py b/spacy/th/language_data.py new file mode 100644 index 000000000..03800ba19 --- /dev/null +++ b/spacy/th/language_data.py @@ -0,0 +1,25 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# import base language data +from .. import language_data as base + + +# import util functions +from ..language_data import update_exc, strings_to_exc + + +# import language-specific data from files +#from .tag_map import TAG_MAP +from .tag_map import TAG_MAP +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +TAG_MAP = dict(TAG_MAP) +STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) + +# export __all__ = ["TAG_MAP", "STOP_WORDS"] +__all__ = ["TAG_MAP", "STOP_WORDS","TOKENIZER_EXCEPTIONS"] \ No newline at end of file diff --git a/spacy/th/stop_words.py b/spacy/th/stop_words.py new file mode 100644 index 000000000..e13dec984 --- /dev/null +++ b/spacy/th/stop_words.py @@ -0,0 +1,62 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt +# stop words as whitespace-separated list +STOP_WORDS = set(""" +นี้ นํา นั้น นัก นอกจาก ทุก ที่สุด ที่ ทําให้ ทํา ทาง ทั้งนี้ ดัง ซึ่ง ช่วง จาก จัด จะ คือ ความ ครั้ง คง ขึ้น ของ +ขอ รับ ระหว่าง รวม ยัง มี มาก มา พร้อม พบ ผ่าน ผล บาง น่า เปิดเผย เปิด เนื่องจาก เดียวกัน เดียว เช่น เฉพาะ เข้า ถ้า +ถูก ถึง ต้อง ต่างๆ ต่าง ต่อ ตาม ตั้งแต่ ตั้ง ด้าน ด้วย อีก อาจ ออก อย่าง อะไร อยู่ อยาก หาก หลาย หลังจาก แต่ เอง เห็น +เลย เริ่ม เรา เมื่อ เพื่อ เพราะ เป็นการ เป็น หลัง หรือ หนึ่ง ส่วน ส่ง สุด สําหรับ ว่า ลง ร่วม ราย ขณะ ก่อน ก็ การ กับ กัน +กว่า กล่าว จึง ไว้ ไป ได้ ให้ ใน โดย แห่ง แล้ว และ แรก แบบ ๆ ทั้ง วัน เขา เคย ไม่ อยาก เกิน เกินๆ เกี่ยวกัน เกี่ยวกับ +เกี่ยวข้อง เกี่ยวเนื่อง เกี่ยวๆ เกือบ เกือบจะ เกือบๆ แก แก่ แก้ไข ใกล้ ใกล้ๆ ไกล ไกลๆ ขณะเดียวกัน ขณะใด ขณะใดๆ ขณะที่ ขณะนั้น ขณะนี้ ขณะหนึ่ง ขวาง +ขวางๆ ขั้น ใคร ใคร่ ใคร่จะ ใครๆ ง่าย ง่ายๆ ไง จง จด จน จนกระทั่ง จนกว่า จนขณะนี้ จนตลอด จนถึง จนทั่ว จนบัดนี้ จนเมื่อ จนแม้ จนแม้น +จรด จรดกับ จริง จริงจัง จริงๆ จริงๆจังๆ จวน จวนจะ จวนเจียน จวบ ซึ่งก็ ซึ่งก็คือ ซึ่งกัน ซึ่งกันและกัน ซึ่งได้แก่ ซึ่งๆ ณ ด้วย ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยประการฉะนี้ +ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดั่ง ดังกล่าว ดังกับ ดั่งกับ ดังกับว่า ดั่งกับว่า ดังเก่า +ดั่งเก่า ดังเคย ใดๆ ได้ ได้แก่ ได้แต่ ได้ที่ ได้มา ได้รับ ตน ตนเอง ตนฯ ตรง ตรงๆ ตลอด ตลอดกาล ตลอดกาลนาน ตลอดจน ตลอดถึง ตลอดทั้ง +ตลอดทั่ว ตลอดทั่วถึง ตลอดทั่วทั้ง ตลอดปี ตลอดไป ตลอดมา ตลอดระยะเวลา ตลอดวัน ตลอดเวลา ตลอดศก ต่อ ต่อกัน ถึงแก่ ถึงจะ ถึงบัดนั้น ถึงบัดนี้ +ถึงเมื่อ ถึงเมื่อใด ถึงเมื่อไร ถึงแม้ ถึงแม้จะ ถึงแม้ว่า 
ถึงอย่างไร ถือ ถือว่า ถูกต้อง ถูกๆ เถอะ เถิด ทรง ทว่า ทั้งคน ทั้งตัว ทั้งที ทั้งที่ ทั้งนั้น ทั้งนั้นด้วย ทั้งนั้นเพราะ +นอก นอกจากที่ นอกจากนั้น นอกจากนี้ นอกจากว่า นอกนั้น นอกเหนือ นอกเหนือจาก น้อย น้อยกว่า น้อยๆ นะ น่ะ นักๆ นั่น นั่นไง นั่นเป็น นั่นแหละ +นั่นเอง นั้นๆ นับ นับจากนั้น นับจากนี้ นับตั้งแต่ นับแต่ นับแต่ที่ นับแต่นั้น เป็นต้น เป็นต้นไป เป็นต้นมา เป็นแต่ เป็นแต่เพียง เป็นที เป็นที่ เป็นที่สุด เป็นเพราะ +เป็นเพราะว่า เป็นเพียง เป็นเพียงว่า เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอันว่า เป็นอันๆ เป็นอาทิ เป็นๆ เปลี่ยน เปลี่ยนแปลง เปิด เปิดเผย ไป่ ผ่าน ผ่านๆ +ผิด ผิดๆ ผู้ เพียงเพื่อ เพียงไร เพียงไหน เพื่อที่ เพื่อที่จะ เพื่อว่า เพื่อให้ ภาค ภาคฯ ภาย ภายใต้ ภายนอก ภายใน ภายภาค ภายภาคหน้า ภายหน้า ภายหลัง +มอง มองว่า มัก มักจะ มัน มันๆ มั้ย มั้ยนะ มั้ยนั่น มั้ยเนี่ย มั้ยล่ะ ยืนนาน ยืนยง ยืนยัน ยืนยาว เยอะ เยอะแยะ เยอะๆ แยะ แยะๆ รวด รวดเร็ว ร่วม รวมกัน ร่วมกัน +รวมด้วย ร่วมด้วย รวมถึง รวมทั้ง ร่วมมือ รวมๆ ระยะ ระยะๆ ระหว่าง รับรอง รึ รึว่า รือ รือว่า สิ้นกาลนาน สืบเนื่อง สุดๆ สู่ สูง สูงกว่า สูงส่ง สูงสุด สูงๆ เสมือนกับ +เสมือนว่า เสร็จ เสร็จกัน เสร็จแล้ว เสร็จสมบูรณ์ เสร็จสิ้น เสีย เสียก่อน เสียจน เสียจนกระทั่ง เสียจนถึง เสียด้วย เสียนั่น เสียนั่นเอง เสียนี่ เสียนี่กระไร เสียยิ่ง +เสียยิ่งนัก เสียแล้ว ใหญ่ๆ ให้ดี ให้แด่ ให้ไป ใหม่ ให้มา ใหม่ๆ ไหน ไหนๆ อดีต อนึ่ง อย่าง อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น +อย่างนี้ อย่างโน้น ก็คือ ก็แค่ ก็จะ ก็ดี ก็ได้ ก็ต่อเมื่อ ก็ตาม ก็ตามแต่ ก็ตามที ก็แล้วแต่ กระทั่ง กระทำ กระนั้น กระผม กลับ กล่าวคือ กลุ่ม กลุ่มก้อน +กลุ่มๆ กว้าง กว้างขวาง กว้างๆ ก่อนหน้า ก่อนหน้านี้ ก่อนๆ กันดีกว่า กันดีไหม กันเถอะ กันนะ กันและกัน กันไหม กันเอง กำลัง กำลังจะ กำหนด กู เก็บ +เกิด เกี่ยวข้อง แก่ แก้ไข ใกล้ ใกล้ๆ ข้า ข้าง ข้างเคียง ข้างต้น ข้างบน ข้างล่าง ข้างๆ ขาด ข้าพเจ้า ข้าฯ เข้าใจ เขียน คงจะ คงอยู่ ครบ ครบครัน ครบถ้วน +ครั้งกระนั้น ครั้งก่อน ครั้งครา ครั้งคราว ครั้งใด ครั้งที่ ครั้งนั้น ครั้งนี้ ครั้งละ ครั้งหนึ่ง ครั้งหลัง ครั้งหลังสุด ครั้งไหน ครั้งๆ ครัน ครับ ครา คราใด คราที่ ครานั้น ครานี้ คราหนึ่ง +คราไหน คราว คราวก่อน คราวใด คราวที่ คราวนั้น คราวนี้ คราวโน้น คราวละ คราวหน้า คราวหนึ่ง คราวหลัง คราวไหน คราวๆ คล้าย คล้ายกัน คล้ายกันกับ +คล้ายกับ คล้ายกับว่า คล้ายว่า ควร ค่อน ค่อนข้าง ค่อนข้างจะ ค่อยไปทาง ค่อนมาทาง ค่อย ค่อยๆ คะ ค่ะ คำ คิด คิดว่า คุณ คุณๆ +เคยๆ แค่ แค่จะ แค่นั้น แค่นี้ แค่เพียง แค่ว่า แค่ไหน ใคร่ ใคร่จะ ง่าย ง่ายๆ จนกว่า จนแม้ จนแม้น จังๆ จวบกับ จวบจน จ้ะ จ๊ะ จะได้ จัง จัดการ จัดงาน จัดแจง +จัดตั้ง จัดทำ จัดหา จัดให้ จับ จ้า จ๋า จากนั้น จากนี้ จากนี้ไป จำ จำเป็น จำพวก จึงจะ จึงเป็น จู่ๆ ฉะนั้น ฉะนี้ ฉัน เฉกเช่น เฉย เฉยๆ ไฉน ช่วงก่อน +ช่วงต่อไป ช่วงถัดไป ช่วงท้าย ช่วงที่ ช่วงนั้น ช่วงนี้ ช่วงระหว่าง ช่วงแรก ช่วงหน้า ช่วงหลัง ช่วงๆ ช่วย ช้า ช้านาน ชาว ช้าๆ เช่นก่อน เช่นกัน เช่นเคย +เช่นดัง เช่นดังก่อน เช่นดังเก่า เช่นดังที่ เช่นดังว่า เช่นเดียวกัน เช่นเดียวกับ เช่นใด เช่นที่ เช่นที่เคย เช่นที่ว่า เช่นนั้น เช่นนั้นเอง เช่นนี้ เช่นเมื่อ เช่นไร เชื่อ +เชื่อถือ เชื่อมั่น เชื่อว่า ใช่ ใช่ไหม ใช้ ซะ ซะก่อน ซะจน ซะจนกระทั่ง ซะจนถึง ซึ่งได้แก่ ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น +ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดังกล่าว ดังกับว่า ดั่งกับว่า ดังเก่า ดั่งเก่า ดั่งเคย ต่างก็ ต่างหาก ตามด้วย ตามแต่ ตามที่ +ตามๆ เต็มไปด้วย เต็มไปหมด เต็มๆ แต่ก็ แต่ก่อน แต่จะ แต่เดิม แต่ต้อง แต่ถ้า แต่ทว่า แต่ที่ แต่นั้น แต่เพียง แต่เมื่อ แต่ไร แต่ละ แต่ว่า แต่ไหน แต่อย่างใด โต +โตๆ ใต้ ถ้าจะ ถ้าหาก ถึงแก่ ถึงแม้ ถึงแม้จะ ถึงแม้ว่า ถึงอย่างไร ถือว่า ถูกต้อง ทว่า ทั้งนั้นด้วย ทั้งปวง ทั้งเป็น ทั้งมวล ทั้งสิ้น ทั้งหมด ทั้งหลาย ทั้งๆ ทัน +ทันใดนั้น ทันที ทันทีทันใด ทั่ว ทำไม ทำไร ทำให้ ทำๆ ที ที่จริง 
ที่ซึ่ง ทีเดียว ทีใด ที่ใด ที่ได้ ทีเถอะ ที่แท้ ที่แท้จริง ที่นั้น ที่นี้ ทีไร ทีละ ที่ละ +ที่แล้ว ที่ว่า ที่แห่งนั้น ที่ไหน ทีๆ ที่ๆ ทุกคน ทุกครั้ง ทุกครา ทุกคราว ทุกชิ้น ทุกตัว ทุกทาง ทุกที ทุกที่ ทุกเมื่อ ทุกวัน ทุกวันนี้ ทุกสิ่ง ทุกหน ทุกแห่ง ทุกอย่าง +ทุกอัน ทุกๆ เท่า เท่ากัน เท่ากับ เท่าใด เท่าที่ เท่านั้น เท่านี้ เท่าไร เท่าไหร่ แท้ แท้จริง เธอ นอกจากว่า น้อย น้อยกว่า น้อยๆ น่ะ นั้นไว นับแต่นี้ นาง +นางสาว น่าจะ นาน นานๆ นาย นำ นำพา นำมา นิด นิดหน่อย นิดๆ นี่ นี่ไง นี่นา นี่แน่ะ นี่แหละ นี้แหล่ นี่เอง นี้เอง นู่น นู้น เน้น เนี่ย +เนี่ยเอง ในช่วง ในที่ ในเมื่อ ในระหว่าง บน บอก บอกแล้ว บอกว่า บ่อย บ่อยกว่า บ่อยครั้ง บ่อยๆ บัดดล บัดเดี๋ยวนี้ บัดนั้น บัดนี้ บ้าง บางกว่า +บางขณะ บางครั้ง บางครา บางคราว บางที บางที่ บางแห่ง บางๆ ปฏิบัติ ประกอบ ประการ ประการฉะนี้ ประการใด ประการหนึ่ง ประมาณ ประสบ ปรับ +ปรากฏ ปรากฏว่า ปัจจุบัน ปิด เป็นด้วย เป็นดัง เป็นต้น เป็นแต่ เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอาทิ ผ่านๆ ผู้ ผู้ใด เผื่อ เผื่อจะ เผื่อที่ เผื่อว่า ฝ่าย +ฝ่ายใด พบว่า พยายาม พร้อมกัน พร้อมกับ พร้อมด้วย พร้อมทั้ง พร้อมที่ พร้อมเพียง พวก พวกกัน พวกกู พวกแก พวกเขา พวกคุณ พวกฉัน พวกท่าน +พวกที่ พวกเธอ พวกนั้น พวกนี้ พวกนู้น พวกโน้น พวกมัน พวกมึง พอ พอกัน พอควร พอจะ พอดี พอตัว พอที พอที่ พอเพียง พอแล้ว พอสม พอสมควร +พอเหมาะ พอๆ พา พึง พึ่ง พื้นๆ พูด เพราะฉะนั้น เพราะว่า เพิ่ง เพิ่งจะ เพิ่ม เพิ่มเติม เพียง เพียงแค่ เพียงใด เพียงแต่ เพียงพอ เพียงเพราะ +เพื่อว่า เพื่อให้ ภายใต้ มองว่า มั๊ย มากกว่า มากมาย มิ มิฉะนั้น มิใช่ มิได้ มีแต่ มึง มุ่ง มุ่งเน้น มุ่งหมาย เมื่อก่อน เมื่อครั้ง เมื่อครั้งก่อน +เมื่อคราวก่อน เมื่อคราวที่ เมื่อคราว เมื่อคืน เมื่อเช้า เมื่อใด เมื่อนั้น เมื่อนี้ เมื่อเย็น เมื่อไร เมื่อวันวาน เมื่อวาน เมื่อไหร่ แม้ แม้กระทั่ง แม้แต่ แม้นว่า แม้ว่า +ไม่ค่อย ไม่ค่อยจะ ไม่ค่อยเป็น ไม่ใช่ ไม่เป็นไร ไม่ว่า ยก ยกให้ ยอม ยอมรับ ย่อม ย่อย ยังคง ยังงั้น ยังงี้ ยังโง้น ยังไง ยังจะ ยังแต่ ยาก +ยาว ยาวนาน ยิ่ง ยิ่งกว่า ยิ่งขึ้น ยิ่งขึ้นไป ยิ่งจน ยิ่งจะ ยิ่งนัก ยิ่งเมื่อ ยิ่งแล้ว ยิ่งใหญ่ ร่วมกัน รวมด้วย ร่วมด้วย รือว่า เร็ว เร็วๆ เราๆ เรียก เรียบ เรื่อย +เรื่อยๆ ไร ล้วน ล้วนจน ล้วนแต่ ละ ล่าสุด เล็ก เล็กน้อย เล็กๆ เล่าว่า แล้วกัน แล้วแต่ แล้วเสร็จ วันใด วันนั้น วันนี้ วันไหน สบาย สมัย สมัยก่อน +สมัยนั้น สมัยนี้ สมัยโน้น ส่วนเกิน ส่วนด้อย ส่วนดี ส่วนใด ส่วนที่ ส่วนน้อย ส่วนนั้น ส่วนมาก ส่วนใหญ่ สั้น สั้นๆ สามารถ สำคัญ สิ่ง +สิ่งใด สิ่งนั้น สิ่งนี้ สิ่งไหน สิ้น เสร็จแล้ว เสียด้วย เสียแล้ว แสดง แสดงว่า หน หนอ หนอย หน่อย หมด หมดกัน หมดสิ้น หรือไง หรือเปล่า หรือไม่ หรือยัง +หรือไร หากแม้ หากแม้น หากแม้นว่า หากว่า หาความ หาใช่ หารือ เหตุ เหตุผล เหตุนั้น เหตุนี้ เหตุไร เห็นแก่ เห็นควร เห็นจะ เห็นว่า เหลือ เหลือเกิน เหล่า +เหล่านั้น เหล่านี้ แห่งใด แห่งนั้น แห่งนี้ แห่งโน้น แห่งไหน แหละ ให้แก่ ใหญ่ ใหญ่โต อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น อย่างนี้ +อย่างโน้น อย่างมาก อย่างยิ่ง อย่างไร อย่างไรก็ อย่างไรก็ได้ อย่างไรเสีย อย่างละ อย่างหนึ่ง อย่างไหน อย่างๆ อัน อันจะ อันใด อันได้แก่ อันที่ +อันที่จริง อันที่จะ อันเนื่องมาจาก อันละ อันไหน อันๆ อาจจะ อาจเป็น อาจเป็นด้วย อื่น อื่นๆ เอ็ง เอา ฯ ฯล ฯลฯ +""".split()) \ No newline at end of file diff --git a/spacy/th/tag_map.py b/spacy/th/tag_map.py new file mode 100644 index 000000000..e225f7289 --- /dev/null +++ b/spacy/th/tag_map.py @@ -0,0 +1,81 @@ +# encoding: utf8 +# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) +from __future__ import unicode_literals + +from ..symbols import * + +TAG_MAP = { + #NOUN + "NOUN": {POS: NOUN}, + "NCMN": {POS: NOUN}, + "NTTL": {POS: NOUN}, + "CNIT": {POS: NOUN}, + "CLTV": {POS: NOUN}, + "CMTR": {POS: NOUN}, + 
"CFQC": {POS: NOUN}, + "CVBL": {POS: NOUN}, + #PRON + "PRON": {POS: PRON}, + "NPRP": {POS: PRON}, + # ADJ + "ADJ": {POS: ADJ}, + "NONM": {POS: ADJ}, + "VATT": {POS: ADJ}, + "DONM": {POS: ADJ}, + # ADV + "ADV": {POS: ADV}, + "ADVN": {POS: ADV}, + "ADVI": {POS: ADV}, + "ADVP": {POS: ADV}, + "ADVS": {POS: ADV}, + # INT + "INT": {POS: INTJ}, + # PRON + "PROPN": {POS: PROPN}, + "PPRS": {POS: PROPN}, + "PDMN": {POS: PROPN}, + "PNTR": {POS: PROPN}, + # DET + "DET": {POS: DET}, + "DDAN": {POS: DET}, + "DDAC": {POS: DET}, + "DDBQ": {POS: DET}, + "DDAQ": {POS: DET}, + "DIAC": {POS: DET}, + "DIBQ": {POS: DET}, + "DIAQ": {POS: DET}, + "DCNM": {POS: DET}, + # NUM + "NUM": {POS: NUM}, + "NCNM": {POS: NUM}, + "NLBL": {POS: NUM}, + "DCNM": {POS: NUM}, + # AUX + "AUX": {POS: AUX}, + "XVBM": {POS: AUX}, + "XVAM": {POS: AUX}, + "XVMM": {POS: AUX}, + "XVBB": {POS: AUX}, + "XVAE": {POS: AUX}, + # ADP + "ADP": {POS: ADP}, + "RPRE": {POS: ADP}, + # CCONJ + "CCONJ": {POS: CCONJ}, + "JCRG": {POS: CCONJ}, + # SCONJ + "SCONJ": {POS: SCONJ}, + "PREL": {POS: SCONJ}, + "JSBR": {POS: SCONJ}, + "JCMP": {POS: SCONJ}, + # PART + "PART": {POS: PART}, + "FIXN": {POS: PART}, + "FIXV": {POS: PART}, + "EAFF": {POS: PART}, + "AITT": {POS: PART}, + "NEG": {POS: PART}, + # PUNCT + "PUNCT": {POS: PUNCT}, + "PUNC": {POS: PUNCT} +} \ No newline at end of file diff --git a/spacy/th/tokenizer_exceptions.py b/spacy/th/tokenizer_exceptions.py new file mode 100644 index 000000000..0f933f1c1 --- /dev/null +++ b/spacy/th/tokenizer_exceptions.py @@ -0,0 +1,45 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +TOKENIZER_EXCEPTIONS = { + "ม.ค.": [ + {ORTH: "ม.ค.", LEMMA: "มกราคม"} + ], + "ก.พ.": [ + {ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"} + ], + "มี.ค.": [ + {ORTH: "มี.ค.", LEMMA: "มีนาคม"} + ], + "เม.ย.": [ + {ORTH: "เม.ย.", LEMMA: "เมษายน"} + ], + "พ.ค.": [ + {ORTH: "พ.ค.", LEMMA: "พฤษภาคม"} + ], + "มิ.ย.": [ + {ORTH: "มิ.ย.", LEMMA: "มิถุนายน"} + ], + "ก.ค.": [ + {ORTH: "ก.ค.", LEMMA: "กรกฎาคม"} + ], + "ส.ค.": [ + {ORTH: "ส.ค.", LEMMA: "สิงหาคม"} + ], + "ก.ย.": [ + {ORTH: "ก.ย.", LEMMA: "กันยายน"} + ], + "ต.ค.": [ + {ORTH: "ต.ค.", LEMMA: "ตุลาคม"} + ], + "พ.ย.": [ + {ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"} + ], + "ธ.ค.": [ + {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"} + ] +} \ No newline at end of file diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index c094bea0d..799e4bdaa 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -186,7 +186,13 @@ cdef class Tokenizer: cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: - return False + # See 'flush_cache' below for hand-wringing about + # how to handle this. 
+ cached = <_Cached*>self._specials.get(key) + if cached == NULL: + return False + else: + self._cache.set(key, cached) cdef int i if cached.is_lex: for i in range(cached.length): @@ -201,9 +207,15 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) - self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + special_case = self._specials.get(orig_key) + if special_case is not NULL: + for i in range(special_case.length): + tokens.push_back(&special_case.data.tokens[i], False) + self._cache.set(orig_key, special_case) + else: + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + self._attach_tokens(tokens, span, &prefixes, &suffixes) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, @@ -300,7 +312,8 @@ cdef class Tokenizer: start = infix_end span = string[start:] - tokens.push_back(self.vocab.get(tokens.mem, span), False) + if span: + tokens.push_back(self.vocab.get(tokens.mem, span), False) cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): lexeme = deref(it) @@ -389,5 +402,29 @@ cdef class Tokenizer: cached.data.tokens = self.vocab.make_fused_token(substrings) key = hash_string(string) self._specials.set(key, cached) - self._cache.set(key, cached) self._rules[string] = substrings + # After changing the tokenization rules, the previous tokenization + # may be stale. + self.flush_cache() + + def flush_cache(self): + '''Flush the tokenizer's cache. May not free memory immediately. + + This is called automatically after `add_special_case`, but if you + write to the prefix or suffix functions, you'll have to call this + yourself. You may also need to flush the tokenizer cache after + changing the lex_attr_getter functions. + ''' + cdef hash_t key + for key in self._cache.keys(): + special_case = self._specials.get(key) + # Don't free data shared with special-case rules + if special_case is not NULL: + continue + cached = <_Cached*>self._cache.get(key) + if cached is not NULL: + self.mem.free(cached) + self._cache = PreshMap(1000) + # We could here readd the data from specials --- but if we loop over + # a bunch of special-cases, we'll get a quadratic behaviour. The extra + # lookup isn't so bad? Tough to tell. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ca5a3d696..aca35a73f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -614,6 +614,56 @@ cdef class Doc: self.is_tagged = bool(TAG in attrs or POS in attrs) return self + + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy doc. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). + Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. + ''' + # Efficiency notes: + # + # We can easily improve the performance here by iterating in Cython. + # To loop over the tokens in Cython, the easiest way is: + # for token in doc.c[:doc.c.length]: + # head = token + token.head + # Both token and head will be TokenC* here. The token.head attribute + # is an integer offset. 
+ def __pairwise_lca(token_j, token_k, lca_matrix): + if lca_matrix[token_j.i][token_k.i] != -2: + return lca_matrix[token_j.i][token_k.i] + elif token_j == token_k: + lca_index = token_j.i + elif token_k.head == token_j: + lca_index = token_j.i + elif token_j.head == token_k: + lca_index = token_k.i + elif (token_j.head == token_j) and (token_k.head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix) + lca_matrix[token_j.i][token_k.i] = lca_index + lca_matrix[token_k.i][token_j.i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + for j in range(len(self)): + token_j = self[j] + for k in range(j, len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + + def to_bytes(self): """ Serialize, producing a byte string. diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index f9b1f3972..81c0243ee 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -64,8 +64,9 @@ def parse_tree(doc, light=False, flat=False): >>> trees = doc.print_tree() [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] """ + doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], - doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])) + doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])) merge_ents(doc_clone) # merge the entities into single tokens first return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents] diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 09927ab4c..ae28f698a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -130,6 +130,58 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy span. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). 
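A short usage sketch for the new Doc.get_lca_matrix method above; the exact indices depend on the parse produced by the installed English model, so the comments are illustrative rather than guaranteed:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Bob brought Alice the pizza')
    lca = doc.get_lca_matrix()

    # lca[j][k] is the token index of the lowest common ancestor of tokens
    # j and k, or -1 if they have no common ancestor.
    print(lca[0][2])   # typically 1, i.e. "brought", the shared head of "Bob" and "Alice"
    print(lca[1][1])   # 1, a token counts as its own lowest common ancestor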
+ Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. + ''' + + def __pairwise_lca(token_j, token_k, lca_matrix, margins): + offset = margins[0] + token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k + token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j + token_j_i = token_j.i - offset + token_k_i = token_k.i - offset + + if lca_matrix[token_j_i][token_k_i] != -2: + return lca_matrix[token_j_i][token_k_i] + elif token_j == token_k: + lca_index = token_j_i + elif token_k_head == token_j: + lca_index = token_j_i + elif token_j_head == token_k: + lca_index = token_k_i + elif (token_j_head == token_j) and (token_k_head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins) + + lca_matrix[token_j_i][token_k_i] = lca_index + lca_matrix[token_k_i][token_j_i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + margins = [self.start, self.end] + + for j in range(len(self)): + token_j = self[j] + for k in range(len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + + + cpdef int _recalculate_indices(self) except -1: if self.end > self.doc.length \ or self.doc.c[self.start].idx != self.start_char \ @@ -230,7 +282,7 @@ cdef class Span: # so it's okay once we have the Span objects. See Issue #375 spans = [] for start, end, label in self.doc.noun_chunks_iterator(self): - spans.append(Span(self, start, end, label=label)) + spans.append(Span(self.doc, start, end, label=label)) for span in spans: yield span diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py index 1847a7d8d..bde0054b5 100644 --- a/spacy/zh/__init__.py +++ b/spacy/zh/__init__.py @@ -7,5 +7,6 @@ class Chinese(Language): def make_doc(self, text): import jieba - words = list(jieba.cut(text, cut_all=True)) + words = list(jieba.cut(text, cut_all=False)) + words=[x for x in words if x] return Doc(self.vocab, words=words, spaces=[False]*len(words)) diff --git a/website/_harp.json b/website/_harp.json index cb476541a..37a0b54dd 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,7 +12,7 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.8", + "SPACY_VERSION": "1.9", "LATEST_NEWS": { "url": "/docs/usage/models", "title": "The first official Spanish model is here!" 
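The jieba change above (full mode switched to precise mode, plus dropping empty strings) is easier to judge on a concrete string. This sketch only needs the third-party jieba package; the sample sentence is an arbitrary stand-in:

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import jieba

    text = '我来到北京清华大学'
    print(list(jieba.cut(text, cut_all=True)))    # full mode: overlapping, redundant segments
    print(list(jieba.cut(text, cut_all=False)))   # precise mode: one non-overlapping segmentation

    # As in the patched make_doc, empty strings are filtered out before
    # the words are passed to Doc.
    words = [w for w in jieba.cut(text, cut_all=False) if w]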
diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index ea3a225bf..51db4f4e2 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -21,7 +21,7 @@ p +pos-row("$", "SYM", "SymType=currency", "symbol, currency") +pos-row("ADD", "X", "", "email") +pos-row("AFX", "ADJ", "Hyph=yes", "affix") - +pos-row("BES", "VERB", "", 'auxillary "be"') + +pos-row("BES", "VERB", "", 'auxiliary "be"') +pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating") +pos-row("CD", "NUM", "NumType=card", "cardinal number") +pos-row("DT", "DET", "determiner") @@ -35,7 +35,7 @@ p +pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative") +pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative") +pos-row("LS", "PUNCT", "NumType=ord", "list item marker") - +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary") + +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary") +pos-row("NFP", "PUNCT", "", "superfluous punctuation") +pos-row("NIL", "", "", "missing tag") +pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass") diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 8c6b8fb10..d4b01a819 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -38,6 +38,11 @@ p +h(2, "pos-tagging") Part-of-speech Tagging ++infobox("Tip: Understanding tags") + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a tag. For example, + | #[code spacy.explain("RB")] will return "adverb". + include _annotation/_pos-tags +h(2, "lemmatization") Lemmatization @@ -65,10 +70,20 @@ p +h(2, "dependency-parsing") Syntactic Dependency Parsing ++infobox("Tip: Understanding labels") + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a label. For example, + | #[code spacy.explain("prt")] will return "particle". + include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition ++infobox("Tip: Understanding entity types") + | In spaCy v1.9+, you can also use #[code spacy.explain()] to get the + | description for the string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". + include _annotation/_named-entities +h(2, "json-input") JSON input format for training diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index adcd111a3..1c2911f52 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -272,7 +272,7 @@ p Import the document contents from a binary string. p | Retokenize the document, such that the span at | #[code doc.text[start_idx : end_idx]] is merged into a single token. If - | #[code start_idx] and #[end_idx] do not mark start and end token + | #[code start_idx] and #[code end_idx] do not mark start and end token | boundaries, the document remains unchanged. +table(["Name", "Type", "Description"]) diff --git a/website/docs/api/features.jade b/website/docs/api/features.jade index 018790145..21481cf65 100644 --- a/website/docs/api/features.jade +++ b/website/docs/api/features.jade @@ -18,7 +18,7 @@ p | consisting of the words to be processed. p - | Each state consists of the words on the stack (if any), which consistute + | Each state consists of the words on the stack (if any), which constitute | the current entity being constructed. 
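The three "Tip" infoboxes added above all point at the same helper; the expected strings below are taken directly from the infobox text:

    import spacy

    print(spacy.explain('RB'))        # 'adverb'
    print(spacy.explain('prt'))       # 'particle'
    print(spacy.explain('LANGUAGE'))  # 'any named language'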
We also have the current word, and | the two subsequent words. Finally, we also have the entities previously | built. diff --git a/website/docs/api/index.jade b/website/docs/api/index.jade index 24f3d4458..7e3f1a906 100644 --- a/website/docs/api/index.jade +++ b/website/docs/api/index.jade @@ -6,7 +6,7 @@ include ../../_includes/_mixins p | Here's a quick comparison of the functionalities offered by spaCy, - | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet], + | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet], | #[+a("http://www.nltk.org/py-modindex.html") NLTK] and | #[+a("http://stanfordnlp.github.io/CoreNLP/") CoreNLP]. @@ -107,7 +107,7 @@ p p | In 2016, Google released their - | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet] + | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet] | library, setting a new state of the art for syntactic dependency parsing | accuracy. SyntaxNet's algorithm is very similar to spaCy's. The main | difference is that SyntaxNet uses a neural network while spaCy uses a @@ -129,7 +129,7 @@ p +cell=data +row - +cell #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") Parsey McParseface] + +cell #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") Parsey McParseface] each data in [ 94.15, 89.08, 94.77 ] +cell=data diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 770ee3e9b..d2d3d0f27 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -222,7 +222,7 @@ p The sentence span that this span is a part of. p | The token within the span that's highest in the parse tree. If there's a - | tie, the earlist is prefered. + | tie, the earliest is preferred. +table(["Name", "Type", "Description"]) +footrow diff --git a/website/docs/api/vocab.jade b/website/docs/api/vocab.jade index 7490bccf4..c036c650b 100644 --- a/website/docs/api/vocab.jade +++ b/website/docs/api/vocab.jade @@ -124,7 +124,7 @@ p +cell #[code Lexeme] +cell The lexeme indicated by the given ID. -+h(2, "iter") Span.__iter__ ++h(2, "iter") Vocab.__iter__ +tag method p Iterate over the lexemes in the vocabulary. diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 703a185d6..c2ce271aa 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -313,7 +313,7 @@ "author": "Clark Grubb" }, "A very (very) short primer on spacy.io": { - "url": "http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html", + "url": "https://web.archive.org/web/20161219095416/http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html", "author": "Nimrod Milo " } }, diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 30c4486b0..02dfb79ca 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -28,7 +28,7 @@ p | #[a(href="#word-vectors") word vectors]. +item - | #[strong Set up] a #[a(href="#model-directory") model direcory] and #[strong train] the #[a(href="#train-tagger-parser") tagger and parser]. + | #[strong Set up] a #[a(href="#model-directory") model directory] and #[strong train] the #[a(href="#train-tagger-parser") tagger and parser]. 
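Two of the small API-doc corrections above, the Span.root description and the Vocab.__iter__ heading, in one hedged sketch; it assumes the English model, and the example phrase follows the usual span docs:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'I like New York in Autumn.')

    span = doc[2:4]            # "New York"
    print(span.root.text)      # expected: u'York', the span token highest in the parse tree

    # Vocab.__iter__ (not Span.__iter__, as the heading previously read)
    # yields Lexeme objects:
    lexeme = next(iter(nlp.vocab))
    print(type(lexeme).__name__)   # 'Lexeme'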
p | For some languages, you may also want to develop a solution for @@ -303,7 +303,7 @@ p p | Because languages can vary in quite arbitrary ways, spaCy avoids | organising the language data into an explicit inheritance hierarchy. - | Instead, reuseable functions and data are collected as atomic pieces in + | Instead, reusable functions and data are collected as atomic pieces in | the #[code spacy.language_data] package. +aside-code("Example"). @@ -525,13 +525,13 @@ p | └── oov_prob # optional ├── pos/ # optional | ├── model # via nlp.tagger.model.dump(path) - | └── config.json # via Langage.train + | └── config.json # via Language.train ├── deps/ # optional | ├── model # via nlp.parser.model.dump(path) - | └── config.json # via Langage.train + | └── config.json # via Language.train └── ner/ # optional ├── model # via nlp.entity.model.dump(path) - └── config.json # via Langage.train + └── config.json # via Language.train p | This creates a spaCy data directory with a vocabulary model, ready to be diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index b1fbba652..173521a33 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -21,7 +21,7 @@ p +h(2, "special-cases") Adding special case tokenization rules p - | Most domains have at least some idiosyncracies that require custom + | Most domains have at least some idiosyncrasies that require custom | tokenization rules. Here's how to add a special case rule to an existing | #[+api("tokenizer") #[code Tokenizer]] instance: @@ -40,7 +40,9 @@ p { ORTH: u'me'}]) assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] - assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] + # Pronoun lemma is returned as -PRON- + # More details please see: https://spacy.io/docs/usage/troubleshooting#pron-lemma + assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that'] p | The special case doesn't have to match an entire whitespace-delimited @@ -85,8 +87,8 @@ p | algorithm in Python, optimized for readability rather than performance: +code. - def tokenizer_pseudo_code(text, find_prefix, find_suffix, - find_infixes, special_cases): + def tokenizer_pseudo_code(text, special_cases, + find_prefix, find_suffix, find_infixes): tokens = [] for substring in text.split(' '): suffixes = [] @@ -138,7 +140,7 @@ p p | Let's imagine you wanted to create a tokenizer for a new language. There - | are four things you would need to define: + | are five things you would need to define: +list("numbers") +item @@ -160,6 +162,11 @@ p | A function #[code infixes_finditer], to handle non-whitespace | separators, such as hyphens etc. + +item + | (Optional) A boolean function #[code token_match] matching strings + | that should never be split, overriding the previous rules. + | Useful for things like URLs or numbers. + p | You shouldn't usually need to create a #[code Tokenizer] subclass. 
| Standard usage is to use #[code re.compile()] to build a regular @@ -172,12 +179,18 @@ p prefix_re = re.compile(r'''[\[\("']''') suffix_re = re.compile(r'''[\]\)"']''') + infix_re = re.compile(r'''[-~]''') + simple_url_re = re.compile(r'''^https?://''') def create_tokenizer(nlp): return Tokenizer(nlp.vocab, + rules={}, prefix_search=prefix_re.search, - suffix_search=suffix_re.search) + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=simple_url_re.match + ) - nlp = spacy.load('en', tokenizer=create_make_doc) + nlp = spacy.load('en', create_make_doc=create_tokenizer) p | If you need to subclass the tokenizer instead, the relevant methods to @@ -214,7 +227,7 @@ p def __call__(self, text): words = text.split(' ') # All tokens 'own' a subsequent space character in this tokenizer - spaces = [True] * len(word) + spaces = [True] * len(words) return Doc(self.vocab, words=words, spaces=spaces) p diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 9ad2fde5f..092c996b3 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -87,7 +87,7 @@ p | The other way to install spaCy is to clone its | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is | the common way if you want to make changes to the code base. You'll need to - | make sure that you have a development enviroment consisting of a Python + | make sure that you have a development environment consisting of a Python | distribution including header files, a compiler, | #[+a("https://pip.pypa.io/en/latest/installing/") pip], | #[+a("https://virtualenv.pypa.io/") virtualenv] and diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 138b0058d..2fd390d26 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -83,7 +83,7 @@ p +h(2, "examples-word-vectors") Word vectors +code. - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") + doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.") apples = doc[0] oranges = doc[2] diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 9bb75ba9a..4951ea211 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -67,7 +67,7 @@ p python -m spacy download en_core_web_md # download exact model version (doesn't create shortcut link) - python -m spacy download en_core_web_md-1.2.0 --direct + python -m spacy download en_core_web_md-1.2.1 --direct p | The download command will #[+a("#download-pip") install the model] via @@ -96,10 +96,10 @@ p +code(false, "bash"). # with external URL - pip install #{gh("spacy-models")}/releases/download/en_core_web_md-1.2.0/en_core_web_md-1.2.0.tar.gz + pip install #{gh("spacy-models")}/releases/download/en_core_web_md-1.2.1/en_core_web_md-1.2.1.tar.gz # with local file - pip install /Users/you/en_core_web_md-1.2.0.tar.gz + pip install /Users/you/en_core_web_md-1.2.1.tar.gz p | By default, this will install the model into your #[code site-packages] @@ -198,12 +198,43 @@ p nlp = en_core_web_md.load() doc = nlp(u'This is a sentence.') ++h(3, "models-download") Downloading and requiring model dependencies + +p + | spaCy's built-in #[+api("cli#download") #[code download]] command + | is mostly intended as a convenient, interactive wrapper. It performs + | compatibility checks and prints detailed error messages and warnings. 
+ | However, if you're downloading models as part of an automated build + | process, this only adds an unnecessary layer of complexity. If you know + | which models your application needs, you should be specifying them directly. + ++aside("Prevent re-downloading models") + | If you're installing a model from a URL, pip will usually re-download and + | re-install the package, even if you already have a matching + | version installed. To prevent this, simply add #[code #egg=] and the + | package name after the URL, e.g. #[code #egg=en_core_web_sm] or + | #[code #egg=en_core_web_sm-1.2.0]. This tells pip which package and version + | you're trying to download, and will skip the package if a matching + | installation is found. + +p + | Because all models are valid Python packages, you can add them to your + | application's #[code requirements.txt]. If you're running your own + | internal PyPi installation, you can simply upload the models there. pip's + | #[+a("https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format") requirements file format] + | supports both package names to download via a PyPi server, as well as direct + | URLs. + ++code("requirements.txt", "text"). + spacy>=1.8.0,<2.0.0 + -e #{gh("spacy-models")}/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz#egg=en_core_web_sm-1.2.0 + +h(2, "own-models") Using your own models p | If you've trained your own model, for example for | #[+a("/docs/usage/adding-languages") additional languages] or - | #[+a("/docs/usage/train-ner") custom named entities], you can save its + | #[+a("/docs/usage/training-ner") custom named entities], you can save its | state using the #[code Language.save_to_directory()] method. To make the | model more convenient to deploy, we recommend wrapping it as a Python | package. diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index cded00b6c..3f22ab43f 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -50,7 +50,7 @@ p +cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres] +row - +cell I read the paper yesteday + +cell I read the paper yesterday +cell read +cell read +cell verb diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/processing-text.jade index 4bd6132d2..600654f65 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/processing-text.jade @@ -98,7 +98,8 @@ p | important metadata, e.g. a JSON document. To pair up the metadata | with the processed #[code Doc] object, you should use the tee | function to split the generator in two, and then #[code izip] the - | extra stream to the document stream. + | extra stream to the document stream. Here's an + | #[a(href="https://github.com/explosion/spaCy/issues/172#issuecomment-183963403")= "example"] +h(2, "own-annotations") Bringing your own annotations diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index c4eb08f04..56b218c29 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -28,7 +28,7 @@ p | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a | path to it using the #[code --meta] flag. For more info on this, see the - | #[+a("/docs/usage/cli/#package") #[code package] command] documentation. + | #[+a("/docs/usage/cli#package") #[code package] command] documentation. +aside-code("meta.json", "json"). 
{ @@ -58,7 +58,7 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 78eb4905e..52eedd21e 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -150,8 +150,8 @@ p for itn in range(20): random.shuffle(train_data) for raw_text, entity_offsets in train_data: - gold = GoldParse(doc, entities=entity_offsets) doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) nlp.tagger(doc) loss = nlp.entity.update(doc, gold) nlp.end_training() diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 8a5c111bd..3a15ae2a1 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -33,12 +33,14 @@ p from spacy.vocab import Vocab from spacy.pipeline import EntityRecognizer from spacy.tokens import Doc + from spacy.gold import GoldParse vocab = Vocab() entity = EntityRecognizer(vocab, entity_types=['PERSON', 'LOC']) doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - entity.update(doc, ['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + gold = GoldParse(doc, entities=['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + entity.update(doc, gold) entity.model.end_training() @@ -65,13 +67,14 @@ p.o-inline-list from spacy.vocab import Vocab from spacy.pipeline import DependencyParser from spacy.tokens import Doc + from spacy.gold import GoldParse vocab = Vocab() parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct']) doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'), - (1, 'punct')]) + gold = GoldParse(doc, [1,1,3,1,1], ['nsubj', 'ROOT', 'compound', 'dobj', 'punct']) + parser.update(doc, gold) parser.model.end_training() @@ -120,7 +123,7 @@ p +code. from spacy.vocab import Vocab - from spacy.pipeline import Tagger + from spacy.tagger import Tagger from spacy.tagger import P2_orth, P1_orth from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index 3cc0a67a8..3fd6326d1 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -21,10 +21,12 @@ p +code. import numpy + import spacy + nlp = spacy.load('en') apples, and_, oranges = nlp(u'apples and oranges') print(apples.vector.shape) - # (1,) + # (300,) apples.similarity(oranges) p
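To round off the corrected word-vector snippet above, the usual continuation from the lightning tour; similarity values depend on the installed vectors, so treat the comments as indicative:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

    apples, oranges = doc[0], doc[2]
    boots, hippos = doc[6], doc[8]

    print(apples.vector.shape)          # (300,) with the bundled GloVe vectors
    print(apples.similarity(oranges))   # comparatively high
    print(boots.similarity(hippos))     # noticeably lower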