diff --git a/.github/contributors/Arvindcheenu.md b/.github/contributors/Arvindcheenu.md new file mode 100644 index 000000000..707a9821d --- /dev/null +++ b/.github/contributors/Arvindcheenu.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Arvind Srinivasan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-13 | +| GitHub username | arvindcheenu | +| Website (optional) | | diff --git a/.github/contributors/JannisTriesToCode.md b/.github/contributors/JannisTriesToCode.md new file mode 100644 index 000000000..d834794c5 --- /dev/null +++ b/.github/contributors/JannisTriesToCode.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ----------------------------- | +| Name | Jannis Rauschke | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22.05.2020 | +| GitHub username | JannisTriesToCode | +| Website (optional) | https://twitter.com/JRauschke | diff --git a/.github/contributors/hiroshi-matsuda-rit.md b/.github/contributors/hiroshi-matsuda-rit.md new file mode 100644 index 000000000..bf19125fb --- /dev/null +++ b/.github/contributors/hiroshi-matsuda-rit.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Hiroshi Matsuda | +| Company name (if applicable) | Megagon Labs, Tokyo | +| Title or role (if applicable) | Research Scientist | +| Date | June 6, 2020 | +| GitHub username | hiroshi-matsuda-rit | +| Website (optional) | | diff --git a/.github/contributors/jonesmartins.md b/.github/contributors/jonesmartins.md new file mode 100644 index 000000000..5663f6193 --- /dev/null +++ b/.github/contributors/jonesmartins.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jones Martins | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-10 | +| GitHub username | jonesmartins | +| Website (optional) | | diff --git a/.github/contributors/leomrocha.md b/.github/contributors/leomrocha.md new file mode 100644 index 000000000..495654153 --- /dev/null +++ b/.github/contributors/leomrocha.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Leonardo M. Rocha | +| Company name (if applicable) | | +| Title or role (if applicable) | Eng. 
| +| Date | 31/05/2020 | +| GitHub username | leomrocha | +| Website (optional) | | diff --git a/.github/contributors/theudas.md b/.github/contributors/theudas.md new file mode 100644 index 000000000..3d8a2bd95 --- /dev/null +++ b/.github/contributors/theudas.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Philipp Sodmann | +| Company name (if applicable) | Empolis | +| Title or role (if applicable) | | +| Date | 2017-05-06 | +| GitHub username | theudas | +| Website (optional) | | diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml new file mode 100644 index 000000000..3fb42ed01 --- /dev/null +++ b/.github/workflows/issue-manager.yml @@ -0,0 +1,29 @@ +name: Issue Manager + +on: + schedule: + - cron: "0 0 * * *" + issue_comment: + types: + - created + - edited + issues: + types: + - labeled + +jobs: + issue-manager: + runs-on: ubuntu-latest + steps: + - uses: tiangolo/issue-manager@0.2.1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + config: > + { + "resolved": { + "delay": "P7D", + "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", + "remove_label_on_comment": true, + "remove_label_on_close": true + } + } diff --git a/Makefile b/Makefile index cf96d6294..865bf44c5 100644 --- a/Makefile +++ b/Makefile @@ -5,8 +5,9 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core chmod a+rx $@ + cp $@ dist/spacy.pex dist/pytest.pex : wheelhouse/pytest-*.whl $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock @@ -14,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . -w ./wheelhouse - $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index f3e493f6a..d29e20ad1 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -187,7 +187,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats): width=("Width of CNN layers", "positional", None, int), embed_size=("Embedding rows", "positional", None, int), pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), - train_iters=("Number of iterations to pretrain", "option", "tn", int), + train_iters=("Number of iterations to train", "option", "tn", int), train_examples=("Number of labelled examples", "option", "eg", int), vectors_model=("Name or path to vectors model to learn from"), ) diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index d2472b6b9..a91102093 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -2,7 +2,7 @@ # coding: utf-8 """Using the parser to recognise your own semantics -spaCy's parser component can be used to trained to predict any type of tree +spaCy's parser component can be trained to predict any type of tree structure over your input text. 
You can also predict trees over whole documents or chat logs, with connections between the sentence-roots used to annotate discourse structure. In this example, we'll build a message parser for a common diff --git a/pyproject.toml b/pyproject.toml index 827e2a797..fe66494ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,6 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==7.4.0", + "thinc==7.4.1", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index ec30efc16..b93def651 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==7.4.0 +thinc==7.4.1 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index af3579f88..e556ba19c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,13 +38,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==7.4.0 + thinc==7.4.1 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==7.4.0 + thinc==7.4.1 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=1.0.2,<1.1.0 @@ -59,7 +59,7 @@ install_requires = [options.extras_require] lookups = - spacy_lookups_data>=0.3.1,<0.4.0 + spacy_lookups_data>=0.3.2,<0.4.0 cuda = cupy>=5.0.0b4,<9.0.0 cuda80 = @@ -78,7 +78,8 @@ cuda102 = cupy-cuda102>=5.0.0b4,<9.0.0 # Language tokenizers with external dependencies ja = - fugashi>=0.1.3 + sudachipy>=0.4.5 + sudachidict_core>=20200330 ko = natto-py==0.9.0 th = diff --git a/spacy/about.py b/spacy/about.py index 84dc86aa8..90b5f9245 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.4" +__version__ = "2.3.0.dev1" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6ce095c15..d4de9aeb4 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -15,6 +15,7 @@ import random from .._ml import create_default_optimizer from ..util import use_gpu as set_gpu +from ..errors import Errors from ..gold import GoldCorpus from ..compat import path2str from ..lookups import Lookups @@ -182,6 +183,7 @@ def train( msg.warn("Unable to activate GPU: {}".format(use_gpu)) msg.text("Using CPU only") use_gpu = -1 + base_components = [] if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) @@ -227,6 +229,7 @@ def train( exits=1, ) msg.text("Extending component from base model '{}'".format(pipe)) + base_components.append(pipe) disabled_pipes = nlp.disable_pipes( [p for p in nlp.pipe_names if p not in pipeline] ) @@ -299,7 +302,7 @@ def train( # Load in pretrained weights if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) + components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # Verify textcat config @@ -642,7 +645,7 @@ def _load_vectors(nlp, vectors): util.load_model(vectors, vocab=nlp.vocab) -def _load_pretrained_tok2vec(nlp, loc): +def _load_pretrained_tok2vec(nlp, loc, base_components): """Load pretrained weights for the 'token-to-vector' part of the component models, which is typically a CNN. See 'spacy pretrain'. Experimental. 
""" @@ -651,6 +654,8 @@ def _load_pretrained_tok2vec(nlp, loc): loaded = [] for name, component in nlp.pipeline: if hasattr(component, "model") and hasattr(component.model, "tok2vec"): + if name in base_components: + raise ValueError(Errors.E200.format(component=name)) component.tok2vec.from_bytes(weights_data) loaded.append(name) return loaded diff --git a/spacy/errors.py b/spacy/errors.py index 6d92545d7..a25661a20 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -92,9 +92,9 @@ class Warnings(object): W022 = ("Training a new part-of-speech tagger using a model with no " "lemmatization rules or data. This means that the trained model " "may not be able to lemmatize correctly. If this is intentional " - "or the language you're using doesn't have lemmatization data. " - "If this is surprising, make sure you have the spacy-lookups-data " - "package installed.") + "or the language you're using doesn't have lemmatization data, " + "please ignore this warning. If this is surprising, make sure you " + "have the spacy-lookups-data package installed.") W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " @@ -115,6 +115,25 @@ class Warnings(object): "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" " to check the alignment. Misaligned entities ('-') will be " "ignored during training.") + W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and " + "is incompatible with the current spaCy version ({current}). This " + "may lead to unexpected results or runtime errors. To resolve " + "this, download a newer compatible model or retrain your custom " + "model with the current spaCy version. For more details and " + "available updates, run: python -m spacy validate") + W032 = ("Unable to determine model compatibility for model '{model}' " + "({model_version}) with the current spaCy version ({current}). " + "This may lead to unexpected results or runtime errors. To resolve " + "this, download a newer compatible model or retrain your custom " + "model with the current spaCy version. For more details and " + "available updates, run: python -m spacy validate") + W033 = ("Training a new {model} using a model with no lexeme normalization " + "table. This may degrade the performance of the model to some " + "degree. If this is intentional or the language you're using " + "doesn't have a normalization table, please ignore this warning. " + "If this is surprising, make sure you have the spacy-lookups-data " + "package installed. The languages with lexeme normalization tables " + "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") @add_codes @@ -568,6 +587,8 @@ class Errors(object): E198 = ("Unable to return {n} most similar vectors for the current vectors " "table, which contains {n_rows} vectors.") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") + E200 = ("Specifying a base model with a pretrained component '{component}' " + "can not be combined with adding a pretrained Tok2Vec layer.") @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index cf67a2ac7..e69ff5933 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -640,6 +640,7 @@ cdef class GoldParse: representing the external IDs in a knowledge base (KB) mapped to either 1.0 or 0.0, indicating positive and negative examples respectively. + make_projective (bool): Whether to projectivize the dependency tree. RETURNS (GoldParse): The newly constructed object. 
""" self.mem = Pool() diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 6a553052b..964a714ae 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -139,7 +139,7 @@ for pron in ["he", "she", "it"]: # W-words, relative pronouns, prepositions etc. -for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: +for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]: for orth in [word, word.title()]: _exc[orth + "'s"] = [ {ORTH: orth, LEMMA: word, NORM: word}, @@ -399,6 +399,14 @@ _other_exc = { {ORTH: "Let", LEMMA: "let", NORM: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}, ], + "c'mon": [ + {ORTH: "c'm", NORM: "come", LEMMA: "come"}, + {ORTH: "on"} + ], + "C'mon": [ + {ORTH: "C'm", NORM: "come", LEMMA: "come"}, + {ORTH: "on"} + ] } _exc.update(_other_exc) diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py index 0e31b56af..7ab0a7dfe 100644 --- a/spacy/lang/es/examples.py +++ b/spacy/lang/es/examples.py @@ -18,5 +18,9 @@ sentences = [ "El gato come pescado.", "Veo al hombre con el telescopio.", "La araña come moscas.", - "El pingüino incuba en su nido.", + "El pingüino incuba en su nido sobre el hielo.", + "¿Dónde estais?", + "¿Quién es el presidente Francés?", + "¿Dónde está encuentra la capital de Argentina?", + "¿Cuándo nació José de San Martín?", ] diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 2c2631086..891323705 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA -_exc = { - "pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}], - "pala": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "la", LEMMA: "la", NORM: "la"}], -} +_exc = {} for exc_data in [ + {ORTH: "n°", LEMMA: "número"}, + {ORTH: "°C", LEMMA: "grados Celcius"}, {ORTH: "aprox.", LEMMA: "aproximadamente"}, {ORTH: "dna.", LEMMA: "docena"}, + {ORTH: "dpto.", LEMMA: "departamento"}, + {ORTH: "ej.", LEMMA: "ejemplo"}, {ORTH: "esq.", LEMMA: "esquina"}, {ORTH: "pág.", LEMMA: "página"}, {ORTH: "p.ej.", LEMMA: "por ejemplo"}, @@ -20,6 +21,8 @@ for exc_data in [ {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, + {ORTH: "vol.", NORM: "volúmen"}, + ]: _exc[exc_data[ORTH]] = [exc_data] @@ -39,10 +42,14 @@ for h in range(1, 12 + 1): for orth in [ "a.C.", "a.J.C.", + "d.C.", + "d.J.C.", "apdo.", "Av.", "Avda.", "Cía.", + "Dr.", + "Dra.", "EE.UU.", "etc.", "fig.", @@ -58,8 +65,10 @@ for orth in [ "Prof.", "Profa.", "q.e.p.d.", + "Q.E.P.D." "S.A.", "S.L.", + "S.R.L." 
"s.s.s.", "Sr.", "Sra.", diff --git a/spacy/lang/fr/_tokenizer_exceptions_list.py b/spacy/lang/fr/_tokenizer_exceptions_list.py index c9fcfff2d..0fcf02351 100644 --- a/spacy/lang/fr/_tokenizer_exceptions_list.py +++ b/spacy/lang/fr/_tokenizer_exceptions_list.py @@ -534,7 +534,6 @@ FR_BASE_EXCEPTIONS = [ "Beaumont-Hamel", "Beaumont-Louestault", "Beaumont-Monteux", - "Beaumont-Pied-de-Bœuf", "Beaumont-Pied-de-Bœuf", "Beaumont-Sardolles", "Beaumont-Village", @@ -951,7 +950,7 @@ FR_BASE_EXCEPTIONS = [ "Buxières-sous-les-Côtes", "Buzy-Darmont", "Byhleguhre-Byhlen", - "Bœurs-en-Othe", + "Bœurs-en-Othe", "Bâle-Campagne", "Bâle-Ville", "Béard-Géovreissiat", @@ -1589,11 +1588,11 @@ FR_BASE_EXCEPTIONS = [ "Cruci-Falgardiens", "Cruquius-Oost", "Cruviers-Lascours", - "Crèvecœur-en-Auge", - "Crèvecœur-en-Brie", - "Crèvecœur-le-Grand", - "Crèvecœur-le-Petit", - "Crèvecœur-sur-l'Escaut", + "Crèvecœur-en-Auge", + "Crèvecœur-en-Brie", + "Crèvecœur-le-Grand", + "Crèvecœur-le-Petit", + "Crèvecœur-sur-l'Escaut", "Crécy-Couvé", "Créon-d'Armagnac", "Cubjac-Auvézère-Val-d'Ans", @@ -1619,7 +1618,7 @@ FR_BASE_EXCEPTIONS = [ "Cuxac-Cabardès", "Cuxac-d'Aude", "Cuyk-Sainte-Agathe", - "Cœuvres-et-Valsery", + "Cœuvres-et-Valsery", "Céaux-d'Allègre", "Céleste-Empire", "Cénac-et-Saint-Julien", @@ -1682,7 +1681,7 @@ FR_BASE_EXCEPTIONS = [ "Devrai-Gondragnières", "Dhuys et Morin-en-Brie", "Diane-Capelle", - "Dieffenbach-lès-Wœrth", + "Dieffenbach-lès-Wœrth", "Diekhusen-Fahrstedt", "Diennes-Aubigny", "Diensdorf-Radlow", @@ -1755,7 +1754,7 @@ FR_BASE_EXCEPTIONS = [ "Durdat-Larequille", "Durfort-Lacapelette", "Durfort-et-Saint-Martin-de-Sossenac", - "Dœuil-sur-le-Mignon", + "Dœuil-sur-le-Mignon", "Dão-Lafões", "Débats-Rivière-d'Orpra", "Décines-Charpieu", @@ -2690,8 +2689,8 @@ FR_BASE_EXCEPTIONS = [ "Kuhlen-Wendorf", "KwaZulu-Natal", "Kyzyl-Arvat", - "Kœur-la-Grande", - "Kœur-la-Petite", + "Kœur-la-Grande", + "Kœur-la-Petite", "Kölln-Reisiek", "Königsbach-Stein", "Königshain-Wiederau", @@ -4027,7 +4026,7 @@ FR_BASE_EXCEPTIONS = [ "Marcilly-d'Azergues", "Marcillé-Raoul", "Marcillé-Robert", - "Marcq-en-Barœul", + "Marcq-en-Barœul", "Marcy-l'Etoile", "Marcy-l'Étoile", "Mareil-Marly", @@ -4261,7 +4260,7 @@ FR_BASE_EXCEPTIONS = [ "Monlezun-d'Armagnac", "Monléon-Magnoac", "Monnetier-Mornex", - "Mons-en-Barœul", + "Mons-en-Barœul", "Monsempron-Libos", "Monsteroux-Milieu", "Montacher-Villegardin", @@ -4351,7 +4350,7 @@ FR_BASE_EXCEPTIONS = [ "Mornay-Berry", "Mortain-Bocage", "Morteaux-Couliboeuf", - "Morteaux-Coulibœuf", + "Morteaux-Coulibœuf", "Morteaux-Coulibœuf", "Mortes-Frontières", "Mory-Montcrux", @@ -4394,7 +4393,7 @@ FR_BASE_EXCEPTIONS = [ "Muncq-Nieurlet", "Murtin-Bogny", "Murtin-et-le-Châtelet", - "Mœurs-Verdey", + "Mœurs-Verdey", "Ménestérol-Montignac", "Ménil'muche", "Ménil-Annelles", @@ -4615,7 +4614,7 @@ FR_BASE_EXCEPTIONS = [ "Neuves-Maisons", "Neuvic-Entier", "Neuvicq-Montguyon", - "Neuville-lès-Lœuilly", + "Neuville-lès-Lœuilly", "Neuvy-Bouin", "Neuvy-Deux-Clochers", "Neuvy-Grandchamp", @@ -4776,8 +4775,8 @@ FR_BASE_EXCEPTIONS = [ "Nuncq-Hautecôte", "Nurieux-Volognat", "Nuthe-Urstromtal", - "Nœux-les-Mines", - "Nœux-lès-Auxi", + "Nœux-les-Mines", + "Nœux-lès-Auxi", "Nâves-Parmelan", "Nézignan-l'Evêque", "Nézignan-l'Évêque", @@ -5346,7 +5345,7 @@ FR_BASE_EXCEPTIONS = [ "Quincy-Voisins", "Quincy-sous-le-Mont", "Quint-Fonsegrives", - "Quœux-Haut-Maînil", + "Quœux-Haut-Maînil", "Quœux-Haut-Maînil", "Qwa-Qwa", "R.-V.", @@ -5634,12 +5633,12 @@ FR_BASE_EXCEPTIONS = [ "Saint Aulaye-Puymangou", "Saint Geniez d'Olt et 
d'Aubrac", "Saint Martin de l'If", - "Saint-Denœux", - "Saint-Jean-de-Bœuf", - "Saint-Martin-le-Nœud", - "Saint-Michel-Tubœuf", + "Saint-Denœux", + "Saint-Jean-de-Bœuf", + "Saint-Martin-le-Nœud", + "Saint-Michel-Tubœuf", "Saint-Paul - Flaugnac", - "Saint-Pierre-de-Bœuf", + "Saint-Pierre-de-Bœuf", "Saint-Thegonnec Loc-Eguiner", "Sainte-Alvère-Saint-Laurent Les Bâtons", "Salignac-Eyvignes", @@ -6211,7 +6210,7 @@ FR_BASE_EXCEPTIONS = [ "Tite-Live", "Titisee-Neustadt", "Tobel-Tägerschen", - "Togny-aux-Bœufs", + "Togny-aux-Bœufs", "Tongre-Notre-Dame", "Tonnay-Boutonne", "Tonnay-Charente", @@ -6339,7 +6338,7 @@ FR_BASE_EXCEPTIONS = [ "Vals-près-le-Puy", "Valverde-Enrique", "Valzin-en-Petite-Montagne", - "Vandœuvre-lès-Nancy", + "Vandœuvre-lès-Nancy", "Varces-Allières-et-Risset", "Varenne-l'Arconce", "Varenne-sur-le-Doubs", @@ -6460,9 +6459,9 @@ FR_BASE_EXCEPTIONS = [ "Villenave-d'Ornon", "Villequier-Aumont", "Villerouge-Termenès", - "Villers-aux-Nœuds", + "Villers-aux-Nœuds", "Villez-sur-le-Neubourg", - "Villiers-en-Désœuvre", + "Villiers-en-Désœuvre", "Villieu-Loyes-Mollon", "Villingen-Schwenningen", "Villié-Morgon", @@ -6470,7 +6469,7 @@ FR_BASE_EXCEPTIONS = [ "Vilosnes-Haraumont", "Vilters-Wangs", "Vincent-Froideville", - "Vincy-Manœuvre", + "Vincy-Manœuvre", "Vincy-Manœuvre", "Vincy-Reuil-et-Magny", "Vindrac-Alayrac", @@ -6514,8 +6513,8 @@ FR_BASE_EXCEPTIONS = [ "Vrigne-Meusiens", "Vrijhoeve-Capelle", "Vuisternens-devant-Romont", - "Vœlfling-lès-Bouzonville", - "Vœuil-et-Giget", + "Vœlfling-lès-Bouzonville", + "Vœuil-et-Giget", "Vélez-Blanco", "Vélez-Málaga", "Vélez-Rubio", @@ -6618,7 +6617,7 @@ FR_BASE_EXCEPTIONS = [ "Wust-Fischbeck", "Wutha-Farnroda", "Wy-dit-Joli-Village", - "Wœlfling-lès-Sarreguemines", + "Wœlfling-lès-Sarreguemines", "Wünnewil-Flamatt", "X-SAMPA", "X-arbre", diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 4eb4c1568..933607bdf 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .punctuation import ELISION, HYPHENS -from ..tokenizer_exceptions import URL_PATTERN from ..char_classes import ALPHA_LOWER, ALPHA from ...symbols import ORTH, LEMMA @@ -455,9 +454,6 @@ _regular_exp += [ for hc in _hyphen_combination ] -# URLs -_regular_exp.append(URL_PATTERN) - TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index bc043486f..a010bb7ae 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "") _currency = r"\$¢£€¥฿" _quotes = CONCAT_QUOTES.replace("'", "") -_units = UNITS.replace("%", "") _prefixes = ( LIST_PUNCT @@ -21,7 +20,8 @@ _prefixes = ( ) _suffixes = ( - LIST_PUNCT + [r"\+"] + + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + [_concat_icons] @@ -29,7 +29,7 @@ _suffixes = ( r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:[{c}])".format(c=_currency), - r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[{al}{e}{q}(?:{c})])\.".format( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency ), diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index c18a2cec2..d328baa22 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from ..punctuation import ALPHA_LOWER, CURRENCY 
-from ..tokenizer_exceptions import URL_PATTERN from ...symbols import ORTH @@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format( TOKENIZER_EXCEPTIONS = _exc -TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match +TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 22590043f..a7ad0846e 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,114 +1,279 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -import re -from collections import namedtuple +import srsly +from collections import namedtuple, OrderedDict from .stop_words import STOP_WORDS +from .syntax_iterators import SYNTAX_ITERATORS from .tag_map import TAG_MAP +from .tag_orth_map import TAG_ORTH_MAP +from .tag_bigram_map import TAG_BIGRAM_MAP from ...attrs import LANG -from ...language import Language -from ...tokens import Doc from ...compat import copy_reg +from ...errors import Errors +from ...language import Language +from ...symbols import POS +from ...tokens import Doc from ...util import DummyTokenizer +from ... import util + + +# Hold the attributes we need with convenient names +DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) # Handling for multiple spaces in a row is somewhat awkward, this simplifies # the flow by creating a dummy with the same interface. -DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"]) -DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"]) -DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" ")) +DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"]) +DummySpace = DummyNode(" ", " ", " ") -def try_fugashi_import(): - """Fugashi is required for Japanese support, so check for it. - It it's not available blow up and explain how to fix it.""" +def try_sudachi_import(split_mode="A"): + """SudachiPy is required for Japanese support, so check for it. + It it's not available blow up and explain how to fix it. + split_mode should be one of these values: "A", "B", "C", None->"A".""" try: - import fugashi - - return fugashi + from sudachipy import dictionary, tokenizer + split_mode = { + None: tokenizer.Tokenizer.SplitMode.A, + "A": tokenizer.Tokenizer.SplitMode.A, + "B": tokenizer.Tokenizer.SplitMode.B, + "C": tokenizer.Tokenizer.SplitMode.C, + }[split_mode] + tok = dictionary.Dictionary().create( + mode=split_mode + ) + return tok except ImportError: raise ImportError( - "Japanese support requires Fugashi: " "https://github.com/polm/fugashi" + "Japanese support requires SudachiPy and SudachiDict-core " + "(https://github.com/WorksApplications/SudachiPy). " + "Install with `pip install sudachipy sudachidict_core` or " + "install spaCy with `pip install spacy[ja]`." ) -def resolve_pos(token): +def resolve_pos(orth, pos, next_pos): """If necessary, add a field to the POS tag for UD mapping. Under Universal Dependencies, sometimes the same Unidic POS tag can be mapped differently depending on the literal token or its context - in the sentence. This function adds information to the POS tag to - resolve ambiguous mappings. + in the sentence. This function returns resolved POSs for both token + and next_token by tuple. """ - # this is only used for consecutive ascii spaces - if token.surface == " ": - return "空白" + # Some tokens have their UD tag decided based on the POS of the following + # token. - # TODO: This is a first take. The rules here are crude approximations. 
- # For many of these, full dependencies are needed to properly resolve - # PoS mappings. - if token.pos == "連体詞,*,*,*": - if re.match(r"[こそあど此其彼]の", token.surface): - return token.pos + ",DET" - if re.match(r"[こそあど此其彼]", token.surface): - return token.pos + ",PRON" - return token.pos + ",ADJ" - return token.pos + # orth based rules + if pos[0] in TAG_ORTH_MAP: + orth_map = TAG_ORTH_MAP[pos[0]] + if orth in orth_map: + return orth_map[orth], None + + # tag bi-gram mapping + if next_pos: + tag_bigram = pos[0], next_pos[0] + if tag_bigram in TAG_BIGRAM_MAP: + bipos = TAG_BIGRAM_MAP[tag_bigram] + if bipos[0] is None: + return TAG_MAP[pos[0]][POS], bipos[1] + else: + return bipos + + return TAG_MAP[pos[0]][POS], None -def get_words_and_spaces(tokenizer, text): - """Get the individual tokens that make up the sentence and handle white space. +# Use a mapping of paired punctuation to avoid splitting quoted sentences. +pairpunct = {'「':'」', '『': '』', '【': '】'} - Japanese doesn't usually use white space, and MeCab's handling of it for - multiple spaces in a row is somewhat awkward. + +def separate_sentences(doc): + """Given a doc, mark tokens that start sentences based on Unidic tags. """ - tokens = tokenizer.parseToNodeList(text) + stack = [] # save paired punctuation + for i, token in enumerate(doc[:-2]): + # Set all tokens after the first to false by default. This is necessary + # for the doc code to be aware we've done sentencization, see + # `is_sentenced`. + token.sent_start = (i == 0) + if token.tag_: + if token.tag_ == "補助記号-括弧開": + ts = str(token) + if ts in pairpunct: + stack.append(pairpunct[ts]) + elif stack and ts == stack[-1]: + stack.pop() + + if token.tag_ == "補助記号-句点": + next_token = doc[i+1] + if next_token.tag_ != token.tag_ and not stack: + next_token.sent_start = True + + +def get_dtokens(tokenizer, text): + tokens = tokenizer.tokenize(text) words = [] - spaces = [] - for token in tokens: - # If there's more than one space, spaces after the first become tokens - for ii in range(len(token.white_space) - 1): - words.append(DummySpace) - spaces.append(False) + for ti, token in enumerate(tokens): + tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']) + inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*']) + dtoken = DetailedToken( + token.surface(), + (tag, inf), + token.dictionary_form()) + if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白': + # don't add multiple space tokens in a row + continue + words.append(dtoken) - words.append(token) - spaces.append(bool(token.white_space)) - return words, spaces + # remove empty tokens. These can be produced with characters like … that + # Sudachi normalizes internally. 
+ words = [ww for ww in words if len(ww.surface) > 0] + return words + + +def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): + words = [x.surface for x in dtokens] + if "".join("".join(words).split()) != "".join(text.split()): + raise ValueError(Errors.E194.format(text=text, words=words)) + text_words = [] + text_lemmas = [] + text_tags = [] + text_spaces = [] + text_pos = 0 + # handle empty and whitespace-only texts + if len(words) == 0: + return text_words, text_lemmas, text_tags, text_spaces + elif len([word for word in words if not word.isspace()]) == 0: + assert text.isspace() + text_words = [text] + text_lemmas = [text] + text_tags = [gap_tag] + text_spaces = [False] + return text_words, text_lemmas, text_tags, text_spaces + # normalize words to remove all whitespace tokens + norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()]) + # align words with text + for word, dtoken in zip(norm_words, norm_dtokens): + try: + word_start = text[text_pos:].index(word) + except ValueError: + raise ValueError(Errors.E194.format(text=text, words=words)) + if word_start > 0: + w = text[text_pos:text_pos + word_start] + text_words.append(w) + text_lemmas.append(w) + text_tags.append(gap_tag) + text_spaces.append(False) + text_pos += word_start + text_words.append(word) + text_lemmas.append(dtoken.lemma) + text_tags.append(dtoken.pos) + text_spaces.append(False) + text_pos += len(word) + if text_pos < len(text) and text[text_pos] == " ": + text_spaces[-1] = True + text_pos += 1 + if text_pos < len(text): + w = text[text_pos:] + text_words.append(w) + text_lemmas.append(w) + text_tags.append(gap_tag) + text_spaces.append(False) + return text_words, text_lemmas, text_tags, text_spaces class JapaneseTokenizer(DummyTokenizer): - def __init__(self, cls, nlp=None): + def __init__(self, cls, nlp=None, config={}): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.tokenizer = try_fugashi_import().Tagger() - self.tokenizer.parseToNodeList("") # see #2901 + self.split_mode = config.get("split_mode", None) + self.tokenizer = try_sudachi_import(self.split_mode) def __call__(self, text): - dtokens, spaces = get_words_and_spaces(self.tokenizer, text) - words = [x.surface for x in dtokens] + dtokens = get_dtokens(self.tokenizer, text) + + words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text) doc = Doc(self.vocab, words=words, spaces=spaces) - unidic_tags = [] - for token, dtoken in zip(doc, dtokens): - unidic_tags.append(dtoken.pos) - token.tag_ = resolve_pos(dtoken) + next_pos = None + for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)): + token.tag_ = unidic_tag[0] + if next_pos: + token.pos = next_pos + next_pos = None + else: + token.pos, next_pos = resolve_pos( + token.orth_, + unidic_tag, + unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None + ) # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = dtoken.feature.lemma or dtoken.surface + token.lemma_ = lemma doc.user_data["unidic_tags"] = unidic_tags + return doc + def _get_config(self): + config = OrderedDict( + ( + ("split_mode", self.split_mode), + ) + ) + return config + + def _set_config(self, config={}): + self.split_mode = config.get("split_mode", None) + + def to_bytes(self, **kwargs): + serializers = OrderedDict( + ( + ("cfg", lambda: srsly.json_dumps(self._get_config())), + ) + ) + return util.to_bytes(serializers, []) + + def from_bytes(self, data, **kwargs): + 
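# Illustrative sketch (not part of the patch): the observable effect of the alignment
# in get_words_lemmas_tags_spaces(), mirroring the tests further down in this diff.
# A single trailing space becomes token.whitespace_; surplus spaces become 空白 tokens.
from spacy.lang.ja import Japanese

nlp = Japanese()
doc = nlp("I   like cheese.")       # three spaces after "I"
assert doc[1].text == "  "          # the two surplus spaces form one gap token
assert len(nlp(" ")) == 1           # whitespace-only text yields a single token
assert len(nlp("")) == 0            # empty text yields an empty Doc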
deserializers = OrderedDict( + ( + ("cfg", lambda b: self._set_config(srsly.json_loads(b))), + ) + ) + util.from_bytes(data, deserializers, []) + self.tokenizer = try_sudachi_import(self.split_mode) + return self + + def to_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = OrderedDict( + ( + ("cfg", lambda p: srsly.write_json(p, self._get_config())), + ) + ) + return util.to_disk(path, serializers, []) + + def from_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = OrderedDict( + ( + ("cfg", lambda p: self._set_config(srsly.read_json(p))), + ) + ) + util.from_disk(path, serializers, []) + self.tokenizer = try_sudachi_import(self.split_mode) + class JapaneseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda _text: "ja" stop_words = STOP_WORDS tag_map = TAG_MAP + syntax_iterators = SYNTAX_ITERATORS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} @classmethod - def create_tokenizer(cls, nlp=None): - return JapaneseTokenizer(cls, nlp) + def create_tokenizer(cls, nlp=None, config={}): + return JapaneseTokenizer(cls, nlp, config) class Japanese(Language): diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py new file mode 100644 index 000000000..7c3eee336 --- /dev/null +++ b/spacy/lang/ja/bunsetu.py @@ -0,0 +1,144 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + + +POS_PHRASE_MAP = { + "NOUN": "NP", + "NUM": "NP", + "PRON": "NP", + "PROPN": "NP", + + "VERB": "VP", + + "ADJ": "ADJP", + + "ADV": "ADVP", + + "CCONJ": "CCONJP", +} + + +# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] +def yield_bunsetu(doc, debug=False): + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + prev = None + prev_tag = None + prev_dep = None + prev_head = None + for t in doc: + pos = t.pos_ + pos_type = POS_PHRASE_MAP.get(pos, None) + tag = t.tag_ + dep = t.dep_ + head = t.head.i + if debug: + print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) + + # DET is always an individual bunsetu + if pos == "DET": + if bunsetu: + yield bunsetu, phrase_type, phrase + yield [t], None, None + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + + # PRON or Open PUNCT always splits bunsetu + elif tag == "補助記号-括弧開": + if bunsetu: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = True + phrase_type = None + phrase = None + + # bunsetu head not appeared + elif phrase_type is None: + if bunsetu and prev_tag == "補助記号-読点": + yield bunsetu, phrase_type, phrase + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + bunsetu.append(t) + if pos_type: # begin phrase + phrase = [t] + phrase_type = pos_type + if pos_type in {"ADVP", "CCONJP"}: + bunsetu_may_end = True + + # entering new bunsetu + elif pos_type and ( + pos_type != phrase_type or # different phrase type arises + bunsetu_may_end # same phrase type but bunsetu already ended + ): + # exceptional case: NOUN to VERB + if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: + bunsetu.append(t) + phrase_type = "VP" + phrase.append(t) + # exceptional case: VERB to NOUN + elif phrase_type == "VP" and pos_type == "NP" and ( + prev_dep == 'compound' and prev_head == t.i or + dep == 'compound' and prev == head or + prev_dep == 'nmod' and prev_head == t.i + ): + 
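# Illustrative sketch (not part of the patch): split_mode is picked up from the
# tokenizer config in meta and survives serialization through the cfg blob written
# above, as the serialization test later in this diff checks.
from spacy.lang.ja import Japanese

nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
assert nlp.tokenizer.split_mode == "B"

nlp_reloaded = Japanese()
nlp_reloaded.from_bytes(nlp.to_bytes())
assert nlp_reloaded.tokenizer.split_mode == "B"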
bunsetu.append(t) + phrase_type = "NP" + phrase.append(t) + else: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = False + phrase_type = pos_type + phrase = [t] + + # NOUN bunsetu + elif phrase_type == "NP": + bunsetu.append(t) + if not bunsetu_may_end and (( + (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # VERB bunsetu + elif phrase_type == "VP": + bunsetu.append(t) + if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': + phrase.append(t) + else: + bunsetu_may_end = True + + # ADJ bunsetu + elif phrase_type == "ADJP" and tag != '連体詞': + bunsetu.append(t) + if not bunsetu_may_end and (( + pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # other bunsetu + else: + bunsetu.append(t) + + prev = t.i + prev_tag = t.tag_ + prev_dep = t.dep_ + prev_head = head + + if bunsetu: + yield bunsetu, phrase_type, phrase diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py new file mode 100644 index 000000000..cd1e4fde7 --- /dev/null +++ b/spacy/lang/ja/syntax_iterators.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import NOUN, PROPN, PRON, VERB + +# XXX this can probably be pruned a bit +labels = [ + "nsubj", + "nmod", + "dobj", + "nsubjpass", + "pcomp", + "pobj", + "obj", + "obl", + "dative", + "appos", + "attr", + "ROOT", +] + +def noun_chunks(obj): + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + + doc = obj.doc # Ensure works on both Doc and Span. + np_deps = [doc.vocab.strings.add(label) for label in labels] + conj = doc.vocab.strings.add("conj") + np_label = doc.vocab.strings.add("NP") + seen = set() + for i, word in enumerate(obj): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.i in seen: + continue + if word.dep in np_deps: + unseen = [w.i for w in word.subtree if w.i not in seen] + if not unseen: + continue + + # this takes care of particles etc. + seen.update(j.i for j in word.subtree) + # This avoids duplicating embedded clauses + seen.update(range(word.i + 1)) + + # if the head of this is a verb, mark that and rights seen + # Don't do the subtree as that can hide other phrases + if word.head.pos == VERB: + seen.add(word.head.i) + seen.update(w.i for w in word.head.rights) + yield unseen[0], word.i + 1, np_label + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ja/tag_bigram_map.py b/spacy/lang/ja/tag_bigram_map.py new file mode 100644 index 000000000..5ed9aec89 --- /dev/null +++ b/spacy/lang/ja/tag_bigram_map.py @@ -0,0 +1,37 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB + +# mapping from tag bi-gram to pos of previous token +TAG_BIGRAM_MAP = { + # This covers only small part of AUX. + ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None), + + ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None), + # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ), + + # This covers acl, advcl, obl and root, but has side effect for compound. 
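# Illustrative sketch (not part of the patch): with SYNTAX_ITERATORS registered,
# doc.noun_chunks and yield_bunsetu() work once POS tags and a dependency parse are
# present. A pretrained Japanese pipeline (e.g. ja_core_news_sm, not part of this
# patch) is assumed here; the exact spans depend on the parse.
import spacy
from spacy.lang.ja.bunsetu import yield_bunsetu

nlp = spacy.load("ja_core_news_sm")
doc = nlp("東京タワーの近くに住んでいます。")
print([chunk.text for chunk in doc.noun_chunks])
for tokens, phrase_type, phrase in yield_bunsetu(doc):
    print(phrase_type, "".join(t.orth_ for t in tokens))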
+ ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX), + # This covers almost all of the deps + ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX), + + ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB), + ("副詞", "動詞-非自立可能"): (None, VERB), + ("形容詞-一般", "動詞-非自立可能"): (None, VERB), + ("形容詞-非自立可能", "動詞-非自立可能"): (None, VERB), + ("接頭辞", "動詞-非自立可能"): (None, VERB), + ("助詞-係助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-副助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-格助詞", "動詞-非自立可能"): (None, VERB), + ("補助記号-読点", "動詞-非自立可能"): (None, VERB), + + ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART), + + ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN), + ("連体詞", "形状詞-助動詞語幹"): (None, NOUN), + + ("動詞-一般", "助詞-副助詞"): (None, PART), + ("動詞-非自立可能", "助詞-副助詞"): (None, PART), + ("助動詞", "助詞-副助詞"): (None, PART), +} diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py index 4ff0a35ee..ad416e109 100644 --- a/spacy/lang/ja/tag_map.py +++ b/spacy/lang/ja/tag_map.py @@ -1,82 +1,104 @@ # encoding: utf8 from __future__ import unicode_literals -from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN +from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, CCONJ, SCONJ, NOUN from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE TAG_MAP = { # Explanation of Unidic tags: # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf - # Universal Dependencies Mapping: + # Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below) # http://universaldependencies.org/ja/overview/morphology.html # http://universaldependencies.org/ja/pos/all.html - "記号,一般,*,*": { - POS: PUNCT + "記号-一般": { + POS: NOUN }, # this includes characters used to represent sounds like ドレミ - "記号,文字,*,*": { - POS: PUNCT - }, # this is for Greek and Latin characters used as sumbols, as in math - "感動詞,フィラー,*,*": {POS: INTJ}, - "感動詞,一般,*,*": {POS: INTJ}, - # this is specifically for unicode full-width space - "空白,*,*,*": {POS: X}, - # This is used when sequential half-width spaces are present + "記号-文字": { + POS: NOUN + }, # this is for Greek and Latin characters having some meanings, or used as symbols, as in math + "感動詞-フィラー": {POS: INTJ}, + "感動詞-一般": {POS: INTJ}, + "空白": {POS: SPACE}, - "形状詞,一般,*,*": {POS: ADJ}, - "形状詞,タリ,*,*": {POS: ADJ}, - "形状詞,助動詞語幹,*,*": {POS: ADJ}, - "形容詞,一般,*,*": {POS: ADJ}, - "形容詞,非自立可能,*,*": {POS: AUX}, # XXX ADJ if alone, AUX otherwise - "助詞,格助詞,*,*": {POS: ADP}, - "助詞,係助詞,*,*": {POS: ADP}, - "助詞,終助詞,*,*": {POS: PART}, - "助詞,準体助詞,*,*": {POS: SCONJ}, # の as in 走るのが速い - "助詞,接続助詞,*,*": {POS: SCONJ}, # verb ending て - "助詞,副助詞,*,*": {POS: PART}, # ばかり, つつ after a verb - "助動詞,*,*,*": {POS: AUX}, - "接続詞,*,*,*": {POS: SCONJ}, # XXX: might need refinement - "接頭辞,*,*,*": {POS: NOUN}, - "接尾辞,形状詞的,*,*": {POS: ADJ}, # がち, チック - "接尾辞,形容詞的,*,*": {POS: ADJ}, # -らしい - "接尾辞,動詞的,*,*": {POS: NOUN}, # -じみ - "接尾辞,名詞的,サ変可能,*": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* - "接尾辞,名詞的,一般,*": {POS: NOUN}, - "接尾辞,名詞的,助数詞,*": {POS: NOUN}, - "接尾辞,名詞的,副詞可能,*": {POS: NOUN}, # -後, -過ぎ - "代名詞,*,*,*": {POS: PRON}, - "動詞,一般,*,*": {POS: VERB}, - "動詞,非自立可能,*,*": {POS: VERB}, # XXX VERB if alone, AUX otherwise - "動詞,非自立可能,*,*,AUX": {POS: AUX}, - "動詞,非自立可能,*,*,VERB": {POS: VERB}, - "副詞,*,*,*": {POS: ADV}, - "補助記号,AA,一般,*": {POS: SYM}, # text art - "補助記号,AA,顔文字,*": {POS: SYM}, # kaomoji - "補助記号,一般,*,*": {POS: SYM}, - "補助記号,括弧開,*,*": {POS: PUNCT}, # open bracket - "補助記号,括弧閉,*,*": {POS: PUNCT}, # close bracket - "補助記号,句点,*,*": {POS: PUNCT}, # period or other EOS marker - "補助記号,読点,*,*": {POS: PUNCT}, # comma - "名詞,固有名詞,一般,*": 
{POS: PROPN}, # general proper noun - "名詞,固有名詞,人名,一般": {POS: PROPN}, # person's name - "名詞,固有名詞,人名,姓": {POS: PROPN}, # surname - "名詞,固有名詞,人名,名": {POS: PROPN}, # first name - "名詞,固有名詞,地名,一般": {POS: PROPN}, # place name - "名詞,固有名詞,地名,国": {POS: PROPN}, # country name - "名詞,助動詞語幹,*,*": {POS: AUX}, - "名詞,数詞,*,*": {POS: NUM}, # includes Chinese numerals - "名詞,普通名詞,サ変可能,*": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun - "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB}, - "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN}, # ex: 下手 - "名詞,普通名詞,一般,*": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 - "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ}, - "名詞,普通名詞,助数詞可能,*": {POS: NOUN}, # counter / unit - "名詞,普通名詞,副詞可能,*": {POS: NOUN}, - "連体詞,*,*,*": {POS: ADJ}, # XXX this has exceptions based on literal token - "連体詞,*,*,*,ADJ": {POS: ADJ}, - "連体詞,*,*,*,PRON": {POS: PRON}, - "連体詞,*,*,*,DET": {POS: DET}, + + "形状詞-一般": {POS: ADJ}, + "形状詞-タリ": {POS: ADJ}, + "形状詞-助動詞語幹": {POS: AUX}, + + "形容詞-一般": {POS: ADJ}, + + "形容詞-非自立可能": {POS: ADJ}, # XXX ADJ if alone, AUX otherwise + + "助詞-格助詞": {POS: ADP}, + + "助詞-係助詞": {POS: ADP}, + + "助詞-終助詞": {POS: PART}, + "助詞-準体助詞": {POS: SCONJ}, # の as in 走るのが速い + "助詞-接続助詞": {POS: SCONJ}, # verb ending て0 + + "助詞-副助詞": {POS: ADP}, # ばかり, つつ after a verb + + "助動詞": {POS: AUX}, + + "接続詞": {POS: CCONJ}, # XXX: might need refinement + "接頭辞": {POS: NOUN}, + "接尾辞-形状詞的": {POS: PART}, # がち, チック + + "接尾辞-形容詞的": {POS: AUX}, # -らしい + + "接尾辞-動詞的": {POS: PART}, # -じみ + "接尾辞-名詞的-サ変可能": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* + "接尾辞-名詞的-一般": {POS: NOUN}, + "接尾辞-名詞的-助数詞": {POS: NOUN}, + "接尾辞-名詞的-副詞可能": {POS: NOUN}, # -後, -過ぎ + + "代名詞": {POS: PRON}, + + "動詞-一般": {POS: VERB}, + + "動詞-非自立可能": {POS: AUX}, # XXX VERB if alone, AUX otherwise + + "副詞": {POS: ADV}, + + "補助記号-AA-一般": {POS: SYM}, # text art + "補助記号-AA-顔文字": {POS: PUNCT}, # kaomoji + + "補助記号-一般": {POS: SYM}, + + "補助記号-括弧開": {POS: PUNCT}, # open bracket + "補助記号-括弧閉": {POS: PUNCT}, # close bracket + "補助記号-句点": {POS: PUNCT}, # period or other EOS marker + "補助記号-読点": {POS: PUNCT}, # comma + + "名詞-固有名詞-一般": {POS: PROPN}, # general proper noun + "名詞-固有名詞-人名-一般": {POS: PROPN}, # person's name + "名詞-固有名詞-人名-姓": {POS: PROPN}, # surname + "名詞-固有名詞-人名-名": {POS: PROPN}, # first name + "名詞-固有名詞-地名-一般": {POS: PROPN}, # place name + "名詞-固有名詞-地名-国": {POS: PROPN}, # country name + + "名詞-助動詞語幹": {POS: AUX}, + "名詞-数詞": {POS: NUM}, # includes Chinese numerals + + "名詞-普通名詞-サ変可能": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + + "名詞-普通名詞-サ変形状詞可能": {POS: NOUN}, + + "名詞-普通名詞-一般": {POS: NOUN}, + + "名詞-普通名詞-形状詞可能": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 + + "名詞-普通名詞-助数詞可能": {POS: NOUN}, # counter / unit + + "名詞-普通名詞-副詞可能": {POS: NOUN}, + + "連体詞": {POS: DET}, # XXX this has exceptions based on literal token + + # GSD tags. These aren't in Unidic, but we need them for the GSD data. 
+ "外国語": {POS: PROPN}, # Foreign words + + "絵文字・記号等": {POS: SYM}, # emoji / kaomoji ^^; + } diff --git a/spacy/lang/ja/tag_orth_map.py b/spacy/lang/ja/tag_orth_map.py new file mode 100644 index 000000000..355cc655b --- /dev/null +++ b/spacy/lang/ja/tag_orth_map.py @@ -0,0 +1,30 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X + +# mapping from tag bi-gram to pos of previous token +TAG_ORTH_MAP = { + "空白": { + " ": SPACE, + " ": X, + }, + "助詞-副助詞": { + "たり": PART, + }, + "連体詞": { + "あの": DET, + "かの": DET, + "この": DET, + "その": DET, + "どの": DET, + "彼の": DET, + "此の": DET, + "其の": DET, + "ある": PRON, + "こんな": PRON, + "そんな": PRON, + "どんな": PRON, + "あらゆる": PRON, + }, +} diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index d0d843b2a..8b8d7fe27 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -6,98 +6,73 @@ from ...parts_of_speech import NAMES class PolishLemmatizer(Lemmatizer): - # This lemmatizer implements lookup lemmatization based on - # the Morfeusz dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS - # It utilizes some prefix based improvements for - # verb and adjectives lemmatization, as well as case-sensitive - # lemmatization for nouns - def __init__(self, lookups, *args, **kwargs): - # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules - super(PolishLemmatizer, self).__init__(lookups) - self.lemma_lookups = {} - for tag in [ - "ADJ", - "ADP", - "ADV", - "AUX", - "NOUN", - "NUM", - "PART", - "PRON", - "VERB", - "X", - ]: - self.lemma_lookups[tag] = self.lookups.get_table( - "lemma_lookup_" + tag.lower(), {} - ) - self.lemma_lookups["DET"] = self.lemma_lookups["X"] - self.lemma_lookups["PROPN"] = self.lemma_lookups["NOUN"] - + # This lemmatizer implements lookup lemmatization based on the Morfeusz + # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS. + # It utilizes some prefix based improvements for verb and adjectives + # lemmatization, as well as case-sensitive lemmatization for nouns. 
def __call__(self, string, univ_pos, morphology=None): if isinstance(univ_pos, int): univ_pos = NAMES.get(univ_pos, "X") univ_pos = univ_pos.upper() + lookup_pos = univ_pos.lower() + if univ_pos == "PROPN": + lookup_pos = "noun" + lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {}) + if univ_pos == "NOUN": - return self.lemmatize_noun(string, morphology) + return self.lemmatize_noun(string, morphology, lookup_table) if univ_pos != "PROPN": string = string.lower() if univ_pos == "ADJ": - return self.lemmatize_adj(string, morphology) + return self.lemmatize_adj(string, morphology, lookup_table) elif univ_pos == "VERB": - return self.lemmatize_verb(string, morphology) + return self.lemmatize_verb(string, morphology, lookup_table) - lemma_dict = self.lemma_lookups.get(univ_pos, {}) - return [lemma_dict.get(string, string.lower())] + return [lookup_table.get(string, string.lower())] - def lemmatize_adj(self, string, morphology): + def lemmatize_adj(self, string, morphology, lookup_table): # this method utilizes different procedures for adjectives # with 'nie' and 'naj' prefixes - lemma_dict = self.lemma_lookups["ADJ"] - if string[:3] == "nie": search_string = string[3:] if search_string[:3] == "naj": naj_search_string = search_string[3:] - if naj_search_string in lemma_dict: - return [lemma_dict[naj_search_string]] - if search_string in lemma_dict: - return [lemma_dict[search_string]] + if naj_search_string in lookup_table: + return [lookup_table[naj_search_string]] + if search_string in lookup_table: + return [lookup_table[search_string]] if string[:3] == "naj": naj_search_string = string[3:] - if naj_search_string in lemma_dict: - return [lemma_dict[naj_search_string]] + if naj_search_string in lookup_table: + return [lookup_table[naj_search_string]] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] - def lemmatize_verb(self, string, morphology): + def lemmatize_verb(self, string, morphology, lookup_table): # this method utilizes a different procedure for verbs # with 'nie' prefix - lemma_dict = self.lemma_lookups["VERB"] - if string[:3] == "nie": search_string = string[3:] - if search_string in lemma_dict: - return [lemma_dict[search_string]] + if search_string in lookup_table: + return [lookup_table[search_string]] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] - def lemmatize_noun(self, string, morphology): + def lemmatize_noun(self, string, morphology, lookup_table): # this method is case-sensitive, in order to work # for incorrectly tagged proper names - lemma_dict = self.lemma_lookups["NOUN"] - if string != string.lower(): - if string.lower() in lemma_dict: - return [lemma_dict[string.lower()]] - elif string in lemma_dict: - return [lemma_dict[string]] + if string.lower() in lookup_table: + return [lookup_table[string.lower()]] + elif string in lookup_table: + return [lookup_table[string]] return [string.lower()] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] def lookup(self, string, orth=None): return string.lower() diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 3ce3c3544..c34e77129 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -18,4 +18,9 @@ sentences = [ "இந்த ஃபோனுடன் சுமார் ரூ.2,990 மதிப்புள்ள போட் ராக்கர்ஸ் நிறுவனத்தின் ஸ்போர்ட் புளூடூத் ஹெட்போன்ஸ் இலவசமாக வழங்கப்படவுள்ளது.", "மட்டக்களப்பில் பல இடங்களில் வீட்டுத் திட்டங்களுக்கு இன்று அடிக்கல் நாட்டல்", "ஐ போன்க்கு முகத்தை வைத்து அன்லாக் செய்யும் 
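# Illustrative sketch (not part of the patch): the "nie"/"naj" prefix handling in
# lemmatize_adj(), reproduced standalone. The lookup table below is a hypothetical
# stand-in for the real lemma_lookup_adj table provided by the language's Lookups.
def lemmatize_adj(string, lookup_table):
    if string[:3] == "nie":
        search_string = string[3:]
        if search_string[:3] == "naj":
            naj_search_string = search_string[3:]
            if naj_search_string in lookup_table:
                return [lookup_table[naj_search_string]]
        if search_string in lookup_table:
            return [lookup_table[search_string]]
    if string[:3] == "naj":
        naj_search_string = string[3:]
        if naj_search_string in lookup_table:
            return [lookup_table[naj_search_string]]
    return [lookup_table.get(string, string)]

toy_table = {"lepszy": "dobry"}                      # hypothetical entry
assert lemmatize_adj("najlepszy", toy_table) == ["dobry"]
assert lemmatize_adj("nienajlepszy", toy_table) == ["dobry"]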
முறை மற்றும் விரலால் தொட்டு அன்லாக் செய்யும் முறையை வாட்ஸ் ஆப் நிறுவனம் இதற்கு முன் கண்டுபிடித்தது", + "இது ஒரு வாக்கியம்.", + "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது", + "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன", + "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது", + "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்." ] diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 29ce75442..67349916b 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .char_classes import ALPHA_LOWER +from .char_classes import ALPHA_LOWER, ALPHA from ..symbols import ORTH, POS, TAG, LEMMA, SPACE @@ -58,7 +58,8 @@ URL_PATTERN = ( # fmt: on ).strip() -TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match +TOKEN_MATCH = None +URL_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py index 41e2d2158..f9b5389ac 100644 --- a/spacy/lang/zh/tag_map.py +++ b/spacy/lang/zh/tag_map.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X -from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE +from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE, PROPN # The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn # Treebank tag set. We also map the tags to the simpler Universal Dependencies @@ -28,7 +28,7 @@ TAG_MAP = { "URL": {POS: X}, "INF": {POS: X}, "NN": {POS: NOUN}, - "NR": {POS: NOUN}, + "NR": {POS: PROPN}, "NT": {POS: NOUN}, "VA": {POS: VERB}, "VC": {POS: VERB}, diff --git a/spacy/language.py b/spacy/language.py index d5bd879e9..2058def8a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG, NORM from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH +from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH from .lang.norm_exceptions import BASE_NORMS from .lang.tag_map import TAG_MAP from .tokens import Doc @@ -89,6 +89,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match + url_match = cls.url_match prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -106,10 +107,12 @@ class BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, + url_match=url_match, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH + url_match = URL_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) @@ -931,15 +934,26 @@ class Language(object): DOCS: https://spacy.io/api/language#from_disk """ + def deserialize_meta(path): + if path.exists(): + data = srsly.read_json(path) + self.meta.update(data) + # self.meta always overrides meta["vectors"] with the metadata + # from self.vocab.vectors, so set the name directly + self.vocab.vectors.name = data.get("vectors", {}).get("name") + + def deserialize_vocab(path): + if path.exists(): + self.vocab.from_disk(path) + _fix_pretrained_vectors_name(self) + if disable is not None: warnings.warn(Warnings.W014, 
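# Illustrative sketch (not part of the patch): URL handling now goes through the new
# url_match, which is applied only after prefixes and suffixes have been removed,
# while token_match keeps its old role (and now defaults to None). A blank English
# pipeline picks url_match up from the language defaults.
from spacy.lang.en import English

nlp = English()
assert nlp.tokenizer.url_match("https://example.com") is not None
assert [t.text for t in nlp.tokenizer("(https://example.com)")] == [
    "(", "https://example.com", ")"
]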
DeprecationWarning) exclude = disable path = util.ensure_path(path) deserializers = OrderedDict() - deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p)) - deserializers["vocab"] = lambda p: self.vocab.from_disk( - p - ) and _fix_pretrained_vectors_name(self) + deserializers["meta.json"] = deserialize_meta + deserializers["vocab"] = deserialize_vocab deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( p, exclude=["vocab"] ) @@ -993,14 +1007,23 @@ class Language(object): DOCS: https://spacy.io/api/language#from_bytes """ + def deserialize_meta(b): + data = srsly.json_loads(b) + self.meta.update(data) + # self.meta always overrides meta["vectors"] with the metadata + # from self.vocab.vectors, so set the name directly + self.vocab.vectors.name = data.get("vectors", {}).get("name") + + def deserialize_vocab(b): + self.vocab.from_bytes(b) + _fix_pretrained_vectors_name(self) + if disable is not None: warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable deserializers = OrderedDict() - deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) - deserializers["vocab"] = lambda b: self.vocab.from_bytes( - b - ) and _fix_pretrained_vectors_name(self) + deserializers["meta.json"] = deserialize_meta + deserializers["vocab"] = deserialize_vocab deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( b, exclude=["vocab"] ) @@ -1066,7 +1089,7 @@ class component(object): def _fix_pretrained_vectors_name(nlp): # TODO: Replace this once we handle vectors consistently as static # data - if "vectors" in nlp.meta and nlp.meta["vectors"].get("name"): + if "vectors" in nlp.meta and "name" in nlp.meta["vectors"]: nlp.vocab.vectors.name = nlp.meta["vectors"]["name"] elif not nlp.vocab.vectors.size: nlp.vocab.vectors.name = None diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index dec2993fa..1df516dcb 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -12,7 +12,6 @@ import numpy import warnings from thinc.neural.util import get_array_module -from libc.stdint cimport UINT64_MAX from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP @@ -23,7 +22,7 @@ from .attrs import intify_attrs from .errors import Errors, Warnings -OOV_RANK = UINT64_MAX +OOV_RANK = 0xffffffffffffffff # UINT64_MAX memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) EMPTY_LEXEME.id = OOV_RANK diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index b66ec35b8..00c3357f5 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -332,7 +332,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr): matcher = PhraseMatcher(vocab, attr=attr) for key, specs in docs.items(): callback = callbacks.get(key, None) - matcher.add(key, callback, *specs) + matcher.add(key, specs, on_match=callback) return matcher diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c146094a9..a9bab38ed 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -152,7 +152,10 @@ cdef class Morphology: self.tags = PreshMap() # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. 
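# Illustrative sketch (not part of the patch): with unpickle_matcher() using the v2.x
# add() signature, a PhraseMatcher round-trips through pickle together with its
# patterns and callbacks, mirroring the test added further down.
import srsly
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("TEST", [nlp.make_doc("test")], on_match=None)
matcher_reloaded = srsly.pickle_loads(srsly.pickle_dumps(matcher))
assert len(matcher_reloaded) == len(matcher) == 1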
- space_attrs = tag_map.get('SP', {POS: SPACE}) + if '_SP' in tag_map: + space_attrs = tag_map.get('_SP') + else: + space_attrs = tag_map.get('SP', {POS: SPACE}) if '_SP' not in tag_map: self.strings.add('_SP') tag_map = dict(tag_map) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index ccd847ef1..3f40cb545 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -516,6 +516,8 @@ class Tagger(Pipe): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): warnings.warn(Warnings.W022) + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for raw_text, annots_brackets in get_gold_tuples(): @@ -526,6 +528,8 @@ class Tagger(Pipe): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} + if "_SP" in orig_tag_map: + new_tag_map["_SP"] = orig_tag_map["_SP"] cdef Vocab vocab = self.vocab if new_tag_map: vocab.morphology = Morphology(vocab.strings, new_tag_map, @@ -1168,6 +1172,9 @@ class EntityLinker(Pipe): self.model = True self.kb = None self.cfg = dict(cfg) + + # how many neightbour sentences to take into account + self.n_sents = cfg.get("n_sents", 0) def set_kb(self, kb): self.kb = kb @@ -1216,6 +1223,9 @@ class EntityLinker(Pipe): for doc, gold in zip(docs, golds): ents_by_offset = dict() + + sentences = [s for s in doc.sents] + for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent @@ -1226,17 +1236,34 @@ class EntityLinker(Pipe): # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt if not (start, end) in ents_by_offset: raise RuntimeError(Errors.E188) + ent = ents_by_offset[(start, end)] for kb_id, value in kb_dict.items(): # Currently only training on the positive instances if value: try: - sentence_docs.append(ent.sent.as_doc()) + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly warning raise RuntimeError(Errors.E030) + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) + + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + # append that span as a doc to training + sent_doc = doc[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) + sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) bp_context(d_scores, sgd=sgd) @@ -1307,69 +1334,81 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] + for i, doc in enumerate(docs): + sentences = [s for s in doc.sents] + if len(doc) > 0: # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. 
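# Illustrative sketch (not part of the patch): the sentence window of the entity
# linker is controlled by an "n_sents" entry in the component config; 0 (the default)
# keeps the previous single-sentence behaviour.
from spacy.lang.en import English

nlp = English()
entity_linker = nlp.create_pipe("entity_linker", config={"n_sents": 2})
assert entity_linker.n_sents == 2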
- for sent in doc.sents: - sent_doc = sent.as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) + for sent_index, sent in enumerate(sentences): + if sent.ents: + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - for ent in sent_doc.ents: - entity_count += 1 + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) + sent_doc = doc[start_token:end_token].as_doc() - else: - candidates = self.kb.get_candidates(ent.text) - if not candidates: - # no prediction possible for this entity - setting to NIL + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model([sent_doc])[0] + xp = get_array_module(sentence_encoding) + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + + for ent in sent.ents: + entity_count += 1 + + to_discard = self.cfg.get("labels_discard", []) + if to_discard and ent.label_ in to_discard: + # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - final_tensors.append(sentence_encoding) - else: - random.shuffle(candidates) + candidates = self.kb.get_candidates(ent.text) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + final_tensors.append(sentence_encoding) - # this will set all prior probabilities to 0 if they should be excluded from the model - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.cfg.get("incl_prior", True): - prior_probs = xp.asarray([0.0 for c in candidates]) - scores = prior_probs + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # add in similarity from the context - if self.cfg.get("incl_context", True): - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + final_tensors.append(sentence_encoding) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + else: + random.shuffle(candidates) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs*sims) + # this will set all prior probabilities to 0 if they should be excluded from the model + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([0.0 for c in candidates]) + scores = prior_probs - # TODO: thresholding - best_index = scores.argmax() - best_candidate = candidates[best_index] - 
final_kb_ids.append(best_candidate.entity_) - final_tensors.append(sentence_encoding) + # add in similarity from the context + if self.cfg.get("incl_context", True): + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs*sims) + + # TODO: thresholding + best_index = scores.argmax() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + final_tensors.append(sentence_encoding) if not (len(final_tensors) == len(final_kb_ids) == entity_count): raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fafa492c6..6944e9113 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -9,6 +9,7 @@ import numpy cimport cython.parallel import numpy.random cimport numpy as np +from itertools import islice from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from libc.math cimport exp @@ -25,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec import srsly +import warnings from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid @@ -36,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, TempErrors +from ..errors import Errors, TempErrors, Warnings from .. 
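# Illustrative sketch (not part of the patch): the neighbour-sentence window used in
# update() and predict() above is an index range clipped to the document, sketched
# here standalone.
def sentence_window(sent_index, n_sents, n_sentences):
    start_sentence = max(0, sent_index - n_sents)
    end_sentence = min(n_sentences - 1, sent_index + n_sents)
    return start_sentence, end_sentence

assert sentence_window(0, 2, 10) == (0, 2)    # clipped at the start of the doc
assert sentence_window(9, 2, 10) == (7, 9)    # clipped at the end of the doc
assert sentence_window(5, 0, 10) == (5, 5)    # n_sents=0: single sentence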
import util from .stateclass cimport StateClass from ._state cimport StateC @@ -600,6 +602,8 @@ cdef class Parser: **self.cfg.get('optimizer', {})) def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="parser or NER")) if 'model' in cfg: self.model = cfg['model'] if not hasattr(get_gold_tuples, '__call__'): @@ -620,15 +624,15 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: sgd = self.create_optimizer() - docs = [] - golds = [] - for raw_text, annots_brackets in get_gold_tuples(): + doc_sample = [] + gold_sample = [] + for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots - docs.append(Doc(self.vocab, words=words)) - golds.append(GoldParse(docs[-1], words=words, tags=tags, - heads=heads, deps=deps, entities=ents)) - self.model.begin_training(docs, golds) + doc_sample.append(Doc(self.vocab, words=words)) + gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags, + heads=heads, deps=deps, entities=ents)) + self.model.begin_training(doc_sample, gold_sample) if pipeline is not None: self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 63bbf2e0a..1f13da5d6 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -140,7 +140,7 @@ def it_tokenizer(): @pytest.fixture(scope="session") def ja_tokenizer(): - pytest.importorskip("fugashi") + pytest.importorskip("sudachipy") return get_lang_class("ja").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index a78e1815f..1ff64eff2 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -46,7 +46,7 @@ def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): assert tokens[0].text == text -@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"]) +@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll", "this'll", "those'll"]) def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index cfff0fcfe..58cd3f3bf 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -6,7 +6,7 @@ import pytest @pytest.mark.parametrize( "word,lemma", - [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")], + [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すごい"), ("いただきました", "いただく"), ("なった", "なる")], ) def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): test_lemma = ja_tokenizer(word)[0].lemma_ diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py new file mode 100644 index 000000000..018e645bb --- /dev/null +++ b/spacy/tests/lang/ja/test_serialize.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.ja import Japanese +from ...util import make_tempdir + + +def test_ja_tokenizer_serialize(ja_tokenizer): + tokenizer_bytes = ja_tokenizer.to_bytes() + nlp = Japanese() + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.split_mode == None + + 
with make_tempdir() as d: + file_path = d / "tokenizer" + ja_tokenizer.to_disk(file_path) + nlp = Japanese() + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.split_mode == None + + # split mode is (de)serialized correctly + nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) + nlp_r = Japanese() + nlp_bytes = nlp.to_bytes() + nlp_r.from_bytes(nlp_bytes) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.split_mode == "B" + + with make_tempdir() as d: + nlp.to_disk(d) + nlp_r = Japanese() + nlp_r.from_disk(d) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.split_mode == "B" diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ad8bfaa00..26be5cf59 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import pytest +from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS +from spacy.lang.ja import Japanese # fmt: off TOKENIZER_TESTS = [ @@ -14,20 +16,26 @@ TOKENIZER_TESTS = [ ] TAG_TESTS = [ - ("日本語だよ", ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '助詞,終助詞,*,*']), - ("東京タワーの近くに住んでいます。", ['名詞,固有名詞,地名,一般', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '動詞,非自立可能,*,*', '助動詞,*,*,*', '補助記号,句点,*,*']), - ("吾輩は猫である。", ['代名詞,*,*,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '動詞,非自立可能,*,*', '補助記号,句点,*,*']), - ("月に代わって、お仕置きよ!", ['名詞,普通名詞,助数詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '補助記号,読点,*,*', '接頭辞,*,*,*', '名詞,普通名詞,一般,*', '助詞,終助詞,*,*', '補助記号,句点,*,*']), - ("すもももももももものうち", ['名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*']) + ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']), + ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']), + ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']), + ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']), + ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能']) ] POS_TESTS = [ - ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']), + ('日本語だよ', ['fish', 'NOUN', 'AUX', 'PART']), ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']), ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']), ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']), ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN']) ] + +SENTENCE_TESTS = [ + ('あれ。これ。', ['あれ。', 'これ。']), + ('「伝染るんです。」という漫画があります。', + ['「伝染るんです。」という漫画があります。']), + ] # fmt: on @@ -43,14 +51,55 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags): assert tags == expected_tags +#XXX This isn't working? 
Always passes @pytest.mark.parametrize("text,expected_pos", POS_TESTS) def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] assert pos == expected_pos -def test_extra_spaces(ja_tokenizer): +@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy") +@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS) +def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents): + sents = [str(sent) for sent in ja_tokenizer(text).sents] + assert sents == expected_sents + + +def test_ja_tokenizer_extra_spaces(ja_tokenizer): # note: three spaces after "I" tokens = ja_tokenizer("I like cheese.") - assert tokens[1].orth_ == " " - assert tokens[2].orth_ == " " + assert tokens[1].orth_ == " " + + +@pytest.mark.parametrize("text", NAUGHTY_STRINGS) +def test_ja_tokenizer_naughty_strings(ja_tokenizer, text): + tokens = ja_tokenizer(text) + assert tokens.text_with_ws == text + + +@pytest.mark.parametrize("text,len_a,len_b,len_c", + [ + ("選挙管理委員会", 4, 3, 1), + ("客室乗務員", 3, 2, 1), + ("労働者協同組合", 4, 3, 1), + ("機能性食品", 3, 2, 1), + ] +) +def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): + nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}}) + nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) + nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) + + assert len(ja_tokenizer(text)) == len_a + assert len(nlp_a(text)) == len_a + assert len(nlp_b(text)) == len_b + assert len(nlp_c(text)) == len_c + + +def test_ja_tokenizer_emptyish_texts(ja_tokenizer): + doc = ja_tokenizer("") + assert len(doc) == 0 + doc = ja_tokenizer(" ") + assert len(doc) == 1 + doc = ja_tokenizer("\n\n\n \t\t \n\n\n") + assert len(doc) == 1 diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 7a6585e06..60aa584ef 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import srsly from mock import Mock from spacy.matcher import PhraseMatcher from spacy.tokens import Doc @@ -266,3 +267,26 @@ def test_phrase_matcher_basic_check(en_vocab): pattern = Doc(en_vocab, words=["hello", "world"]) with pytest.raises(ValueError): matcher.add("TEST", pattern) + + +def test_phrase_matcher_pickle(en_vocab): + matcher = PhraseMatcher(en_vocab) + mock = Mock() + matcher.add("TEST", [Doc(en_vocab, words=["test"])]) + matcher.add("TEST2", [Doc(en_vocab, words=["test2"])], on_match=mock) + doc = Doc(en_vocab, words=["these", "are", "tests", ":", "test", "test2"]) + assert len(matcher) == 2 + + b = srsly.pickle_dumps(matcher) + matcher_unpickled = srsly.pickle_loads(b) + + # call after pickling to avoid recursion error related to mock + matches = matcher(doc) + matches_unpickled = matcher_unpickled(doc) + + assert len(matcher) == len(matcher_unpickled) + assert matches == matches_unpickled + + # clunky way to vaguely check that callback is unpickled + (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1] + assert isinstance(callbacks.get("TEST2"), Mock) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 244e9fa25..dd623e07f 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals import pytest from spacy.lang.en import English +from spacy.language import Language +from spacy.lookups import Lookups from spacy.pipeline 
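# Illustrative sketch (not part of the patch): the three split modes trade token
# granularity, matching the counts the test above checks (assuming SudachiPy and its
# dictionary are installed); mode A is what the blank Japanese class uses by default.
from spacy.lang.ja import Japanese

nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
assert len(nlp_a("選挙管理委員会")) == 4   # four short-unit tokens
assert len(nlp_c("選挙管理委員会")) == 1   # the whole compound as one token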
import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown @@ -305,6 +307,21 @@ def test_change_number_features(): nlp("hello world") +def test_ner_warns_no_lookups(): + nlp = Language() + nlp.vocab.lookups = Lookups() + assert not len(nlp.vocab.lookups) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + with pytest.warns(UserWarning): + nlp.begin_training() + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" + with pytest.warns(None) as record: + nlp.begin_training() + assert not record.list + + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index a9a57746d..758ac9c14 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,3 +1,6 @@ +# coding: utf8 +from __future__ import unicode_literals + from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 337c82255..2b14ff589 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,16 +1,17 @@ # coding: utf8 import warnings from unittest import TestCase - import pytest import srsly from numpy import zeros from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors - from spacy.language import Language from spacy.pipeline import Pipe -from spacy.tests.util import make_tempdir +from spacy.compat import is_python2 + + +from ..util import make_tempdir def nlp(): @@ -96,12 +97,14 @@ def write_obj_and_catch_warnings(obj): return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list)) +@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) def test_to_disk_resource_warning(obj): warnings_list = write_obj_and_catch_warnings(obj) assert len(warnings_list) == 0 +@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") def test_writer_with_path_py35(): writer = None with make_tempdir() as d: @@ -132,11 +135,13 @@ def test_save_and_load_knowledge_base(): pytest.fail(str(e)) -class TestToDiskResourceWarningUnittest(TestCase): - def test_resource_warning(self): - scenarios = zip(*objects_to_test) +if not is_python2: - for scenario in scenarios: - with self.subTest(msg=scenario[1]): - warnings_list = write_obj_and_catch_warnings(scenario[0]) - self.assertEqual(len(warnings_list), 0) + class TestToDiskResourceWarningUnittest(TestCase): + def test_resource_warning(self): + scenarios = zip(*objects_to_test) + + for scenario in scenarios: + with self.subTest(msg=scenario[1]): + warnings_list = write_obj_and_catch_warnings(scenario[0]) + self.assertEqual(len(warnings_list), 0) diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py index 33281c858..3281e2a8c 100644 --- a/spacy/tests/regression/test_issue5458.py +++ b/spacy/tests/regression/test_issue5458.py @@ -1,3 +1,6 @@ +# coding: utf-8 +from __future__ import unicode_literals + from spacy.lang.en import English from spacy.lang.en.syntax_iterators import noun_chunks from spacy.tests.util import get_doc @@ -6,11 +9,13 @@ from spacy.vocab import Vocab def test_issue5458(): # Test that the noun chuncker does not generate overlapping spans + # fmt: off words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", 
"."] vocab = Vocab(strings=words) dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10] + # fmt: on en_doc = get_doc(vocab, words, pos_tags, heads, dependencies) en_doc.noun_chunks_iterator = noun_chunks diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 3be0a75b3..4727899a3 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -5,6 +5,7 @@ import pytest import pickle from spacy.vocab import Vocab from spacy.strings import StringStore +from spacy.compat import is_python2 from ..util import make_tempdir @@ -134,6 +135,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): assert list(sstore1_d) != list(sstore2_d) +@pytest.mark.skipif(is_python2, reason="Dict order? Not sure if worth investigating") @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_pickle_vocab(strings, lex_attr): vocab = Vocab(strings=strings) diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index bcda2999a..fce3772c4 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes(): assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" -def test_tagger_warns_no_lemma_lookups(): +def test_tagger_warns_no_lookups(): nlp = Language() nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) tagger = nlp.create_pipe("tagger") - with pytest.warns(UserWarning): - tagger.begin_training() nlp.add_pipe(tagger) with pytest.warns(UserWarning): nlp.begin_training() nlp.vocab.lookups.add_table("lemma_lookup") + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" with pytest.warns(None) as record: nlp.begin_training() assert not record.list diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 3ac621649..d48ba24a2 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -4,12 +4,14 @@ from __future__ import unicode_literals import pytest import os import ctypes +import srsly from pathlib import Path from spacy import util from spacy import prefer_gpu, require_gpu from spacy.compat import symlink_to, symlink_remove, path2str, is_windows from spacy._ml import PrecomputableAffine from subprocess import CalledProcessError +from .util import make_tempdir @pytest.fixture @@ -146,3 +148,33 @@ def test_load_model_blank_shortcut(): assert nlp.pipeline == [] with pytest.raises(ImportError): util.load_model("blank:fjsfijsdof") + + +def test_load_model_version_compat(): + """Test warnings for various spacy_version specifications in meta. 
Since + this is more of a hack for v2, manually specify the current major.minor + version to simplify test creation.""" + nlp = util.load_model("blank:en") + assert nlp.meta["spacy_version"].startswith(">=2.3") + with make_tempdir() as d: + # no change: compatible + nlp.to_disk(d) + meta_path = Path(d / "meta.json") + util.get_model_meta(d) + + # additional compatible upper pin + nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0" + srsly.write_json(meta_path, nlp.meta) + util.get_model_meta(d) + + # incompatible older version + nlp.meta["spacy_version"] = ">=2.2.5" + srsly.write_json(meta_path, nlp.meta) + with pytest.warns(UserWarning): + util.get_model_meta(d) + + # invalid version specification + nlp.meta["spacy_version"] = ">@#$%_invalid_version" + srsly.write_json(meta_path, nlp.meta) + with pytest.warns(UserWarning): + util.get_model_meta(d) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 58e9d73f3..65ba93d66 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is not None + assert en_tokenizer.url_match(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is None + assert en_tokenizer.url_match(url) is None @pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 1821f8abc..576ca93d2 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -10,6 +10,7 @@ from spacy.vectors import Vectors from spacy.tokenizer import Tokenizer from spacy.strings import hash_string from spacy.tokens import Doc +from spacy.compat import is_python2 from ..util import add_vecs_to_vocab, make_tempdir @@ -339,6 +340,7 @@ def test_vocab_prune_vectors(): assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) +@pytest.mark.skipif(is_python2, reason="Dict order? Not sure if worth investigating") def test_vectors_serialize(): data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") v = Vectors(data=data, keys=["A", "B", "C"]) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index dadbad7bd..694ea49cc 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,6 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match + cdef object _url_match cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 69d6285e1..154a42c4f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -30,7 +30,8 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, - suffix_search=None, infix_finditer=None, token_match=None): + suffix_search=None, infix_finditer=None, token_match=None, + url_match=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -43,6 +44,8 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. 
+ url_match (callable): A boolean function matching strings to be + recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. EXAMPLE: @@ -55,6 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match + self.url_match = url_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -70,6 +74,14 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() + property url_match: + def __get__(self): + return self._url_match + + def __set__(self, url_match): + self._url_match = url_match + self._flush_cache() + property prefix_search: def __get__(self): return self._prefix_search @@ -108,11 +120,12 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, - self._rules, + self.rules, self.prefix_search, self.suffix_search, self.infix_finditer, - self.token_match) + self.token_match, + self.url_match) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -240,6 +253,8 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: + if self.token_match and self.token_match(string): + break if self._specials.get(hash_string(string)) != NULL: has_special[0] = 1 break @@ -295,7 +310,9 @@ cdef class Tokenizer: cache_hit = self._try_cache(hash_string(string), tokens) if cache_hit: pass - elif self.token_match and self.token_match(string): + elif (self.token_match and self.token_match(string)) or \ + (self.url_match and \ + self.url_match(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. # See Issue #859 @@ -448,6 +465,11 @@ cdef class Tokenizer: suffix_search = self.suffix_search infix_finditer = self.infix_finditer token_match = self.token_match + if token_match is None: + token_match = re.compile("a^").match + url_match = self.url_match + if url_match is None: + url_match = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -456,6 +478,10 @@ cdef class Tokenizer: suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(("TOKEN_MATCH", substring)) + substring = '' + break if substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -476,12 +502,15 @@ cdef class Tokenizer: break suffixes.append(("SUFFIX", substring[split:])) substring = substring[:split] - if substring in special_cases: - tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif url_match(substring): + tokens.append(("URL_MATCH", substring)) + substring = '' + elif substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 @@ -543,6 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: 
_get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), + ("url_match", lambda: _get_regex_pattern(self.url_match)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -564,11 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), + ("url_match", lambda b: data.setdefault("url_match", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -579,6 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match + if "url_match" in data and isinstance(data["url_match"], basestring_): + self.url_match = re.compile(data["url_match"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index e09870741..12f2f6cc3 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -46,12 +46,6 @@ cdef class MorphAnalysis: """The number of features in the analysis.""" return self.c.length - def __str__(self): - return self.to_json() - - def __repr__(self): - return self.to_json() - def __hash__(self): return self.key diff --git a/spacy/util.py b/spacy/util.py index 5fd296404..5362952e2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -17,6 +17,7 @@ import srsly import catalogue import sys import warnings +from . 
import about try: import jsonschema @@ -250,6 +251,31 @@ def get_model_meta(path): for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: raise ValueError(Errors.E054.format(setting=setting)) + if "spacy_version" in meta: + about_major_minor = ".".join(about.__version__.split(".")[:2]) + if not meta["spacy_version"].startswith(">=" + about_major_minor): + # try to simplify version requirements from model meta to vx.x + # for warning message + meta_spacy_version = "v" + ".".join( + meta["spacy_version"].replace(">=", "").split(".")[:2] + ) + # if the format is unexpected, supply the full version + if not re.match(r"v\d+\.\d+", meta_spacy_version): + meta_spacy_version = meta["spacy_version"] + warn_msg = Warnings.W031.format( + model=meta["lang"] + "_" + meta["name"], + model_version=meta["version"], + version=meta_spacy_version, + current=about.__version__, + ) + warnings.warn(warn_msg) + else: + warn_msg = Warnings.W032.format( + model=meta["lang"] + "_" + meta["name"], + model_version=meta["version"], + current=about.__version__, + ) + warnings.warn(warn_msg) return meta diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 51ddc3f9a..aec086e6c 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -425,9 +425,9 @@ cdef class Vectors: self.data = xp.load(str(path)) serializers = OrderedDict(( - ("key2row", load_key2row), - ("keys", load_keys), ("vectors", load_vectors), + ("keys", load_keys), + ("key2row", load_key2row), )) util.from_disk(path, serializers, []) self._sync_unset() diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 68f0ac0db..1b1b04e13 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -46,7 +46,8 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. lookups_extra (Lookups): Container for optional lookup tables and dictionaries. - name (unicode): Optional name to identify the vectors table. + oov_prob (float): Default OOV probability. + vectors_name (unicode): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 505977be9..6f4b8bb73 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -455,7 +455,7 @@ improvement. ```bash $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] -[--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] +[--width] [--conv-depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] [--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save-every] [--init-tok2vec] [--epoch-start] @@ -467,7 +467,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] | `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | | `output_dir` | positional | Directory to write models to on each epoch. | | `--width`, `-cw` | option | Width of CNN layers. | -| `--depth`, `-cd` | option | Depth of CNN layers. | +| `--conv-depth`, `-cd` | option | Depth of CNN layers. | | `--cnn-window`, `-cW` 2.2.2 | option | Window size for CNN layers. | | `--cnn-pieces`, `-cP` 2.2.2 | option | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish). | | `--use-chars`, `-chr` 2.2.2 | flag | Whether to use character-based embedding. 
| @@ -541,16 +541,16 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ----------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | +| Argument | Type | Description | +| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 7462af739..6f8badfe8 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -35,14 +35,15 @@ the > ``` | Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. 
| -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 70411ec0b..29de08266 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -288,7 +288,7 @@ common spelling. This has no effect on any other token attributes, or tokenization in general, but it ensures that **equivalent tokens receive similar representations**. This can improve the model's predictions on words that weren't common in the training data, but are equivalent to other words – for -example, "realize" and "realize", or "thx" and "thanks". +example, "realise" and "realize", or "thx" and "thanks". 
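The effect of these norm exceptions is easiest to see on `token.norm_`. The snippet below is a minimal sketch and is not part of the diff above; it assumes a blank English pipeline, and whether `"thx"` actually normalizes to `"thanks"` depends on which norm tables are available in your installation (for example via `spacy-lookups-data`).

```python
# Minimal sketch: inspecting token norms. Assumes a blank English pipeline;
# the contents of the norm table depend on the installed language data.
import spacy

nlp = spacy.blank("en")
doc = nlp("thx, I finally realise it")

for token in doc:
    # norm_ falls back to the lowercase form when no exception applies
    print(token.text, "->", token.norm_)
```

With a full English model (or `spacy-lookups-data` installed), `thx` should surface as `thanks` in the second column; with a bare blank pipeline it may simply fall back to the lowercase form.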
Similarly, spaCy also includes [global base norms](https://github.com/explosion/spaCy/tree/master/spacy/lang/norm_exceptions.py) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index d17e5a661..bcc943436 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -738,6 +738,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(substring) + substring = '' + break if substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -752,12 +756,15 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, split = suffix_search(substring).start() suffixes.append(substring[split:]) substring = substring[:split] - if substring in special_cases: - tokens.extend(special_cases[substring]) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(substring) substring = '' + elif url_match(substring): + tokens.append(substring) + substring = '' + elif substring in special_cases: + tokens.extend(special_cases[substring]) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 @@ -778,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Check whether we have an explicitly defined rule for this substring. If we - do, use it. -3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that special cases always get priority. -4. If we didn't consume a prefix, try to consume a suffix and then go back to +2. Look for a token match. If there is a match, stop processing and keep this + token. +3. Check whether we have an explicitly defined special case for this substring. + If we do, use it. +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to + #2, so that the token match and special cases always get priority. +5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -5. If we can't consume a prefix or a suffix, look for a special case. -6. Next, look for a token match. -7. Look for "infixes" — stuff like hyphens etc. and split the substring into +6. If we can't consume a prefix or a suffix, look for a URL match. +7. If there's no URL match, then look for a special case. +8. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. -8. Once we can't consume any more of the string, handle it as a single token. +9. Once we can't consume any more of the string, handle it as a single token. #### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} @@ -832,8 +841,8 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. - Note that prefixes and suffixes will be split off before `token_match` is - applied. +6. An optional boolean function `url_match`, which is similar to `token_match` + except prefixes and suffixes are removed before applying the match. You shouldn't usually need to create a `Tokenizer` subclass. 
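To make the new ordering concrete, here is a minimal sketch of a tokenizer that sets both `token_match` and `url_match`. It is not taken from the diff: the regexes are illustrative placeholders rather than spaCy's shipped patterns, and the pipeline is a blank English model. As described above, `token_match` is consulted before prefixes and suffixes are split off, while `url_match` only applies to what remains afterwards.

```python
import re

import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")

# Illustrative placeholder patterns -- not spaCy's shipped TOKEN_MATCH/URL_MATCH.
token_match = re.compile(r"^:[a-z_]+:$").match   # e.g. emoji shortcodes, never split
url_match = re.compile(r"^https?://\S+$").match  # checked only after affix handling

nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.Defaults.tokenizer_exceptions,
    prefix_search=nlp.tokenizer.prefix_search,
    suffix_search=nlp.tokenizer.suffix_search,
    infix_finditer=nlp.tokenizer.infix_finditer,
    token_match=token_match,
    url_match=url_match,
)

doc = nlp("See (https://example.com) or :thumbs_up:")
print([t.text for t in doc])
# The parentheses are split off as prefix/suffix before url_match keeps the URL
# whole; the shortcode is caught by token_match before any affix handling.
```

Because `url_match` only runs after affix handling, surrounding punctuation such as the parentheses in the example is still split off before the URL is preserved as a single token.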
Standard usage is to use `re.compile()` to build a regular expression object, and pass its diff --git a/website/meta/universe.json b/website/meta/universe.json index aae6855be..2c74a2964 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2235,7 +2235,7 @@ "", "nlp = spacy.load('en_core_web_sm')", "nlp.add_pipe(LanguageDetector())", - "doc = nlp('Life is like a box of chocolates. You never know what you're gonna get.')", + "doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')", "", "assert doc._.language == 'en'", "assert doc._.language_score >= 0.8" diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 845fec65d..3c5e9d2a4 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -1,4 +1,4 @@ -import React, { useEffect, useState, useMemo } from 'react' +import React, { useEffect, useState, useMemo, Fragment } from 'react' import { StaticQuery, graphql } from 'gatsby' import { window } from 'browser-monads' @@ -83,15 +83,24 @@ function formatVectors(data) { function formatAccuracy(data) { if (!data) return null - const labels = { tags_acc: 'POS', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } + const labels = { + las: 'LAS', + uas: 'UAS', + tags_acc: 'TAG', + ents_f: 'NER F', + ents_p: 'NER P', + ents_r: 'NER R', + } const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) const isNer = key => key.startsWith('ents_') - return Object.keys(data).map(key => ({ - label: labels[key] || key.toUpperCase(), - value: data[key].toFixed(2), - help: MODEL_META[key], - type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, - })) + return Object.keys(data) + .filter(key => labels[key]) + .map(key => ({ + label: labels[key], + value: data[key].toFixed(2), + help: MODEL_META[key], + type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, + })) } function formatModelMeta(data) { @@ -115,11 +124,11 @@ function formatModelMeta(data) { function formatSources(data = []) { const sources = data.map(s => (isString(s) ? { name: s } : s)) return sources.map(({ name, url, author }, i) => ( - <> + {i > 0 &&
<br />} {name && url ? <Link to={url}>{name}</Link> : name} {author && ` (${author})`} - </> + </Fragment>
)) } @@ -308,12 +317,12 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl {labelNames.map((label, i) => ( - <> + <Fragment key={i}> {i > 0 && ', '} {label} - </> + </Fragment> ))}
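The version-compatibility heuristic added to `get_model_meta` in the `util.py` hunk above can be summarized outside of spaCy as follows. This is a minimal sketch under stated assumptions: the function name, warning text, and version strings are illustrative, while the real implementation uses `about.__version__` and spaCy's `Warnings.W031`/`W032` messages.

```python
import re
import warnings

CURRENT_VERSION = "2.3.0"  # stand-in for spacy.about.__version__


def check_spacy_version(meta):
    """Warn if a model's spacy_version pin does not target the running major.minor."""
    major_minor = ".".join(CURRENT_VERSION.split(".")[:2])
    spec = meta.get("spacy_version")
    if spec is None:
        warnings.warn("Model meta has no spacy_version; compatibility is unknown.")
        return
    if not spec.startswith(">=" + major_minor):
        # Shorten the requirement to "vX.Y" for the warning text if possible
        short = "v" + ".".join(spec.replace(">=", "").split(".")[:2])
        if not re.match(r"v\d+\.\d+", short):
            short = spec
        warnings.warn(
            "Model was built for spaCy %s, but you are running %s."
            % (short, CURRENT_VERSION)
        )


check_spacy_version({"spacy_version": ">=2.3.0,<2.4.0"})  # compatible: no warning
check_spacy_version({"spacy_version": ">=2.2.5"})         # warns: built for v2.2
```

As in the patch, an upper pin on the same `major.minor` (e.g. `>=2.3.0,<2.4.0`) passes silently, an older pin triggers a warning, and an unparseable specification falls back to quoting the pin verbatim in the warning text.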