diff --git a/.appveyor.yml b/.appveyor.yml index a379cdd31..dd1824ead 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -4,15 +4,11 @@ environment: # For Python versions available on Appveyor, see # http://www.appveyor.com/docs/installed-software#python - # The list here is complete (excluding Python 2.6, which - # isn't covered by this document) at the time of writing. - PYTHON: "C:\\Python27" - #- PYTHON: "C:\\Python33" #- PYTHON: "C:\\Python34" #- PYTHON: "C:\\Python35" #- PYTHON: "C:\\Python27-x64" - #- PYTHON: "C:\\Python33-x64" #- DISTUTILS_USE_SDK: "1" #- PYTHON: "C:\\Python34-x64" #- DISTUTILS_USE_SDK: "1" @@ -30,7 +26,7 @@ build: off test_script: # Put your test command here. - # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4, + # If you don't need to build C extensions on 64-bit Python 3.4, # you can remove "build.cmd" from the front of the command, as it's # only needed to support those cases. # Note that you must use the environment variable %PYTHON% to refer to @@ -41,7 +37,7 @@ test_script: after_test: # This step builds your wheels. # Again, you only need build.cmd if you're building C extensions for - # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct + # 64-bit Python 3.4. And you need to use %PYTHON% to get the correct # interpreter - "%PYTHON%\\python.exe setup.py bdist_wheel" diff --git a/.github/contributors/MartinoMensio.md b/.github/contributors/MartinoMensio.md new file mode 100644 index 000000000..1cd32d622 --- /dev/null +++ b/.github/contributors/MartinoMensio.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. 
The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Martino Mensio | +| Company name (if applicable) | Polytechnic University of Turin | +| Title or role (if applicable) | Student | +| Date | 17 November 2017 | +| GitHub username | MartinoMensio | +| Website (optional) | https://martinomensio.github.io/ | diff --git a/.github/contributors/bdewilde.md b/.github/contributors/bdewilde.md new file mode 100644 index 000000000..f65a625cb --- /dev/null +++ b/.github/contributors/bdewilde.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ---------------------------- | +| Name | Burton DeWilde | +| Company name (if applicable) | - | +| Title or role (if applicable) | data scientist | +| Date | 20 November 2017 | +| GitHub username | bdewilde | +| Website (optional) | https://bdewilde.github.io/ | diff --git a/.github/contributors/cclauss.md b/.github/contributors/cclauss.md new file mode 100644 index 000000000..62ebbab09 --- /dev/null +++ b/.github/contributors/cclauss.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Chris Clauss | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 20 November 2017 | +| GitHub username | cclauss | +| Website (optional) | | diff --git a/.github/contributors/fsonntag.md b/.github/contributors/fsonntag.md new file mode 100644 index 000000000..0d84015bb --- /dev/null +++ b/.github/contributors/fsonntag.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Felix Sonntag | +| Company name (if applicable) | - | +| Title or role (if applicable) | Student | +| Date | 2017-11-19 | +| GitHub username | fsonntag | +| Website (optional) | http://github.com/fsonntag/ | diff --git a/.github/contributors/greenriverrus.md b/.github/contributors/greenriverrus.md new file mode 100644 index 000000000..a7d6ba260 --- /dev/null +++ b/.github/contributors/greenriverrus.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | --------------------| +| Name | Vadim Mazaev | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 26 November 2017 | +| GitHub username | GreenRiverRUS | +| Website (optional) | | diff --git a/.github/contributors/hugovk.md b/.github/contributors/hugovk.md new file mode 100644 index 000000000..1b96f688c --- /dev/null +++ b/.github/contributors/hugovk.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Hugo van Kemenade | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 26 November 2017 | +| GitHub username | hugovk | +| Website (optional) | | diff --git a/.github/contributors/markulrich.md b/.github/contributors/markulrich.md new file mode 100644 index 000000000..b31bba033 --- /dev/null +++ b/.github/contributors/markulrich.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Mark Ulrich | +| Company name (if applicable) | | +| Title or role (if applicable) | Machine Learning Engineer | +| Date | 22 November 2017 | +| GitHub username | markulrich | +| Website (optional) | | diff --git a/.github/contributors/sorenlind.md b/.github/contributors/sorenlind.md new file mode 100644 index 000000000..73e42636f --- /dev/null +++ b/.github/contributors/sorenlind.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Søren Lind Kristiansen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 24 November 2017 | +| GitHub username | sorenlind | +| Website (optional) | | diff --git a/.github/contributors/tokestermw.md b/.github/contributors/tokestermw.md new file mode 100644 index 000000000..2d69d7736 --- /dev/null +++ b/.github/contributors/tokestermw.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Motoki Wu | +| Company name (if applicable) | WriteLab | +| Title or role (if applicable) | NLP / Deep Learning Engineer | +| Date | 17 November 2017 | +| GitHub username | tokestermw | +| Website (optional) | https://twitter.com/plusepsilon | diff --git a/.travis.yml b/.travis.yml index 0bda7276d..8dec277b9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,14 +15,17 @@ os: env: - VIA=compile LC_ALL=en_US.ascii - VIA=compile + - VIA=flake8 #- VIA=pypi_nightly install: - "./travis.sh" + - pip install flake8 script: - "pip install pytest pytest-timeout" - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi + - if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0ec363f3a..2bcfd343d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -57,7 +57,7 @@ even format them as Markdown to copy-paste into GitHub issues: * **Checking the model compatibility:** If you're having problems with a [statistical model](https://spacy.io/models), it may be because to the model is incompatible with your spaCy installation. In spaCy v2.0+, you can check -this on the command line by running `spacy validate`. +this on the command line by running `python -m spacy validate`. 
* **Sharing a model's output, like dependencies and entities:** spaCy v2.0+ comes with [built-in visualizers](https://spacy.io/usage/visualizers) that diff --git a/README.rst b/README.rst index 088cdbd47..a47084254 100644 --- a/README.rst +++ b/README.rst @@ -16,6 +16,10 @@ integration. It's commercial open-source software, released under the MIT licens :target: https://travis-ci.org/explosion/spaCy :alt: Build Status +.. image:: https://img.shields.io/appveyor/ci/explosion/spaCy/master.svg?style=flat-square + :target: https://ci.appveyor.com/project/explosion/spaCy + :alt: Appveyor Build Status + .. image:: https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square :target: https://github.com/explosion/spaCy/releases :alt: Current Release Version @@ -108,7 +112,7 @@ the `documentation `_. ==================== === **Operating system** macOS / OS X, Linux, Windows (Cygwin, MinGW, Visual Studio) -**Python version** CPython 2.6, 2.7, 3.3+. Only 64 bit. +**Python version** CPython 2.7, 3.4+. Only 64 bit. 
**Package managers** `pip`_ (source packages only), `conda`_ (via ``conda-forge``) ==================== === diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py index ef920ab00..3c3b8132f 100644 --- a/examples/information_extraction/entity_relations.py +++ b/examples/information_extraction/entity_relations.py @@ -36,7 +36,8 @@ def main(model='en_core_web_sm'): def extract_currency_relations(doc): # merge entities and noun chunks into one token - for span in [*list(doc.ents), *list(doc.noun_chunks)]: + spans = list(doc.ents) + list(doc.noun_chunks) + for span in spans: span.merge() relations = [] diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index cb563ed42..a0edde45c 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -73,7 +73,7 @@ TRAIN_DATA = [ new_model_name=("New model name for model meta.", "option", "nm", str), output_dir=("Optional output directory", "option", "o", Path), n_iter=("Number of training iterations", "option", "n", int)) -def main(model=None, new_model_name='animal', output_dir=None, n_iter=50): +def main(model=None, new_model_name='animal', output_dir=None, n_iter=20): """Set up the pipeline and entity recognizer, and train the new entity.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index e893cb4e4..6eb7213cf 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -30,8 +30,11 @@ TAG_MAP = { 'J': {'pos': 'ADJ'} } -# Usually you'll read this in, of course. Data formats vary. -# Ensure your strings are unicode. +# Usually you'll read this in, of course. Data formats vary. Ensure your +# strings are unicode and that the number of tags assigned matches spaCy's +# tokenization. 
If not, you can always add a 'words' key to the annotations +# that specifies the gold-standard tokenization, e.g.: +# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']}) TRAIN_DATA = [ ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}), ("Eat blue ham", {'tags': ['V', 'J', 'N']}) diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py index 4e5640f0d..a443b5310 100644 --- a/examples/vectors_fast_text.py +++ b/examples/vectors_fast_text.py @@ -13,7 +13,7 @@ from spacy.language import Language @plac.annotations( - vectors_loc=("Path to vectors", "positional", None, str), + vectors_loc=("Path to .vec file", "positional", None, str), lang=("Optional language ID. If not set, blank Language() will be used.", "positional", None, str)) def main(vectors_loc, lang=None): @@ -30,7 +30,7 @@ def main(vectors_loc, lang=None): nlp.vocab.reset_vectors(width=int(nr_dim)) for line in file_: line = line.rstrip().decode('utf8') - pieces = line.rsplit(' ', nr_dim) + pieces = line.rsplit(' ', int(nr_dim)) word = pieces[0] vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') nlp.vocab.set_vector(word, vector) # add the vectors to the vocab diff --git a/setup.py b/setup.py index 11acae95c..a588f2c2a 100755 --- a/setup.py +++ b/setup.py @@ -211,9 +211,9 @@ def setup_package(): 'Operating System :: MacOS :: MacOS X', 'Operating System :: Microsoft :: Windows', 'Programming Language :: Cython', - 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 8113ffebe..893ec0845 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -131,7 +131,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): 
'NumValue', 'PartType', 'Polite', 'StyleVariant', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', - 'Polarity', # U20 + 'Polarity', 'Animacy' # U20 ] for key in morph_keys: if key in stringy_attrs: diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8c590b49b..3dc7fa3e0 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -146,7 +146,7 @@ def list_files(data_dir): def list_requirements(meta): parent_package = meta.get('parent_package', 'spacy') - requirements = [parent_package + meta['spacy_version']] + requirements = [parent_package + ">=" + meta['spacy_version']] if 'setup_requires' in meta: requirements += meta['setup_requires'] return requirements diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 5c0ed521b..dc2374dab 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -36,13 +36,13 @@ def profile(cmd, lang, inputs=None): if inputs is None: imdb_train, _ = thinc.extra.datasets.imdb() inputs, _ = zip(*imdb_train) - inputs = inputs[:2000] + inputs = inputs[:25000] nlp = spacy.load(lang) texts = list(cytoolz.take(10000, inputs)) cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") s = pstats.Stats("Profile.prof") - s.strip_dirs().sort_stats("cumtime").print_stats() + s.strip_dirs().sort_stats("time").print_stats() def parse_texts(nlp, texts): diff --git a/spacy/compat.py b/spacy/compat.py index 7cd06e545..e50036013 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -53,9 +53,9 @@ is_osx = sys.platform == 'darwin' if is_python2: import imp bytes_ = str - unicode_ = unicode - basestring_ = basestring - input_ = raw_input + unicode_ = unicode # noqa: F821 + basestring_ = basestring # noqa: F821 + input_ = raw_input # noqa: F821 json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8') path2str = lambda path: str(path).decode('utf8') diff --git a/spacy/displacy/__init__.py 
b/spacy/displacy/__init__.py index e160c31b6..cfecd4b11 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -97,7 +97,7 @@ def parse_deps(orig_doc, options={}): word.lemma_, word.ent_type_)) for span_props in spans: doc.merge(*span_props) - words = [{'text': w.text, 'tag': w.tag_} for w in doc] + words = [{'text': w.text, 'tag': w.pos_} for w in doc] arcs = [] for word in doc: if word.i < word.head.i: diff --git a/spacy/gold.pyx b/spacy/gold.pyx index d6db9b853..dff5fc147 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -541,5 +541,24 @@ def biluo_tags_from_offsets(doc, entities, missing='O'): return biluo +def offsets_from_biluo_tags(doc, tags): + """Encode per-token tags following the BILUO scheme into entity offsets. + + doc (Doc): The document that the BILUO tags refer to. + tags (iterable): A sequence of BILUO tags with each tag describing one + token. Each tag string will be of the form of either "", "O" or + "{action}-{label}", where action is one of "B", "I", "L", "U". + RETURNS (list): A sequence of `(start, end, label)` triples. `start` and + `end` will be character-offset integers denoting the slice into the + original string. 
+ """ + token_offsets = tags_to_entities(tags) + offsets = [] + for label, start_idx, end_idx in token_offsets: + span = doc[start_idx : end_idx + 1] + offsets.append((span.start_char, span.end_char, label)) + return offsets + + def is_punct_label(label): return label == 'P' or label.lower() == 'punct' diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 7ec631c92..68d8eecc7 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -15,9 +15,11 @@ _hebrew = r'[\p{L}&&\p{Hebrew}]' _latin_lower = r'[\p{Ll}&&\p{Latin}]' _latin_upper = r'[\p{Lu}&&\p{Latin}]' _latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]' +_russian_lower = r'[ёа-я]' +_russian_upper = r'[ЁА-Я]' -_upper = [_latin_upper] -_lower = [_latin_lower] +_upper = [_latin_upper, _russian_upper] +_lower = [_latin_lower, _russian_lower] _uncased = [_bengali, _hebrew] ALPHA = merge_char_classes(_upper + _lower + _uncased) @@ -27,8 +29,9 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased) _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft ' 'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb ' - 'TB T G M K %') -_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' + 'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм ' + 'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб') +_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽' # These expressions contain various unicode variations, including characters # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 45e5b89dd..b4e248cf6 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .norm_exceptions import NORM_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .morph_rules import MORPH_RULES @@ -18,7 +19,8 @@ class 
DanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'da' - lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], + BASE_NORMS, NORM_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) # morph_rules = MORPH_RULES tag_map = TAG_MAP diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py index 549f71fb5..c98fc251e 100644 --- a/spacy/lang/da/examples.py +++ b/spacy/lang/da/examples.py @@ -11,7 +11,7 @@ Example sentences to test spaCy and its language models. sentences = [ - "Apple overvejer at købe et britisk statup for 1 milliard dollar", + "Apple overvejer at købe et britisk startup for 1 milliard dollar", "Selvkørende biler flytter forsikringsansvaret over på producenterne", "San Francisco overvejer at forbyde leverandørrobotter på fortov", "London er en stor by i Storbritannien" diff --git a/spacy/lang/da/norm_exceptions.py b/spacy/lang/da/norm_exceptions.py new file mode 100644 index 000000000..71e4b741c --- /dev/null +++ b/spacy/lang/da/norm_exceptions.py @@ -0,0 +1,527 @@ +# coding: utf8 +""" +Special-case rules for normalizing tokens to improve the model's predictions. +For example 'mysterium' vs 'mysterie' and similar. 
+""" +from __future__ import unicode_literals + + +# Sources: +# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/ +# 2: http://www.tjerry-korrektur.dk/ord-med-flere-stavemaader/ + +_exc = { + # Alternative spelling + "a-kraft-værk": "a-kraftværk", # 1 + "ålborg": "aalborg", # 2 + "århus": "aarhus", + "accessoirer": "accessoires", # 1 + "affektert": "affekteret", # 1 + "afrikander": "afrikaaner", # 1 + "aftabuere": "aftabuisere", # 1 + "aftabuering": "aftabuisering", # 1 + "akvarium": "akvarie", # 1 + "alenefader": "alenefar", # 1 + "alenemoder": "alenemor", # 1 + "alkoholambulatorium": "alkoholambulatorie", # 1 + "ambulatorium": "ambulatorie", # 1 + "ananassene": "ananasserne", # 2 + "anførelsestegn": "anførselstegn", # 1 + "anseelig": "anselig", # 2 + "antioxydant": "antioxidant", # 1 + "artrig": "artsrig", # 1 + "auditorium": "auditorie", # 1 + "avocado": "avokado", # 2 + "bagerst": "bagest", # 2 + "bagstræv": "bagstræb", # 1 + "bagstræver": "bagstræber", # 1 + "bagstræverisk": "bagstræberisk", # 1 + "balde": "balle", # 2 + "barselorlov": "barselsorlov", # 1 + "barselvikar": "barselsvikar", # 1 + "baskien": "baskerlandet", # 1 + "bayrisk": "bayersk", # 1 + "bedstefader": "bedstefar", # 1 + "bedstemoder": "bedstemor", # 1 + "behefte": "behæfte", # 1 + "beheftelse": "behæftelse", # 1 + "bidragydende": "bidragsydende", # 1 + "bidragyder": "bidragsyder", # 1 + "billiondel": "billiontedel", # 1 + "blaseret": "blasert", # 1 + "bleskifte": "bleskift", # 1 + "blodbroder": "blodsbroder", # 2 + "blyantspidser": "blyantsspidser", # 2 + "boligministerium": "boligministerie", # 1 + "borhul": "borehul", # 1 + "broder": "bror", # 2 + "buldog": "bulldog", # 2 + "bådhus": "bådehus", # 1 + "børnepleje": "barnepleje", # 1 + "børneseng": "barneseng", # 1 + "børnestol": "barnestol", # 1 + "cairo": "kairo", # 1 + "cambodia": "cambodja", # 1 + "cambodianer": "cambodjaner", # 1 + "cambodiansk": "cambodjansk", # 1 + 
"camouflage": "kamuflage", # 2 + "campylobacter": "kampylobakter", # 1 + "centeret": "centret", # 2 + "chefskahyt": "chefkahyt", # 1 + "chefspost": "chefpost", # 1 + "chefssekretær": "chefsekretær", # 1 + "chefsstol": "chefstol", # 1 + "cirkulærskrivelse": "cirkulæreskrivelse", # 1 + "cognacsglas": "cognacglas", # 1 + "columnist": "kolumnist", # 1 + "cricket": "kricket", # 2 + "dagplejemoder": "dagplejemor", # 1 + "damaskesdug": "damaskdug", # 1 + "damp-barn": "dampbarn", # 1 + "delfinarium": "delfinarie", # 1 + "dentallaboratorium": "dentallaboratorie", # 1 + "diaramme": "diasramme", # 1 + "diaré": "diarré", # 1 + "dioxyd": "dioxid", # 1 + "dommedagsprædiken": "dommedagspræken", # 1 + "donut": "doughnut", # 2 + "driftmæssig": "driftsmæssig", # 1 + "driftsikker": "driftssikker", # 1 + "driftsikring": "driftssikring", # 1 + "drikkejogurt": "drikkeyoghurt", # 1 + "drivein": "drive-in", # 1 + "driveinbiograf": "drive-in-biograf", # 1 + "drøvel": "drøbel", # 1 + "dødskriterium": "dødskriterie", # 1 + "e-mail-adresse": "e-mailadresse", # 1 + "e-post-adresse": "e-postadresse", # 1 + "egypten": "ægypten", # 2 + "ekskommunicere": "ekskommunikere", # 1 + "eksperimentarium": "eksperimentarie", # 1 + "elsass": "Alsace", # 1 + "elsasser": "alsacer", # 1 + "elsassisk": "alsacisk", # 1 + "elvetal": "ellevetal", # 1 + "elvetiden": "ellevetiden", # 1 + "elveårig": "elleveårig", # 1 + "elveårs": "elleveårs", # 1 + "elveårsbarn": "elleveårsbarn", # 1 + "elvte": "ellevte", # 1 + "elvtedel": "ellevtedel", # 1 + "energiministerium": "energiministerie", # 1 + "erhvervsministerium": "erhvervsministerie", # 1 + "espaliere": "spaliere", # 2 + "evangelium": "evangelie", # 1 + "fagministerium": "fagministerie", # 1 + "fakse": "faxe", # 1 + "fangstkvota": "fangstkvote", # 1 + "fader": "far", # 2 + "farbroder": "farbror", # 1 + "farfader": "farfar", # 1 + "farmoder": "farmor", # 1 + "federal": "føderal", # 1 + "federalisering": "føderalisering", # 1 + "federalisme": "føderalisme", # 1 + 
"federalist": "føderalist", # 1 + "federalistisk": "føderalistisk", # 1 + "federation": "føderation", # 1 + "federativ": "føderativ", # 1 + "fejlbeheftet": "fejlbehæftet", # 1 + "femetagers": "femetages", # 2 + "femhundredekroneseddel": "femhundredkroneseddel", # 2 + "filmpremiere": "filmpræmiere", # 2 + "finansimperium": "finansimperie", # 1 + "finansministerium": "finansministerie", # 1 + "firehjulstræk": "firhjulstræk", # 2 + "fjernstudium": "fjernstudie", # 1 + "formalier": "formalia", # 1 + "formandsskift": "formandsskifte", # 1 + "fornemst": "fornemmest", # 2 + "fornuftparti": "fornuftsparti", # 1 + "fornuftstridig": "fornuftsstridig", # 1 + "fornuftvæsen": "fornuftsvæsen", # 1 + "fornuftægteskab": "fornuftsægteskab", # 1 + "forretningsministerium": "forretningsministerie", # 1 + "forskningsministerium": "forskningsministerie", # 1 + "forstudium": "forstudie", # 1 + "forsvarsministerium": "forsvarsministerie", # 1 + "frilægge": "fritlægge", # 1 + "frilæggelse": "fritlæggelse", # 1 + "frilægning": "fritlægning", # 1 + "fristille": "fritstille", # 1 + "fristilling": "fritstilling", # 1 + "fuldttegnet": "fuldtegnet", # 1 + "fødestedskriterium": "fødestedskriterie", # 1 + "fødevareministerium": "fødevareministerie", # 1 + "følesløs": "følelsesløs", # 1 + "følgeligt": "følgelig", # 1 + "førne": "førn", # 1 + "gearskift": "gearskifte", # 2 + "gladeligt": "gladelig", # 1 + "glosehefte": "glosehæfte", # 1 + "glædeløs": "glædesløs", # 1 + "gonoré": "gonorré", # 1 + "grangiveligt": "grangivelig", # 1 + "grundliggende": "grundlæggende", # 2 + "grønsag": "grøntsag", # 2 + "gudbenådet": "gudsbenådet", # 1 + "gudfader": "gudfar", # 1 + "gudmoder": "gudmor", # 1 + "gulvmop": "gulvmoppe", # 1 + "gymnasium": "gymnasie", # 1 + "hackning": "hacking", # 1 + "halvbroder": "halvbror", # 1 + "halvelvetiden": "halvellevetiden", # 1 + "handelsgymnasium": "handelsgymnasie", # 1 + "hefte": "hæfte", # 1 + "hefteklamme": "hæfteklamme", # 1 + "heftelse": "hæftelse", # 1 + "heftemaskine": 
"hæftemaskine", # 1 + "heftepistol": "hæftepistol", # 1 + "hefteplaster": "hæfteplaster", # 1 + "heftestraf": "hæftestraf", # 1 + "heftning": "hæftning", # 1 + "helbroder": "helbror", # 1 + "hjemmeklasse": "hjemklasse", # 1 + "hjulspin": "hjulspind", # 1 + "huggevåben": "hugvåben", # 1 + "hulmurisolering": "hulmursisolering", # 1 + "hurtiggående": "hurtigtgående", # 2 + "hurtigttørrende": "hurtigtørrende", # 2 + "husmoder": "husmor", # 1 + "hydroxyd": "hydroxid", # 1 + "håndmikser": "håndmixer", # 1 + "højtaler": "højttaler", # 2 + "hønemoder": "hønemor", # 1 + "ide": "idé", # 2 + "imperium": "imperie", # 1 + "imponerthed": "imponerethed", # 1 + "inbox": "indboks", # 2 + "indenrigsministerium": "indenrigsministerie", # 1 + "indhefte": "indhæfte", # 1 + "indheftning": "indhæftning", # 1 + "indicium": "indicie", # 1 + "indkassere": "inkassere", # 2 + "iota": "jota", # 1 + "jobskift": "jobskifte", # 1 + "jogurt": "yoghurt", # 1 + "jukeboks": "jukebox", # 1 + "justitsministerium": "justitsministerie", # 1 + "kalorifere": "kalorifer", # 1 + "kandidatstipendium": "kandidatstipendie", # 1 + "kannevas": "kanvas", # 1 + "kaperssauce": "kaperssovs", # 1 + "kigge": "kikke", # 2 + "kirkeministerium": "kirkeministerie", # 1 + "klapmydse": "klapmyds", # 1 + "klimakterium": "klimakterie", # 1 + "klogeligt": "klogelig", # 1 + "knivblad": "knivsblad", # 1 + "kollegaer": "kolleger", # 2 + "kollegium": "kollegie", # 1 + "kollegiehefte": "kollegiehæfte", # 1 + "kollokviumx": "kollokvium", # 1 + "kommissorium": "kommissorie", # 1 + "kompendium": "kompendie", # 1 + "komplicerthed": "komplicerethed", # 1 + "konfederation": "konføderation", # 1 + "konfedereret": "konfødereret", # 1 + "konferensstudium": "konferensstudie", # 1 + "konservatorium": "konservatorie", # 1 + "konsulere": "konsultere", # 1 + "kradsbørstig": "krasbørstig", # 2 + "kravsspecifikation": "kravspecifikation", # 1 + "krematorium": "krematorie", # 1 + "krep": "crepe", # 1 + "krepnylon": "crepenylon", # 1 + "kreppapir": 
"crepepapir", # 1 + "kricket": "cricket", # 2 + "kriterium": "kriterie", # 1 + "kroat": "kroater", # 2 + "kroki": "croquis", # 1 + "kronprinsepar": "kronprinspar", # 2 + "kropdoven": "kropsdoven", # 1 + "kroplus": "kropslus", # 1 + "krøllefedt": "krølfedt", # 1 + "kulturministerium": "kulturministerie", # 1 + "kuponhefte": "kuponhæfte", # 1 + "kvota": "kvote", # 1 + "kvotaordning": "kvoteordning", # 1 + "laboratorium": "laboratorie", # 1 + "laksfarve": "laksefarve", # 1 + "laksfarvet": "laksefarvet", # 1 + "laksrød": "lakserød", # 1 + "laksyngel": "lakseyngel", # 1 + "laksørred": "lakseørred", # 1 + "landbrugsministerium": "landbrugsministerie", # 1 + "landskampstemning": "landskampsstemning", # 1 + "langust": "languster", # 1 + "lappegrejer": "lappegrej", # 1 + "lavløn": "lavtløn", # 1 + "lillebroder": "lillebror", # 1 + "linear": "lineær", # 1 + "loftlampe": "loftslampe", # 2 + "log-in": "login", # 1 + "login": "log-in", # 2 + "lovmedholdig": "lovmedholdelig", # 1 + "ludder": "luder", # 2 + "lysholder": "lyseholder", # 1 + "lægeskifte": "lægeskift", # 1 + "lærvillig": "lærevillig", # 1 + "løgsauce": "løgsovs", # 1 + "madmoder": "madmor", # 1 + "majonæse": "mayonnaise", # 1 + "mareridtagtig": "mareridtsagtig", # 1 + "margen": "margin", # 2 + "martyrium": "martyrie", # 1 + "mellemstatlig": "mellemstatslig", # 1 + "menneskene": "menneskerne", # 2 + "metropolis": "metropol", # 1 + "miks": "mix", # 1 + "mikse": "mixe", # 1 + "miksepult": "mixerpult", # 1 + "mikser": "mixer", # 1 + "mikserpult": "mixerpult", # 1 + "mikslån": "mixlån", # 1 + "miksning": "mixning", # 1 + "miljøministerium": "miljøministerie", # 1 + "milliarddel": "milliardtedel", # 1 + "milliondel": "milliontedel", # 1 + "ministerium": "ministerie", # 1 + "mop": "moppe", # 1 + "moder": "mor", # 2 + "moratorium": "moratorie", # 1 + "morbroder": "morbror", # 1 + "morfader": "morfar", # 1 + "mormoder": "mormor", # 1 + "musikkonservatorium": "musikkonservatorie", # 1 + "muslingskal": "muslingeskal", # 1 + 
"mysterium": "mysterie", # 1 + "naturalieydelse": "naturalydelse", # 1 + "naturalieøkonomi": "naturaløkonomi", # 1 + "navnebroder": "navnebror", # 1 + "nerium": "nerie", # 1 + "nådeløs": "nådesløs", # 1 + "nærforestående": "nærtforestående", # 1 + "nærstående": "nærtstående", # 1 + "observatorium": "observatorie", # 1 + "oldefader": "oldefar", # 1 + "oldemoder": "oldemor", # 1 + "opgraduere": "opgradere", # 1 + "opgraduering": "opgradering", # 1 + "oratorium": "oratorie", # 1 + "overbookning": "overbooking", # 1 + "overpræsidium": "overpræsidie", # 1 + "overstatlig": "overstatslig", # 1 + "oxyd": "oxid", # 1 + "oxydere": "oxidere", # 1 + "oxydering": "oxidering", # 1 + "pakkenellike": "pakkenelliker", # 1 + "papirtynd": "papirstynd", # 1 + "pastoralseminarium": "pastoralseminarie", # 1 + "peanutsene": "peanuttene", # 2 + "penalhus": "pennalhus", # 2 + "pensakrav": "pensumkrav", # 1 + "pepperoni": "peperoni", # 1 + "peruaner": "peruvianer", # 1 + "petrole": "petrol", # 1 + "piltast": "piletast", # 1 + "piltaste": "piletast", # 1 + "planetarium": "planetarie", # 1 + "plasteret": "plastret", # 2 + "plastic": "plastik", # 2 + "play-off-kamp": "playoffkamp", # 1 + "plejefader": "plejefar", # 1 + "plejemoder": "plejemor", # 1 + "podium": "podie", # 2 + "praha": "prag", # 2 + "preciøs": "pretiøs", # 2 + "privilegium": "privilegie", # 1 + "progredere": "progrediere", # 1 + "præsidium": "præsidie", # 1 + "psykodelisk": "psykedelisk", # 1 + "pudsegrejer": "pudsegrej", # 1 + "referensgruppe": "referencegruppe", # 1 + "referensramme": "referenceramme", # 1 + "refugium": "refugie", # 1 + "registeret": "registret", # 2 + "remedium": "remedie", # 1 + "remiks": "remix", # 1 + "reservert": "reserveret", # 1 + "ressortministerium": "ressortministerie", # 1 + "ressource": "resurse", # 2 + "resætte": "resette", # 1 + "rettelig": "retteligt", # 1 + "rettetaste": "rettetast", # 1 + "returtaste": "returtast", # 1 + "risici": "risikoer", # 2 + "roll-on": "rollon", # 1 + "rollehefte": 
"rollehæfte", # 1 + "rostbøf": "roastbeef", # 1 + "rygsæksturist": "rygsækturist", # 1 + "rødstjært": "rødstjert", # 1 + "saddel": "sadel", # 2 + "samaritan": "samaritaner", # 2 + "sanatorium": "sanatorie", # 1 + "sauce": "sovs", # 1 + "scanning": "skanning", # 2 + "sceneskifte": "sceneskift", # 1 + "scilla": "skilla", # 1 + "sejflydende": "sejtflydende", # 1 + "selvstudium": "selvstudie", # 1 + "seminarium": "seminarie", # 1 + "sennepssauce": "sennepssovs ", # 1 + "servitutbeheftet": "servitutbehæftet", # 1 + "sit-in": "sitin", # 1 + "skatteministerium": "skatteministerie", # 1 + "skifer": "skiffer", # 2 + "skyldsfølelse": "skyldfølelse", # 1 + "skysauce": "skysovs", # 1 + "sladdertaske": "sladretaske", # 2 + "sladdervorn": "sladrevorn", # 2 + "slagsbroder": "slagsbror", # 1 + "slettetaste": "slettetast", # 1 + "smørsauce": "smørsovs", # 1 + "snitsel": "schnitzel", # 1 + "snobbeeffekt": "snobeffekt", # 2 + "socialministerium": "socialministerie", # 1 + "solarium": "solarie", # 1 + "soldebroder": "soldebror", # 1 + "spagetti": "spaghetti", # 1 + "spagettistrop": "spaghettistrop", # 1 + "spagettiwestern": "spaghettiwestern", # 1 + "spin-off": "spinoff", # 1 + "spinnefiskeri": "spindefiskeri", # 1 + "spolorm": "spoleorm", # 1 + "sproglaboratorium": "sproglaboratorie", # 1 + "spækbræt": "spækkebræt", # 2 + "stand-in": "standin", # 1 + "stand-up-comedy": "standupcomedy", # 1 + "stand-up-komiker": "standupkomiker", # 1 + "statsministerium": "statsministerie", # 1 + "stedbroder": "stedbror", # 1 + "stedfader": "stedfar", # 1 + "stedmoder": "stedmor", # 1 + "stilehefte": "stilehæfte", # 1 + "stipendium": "stipendie", # 1 + "stjært": "stjert", # 1 + "stjærthage": "stjerthage", # 1 + "storebroder": "storebror", # 1 + "stortå": "storetå", # 1 + "strabads": "strabadser", # 1 + "strømlinjet": "strømlinet", # 1 + "studium": "studie", # 1 + "stænkelap": "stænklap", # 1 + "sundhedsministerium": "sundhedsministerie", # 1 + "suppositorium": "suppositorie", # 1 + "svejts": 
"schweiz", # 1 + "svejtser": "schweizer", # 1 + "svejtserfranc": "schweizerfranc", # 1 + "svejtserost": "schweizerost", # 1 + "svejtsisk": "schweizisk", # 1 + "svigerfader": "svigerfar", # 1 + "svigermoder": "svigermor", # 1 + "svirebroder": "svirebror", # 1 + "symposium": "symposie", # 1 + "sælarium": "sælarie", # 1 + "søreme": "sørme", # 2 + "søterritorium": "søterritorie", # 1 + "t-bone-steak": "t-bonesteak", # 1 + "tabgivende": "tabsgivende", # 1 + "tabuere": "tabuisere", # 1 + "tabuering": "tabuisering", # 1 + "tackle": "takle", # 2 + "tackling": "takling", # 2 + "taifun": "tyfon", # 1 + "take-off": "takeoff", # 1 + "taknemlig": "taknemmelig", # 2 + "talehørelærer": "tale-høre-lærer", # 1 + "talehøreundervisning": "tale-høre-undervisning", # 1 + "tandstik": "tandstikker", # 1 + "tao": "dao", # 1 + "taoisme": "daoisme", # 1 + "taoist": "daoist", # 1 + "taoistisk": "daoistisk", # 1 + "taverne": "taverna", # 1 + "teateret": "teatret", # 2 + "tekno": "techno", # 1 + "temposkifte": "temposkift", # 1 + "terrarium": "terrarie", # 1 + "territorium": "territorie", # 1 + "tesis": "tese", # 1 + "tidsstudium": "tidsstudie", # 1 + "tipoldefader": "tipoldefar", # 1 + "tipoldemoder": "tipoldemor", # 1 + "tomatsauce": "tomatsovs", # 1 + "tonart": "toneart", # 1 + "trafikministerium": "trafikministerie", # 1 + "tredve": "tredive", # 1 + "tredver": "trediver", # 1 + "tredveårig": "trediveårig", # 1 + "tredveårs": "trediveårs", # 1 + "tredveårsfødselsdag": "trediveårsfødselsdag", # 1 + "tredvte": "tredivte", # 1 + "tredvtedel": "tredivtedel", # 1 + "troldunge": "troldeunge", # 1 + "trommestikke": "trommestik", # 1 + "trubadur": "troubadour", # 2 + "trøstepræmie": "trøstpræmie", # 2 + "tummerum": "trummerum", # 1 + "tumultuarisk": "tumultarisk", # 1 + "tunghørighed": "tunghørhed", # 1 + "tus": "tusch", # 2 + "tusind": "tusinde", # 2 + "tvillingbroder": "tvillingebror", # 1 + "tvillingbror": "tvillingebror", # 1 + "tvillingebroder": "tvillingebror", # 1 + "ubeheftet": "ubehæftet", 
# 1 + "udenrigsministerium": "udenrigsministerie", # 1 + "udhulning": "udhuling", # 1 + "udslaggivende": "udslagsgivende", # 1 + "udspekulert": "udspekuleret", # 1 + "udviklingsministerium": "udviklingsministerie", # 1 + "uforpligtigende": "uforpligtende", # 1 + "uheldvarslende": "uheldsvarslende", # 1 + "uimponerthed": "uimponerethed", # 1 + "undervisningsministerium": "undervisningsministerie", # 1 + "unægtelig": "unægteligt", # 1 + "urinale": "urinal", # 1 + "uvederheftig": "uvederhæftig", # 1 + "vabel": "vable", # 2 + "vadi": "wadi", # 1 + "vaklevorn": "vakkelvorn", # 1 + "vanadin": "vanadium", # 1 + "vaselin": "vaseline", # 1 + "vederheftig": "vederhæftig", # 1 + "vedhefte": "vedhæfte", # 1 + "velar": "velær", # 1 + "videndeling": "vidensdeling", # 2 + "vinkelanførelsestegn": "vinkelanførselstegn", # 1 + "vipstjært": "vipstjert", # 1 + "vismut": "bismut", # 1 + "visvas": "vissevasse", # 1 + "voksværk": "vokseværk", # 1 + "værtdyr": "værtsdyr", # 1 + "værtplante": "værtsplante", # 1 + "wienersnitsel": "wienerschnitzel", # 1 + "yderliggående": "yderligtgående", # 2 + "zombi": "zombie", # 1 + "ægbakke": "æggebakke", # 1 + "ægformet": "æggeformet", # 1 + "ægleder": "æggeleder", # 1 + "ækvilibrist": "ekvilibrist", # 2 + "æselsøre": "æseløre", # 1 + "øjehule": "øjenhule", # 1 + "øjelåg": "øjenlåg", # 1 + "øjeåbner": "øjenåbner", # 1 + "økonomiministerium": "økonomiministerie", # 1 + "ørenring": "ørering", # 2 + "øvehefte": "øvehæfte" # 1 +} + + +NORM_EXCEPTIONS = {} + +for string, norm in _exc.items(): + NORM_EXCEPTIONS[string] = norm + NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index c67c038bf..584ccf6f9 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -1,32 +1,134 @@ # encoding: utf8 +""" +Tokenizer Exceptions. +Source: https://forkortelse.dk/ and various others. 
+""" + from __future__ import unicode_literals -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT _exc = {} +# Abbreviations for weekdays "søn." (for "søndag") as well as "Tor." and "Tors." +# (for "torsdag") are left out because they are ambiguous. The same is the case +# for abbreviations "jul." and "Jul." ("juli"). for exc_data in [ - {ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, - {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"}, - {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"}, - {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"}, - {ORTH: "Apr.", LEMMA: "april", NORM: "april"}, - {ORTH: "Maj.", LEMMA: "maj", NORM: "maj"}, - {ORTH: "Jun.", LEMMA: "juni", NORM: "juni"}, - {ORTH: "Jul.", LEMMA: "juli", NORM: "juli"}, - {ORTH: "Aug.", LEMMA: "august", NORM: "august"}, - {ORTH: "Sep.", LEMMA: "september", NORM: "september"}, - {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"}, - {ORTH: "Nov.", LEMMA: "november", NORM: "november"}, - {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]: + {ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, + {ORTH: "jan.", LEMMA: "januar"}, + {ORTH: "febr.", LEMMA: "februar"}, + {ORTH: "feb.", LEMMA: "februar"}, + {ORTH: "mar.", LEMMA: "marts"}, + {ORTH: "apr.", LEMMA: "april"}, + {ORTH: "jun.", LEMMA: "juni"}, + {ORTH: "aug.", LEMMA: "august"}, + {ORTH: "sept.", LEMMA: "september"}, + {ORTH: "sep.", LEMMA: "september"}, + {ORTH: "okt.", LEMMA: "oktober"}, + {ORTH: "nov.", LEMMA: "november"}, + {ORTH: "dec.", LEMMA: "december"}, + {ORTH: "man.", LEMMA: "mandag"}, + {ORTH: "tirs.", LEMMA: "tirsdag"}, + {ORTH: "ons.", LEMMA: "onsdag"}, + {ORTH: "tor.", LEMMA: "torsdag"}, + {ORTH: "tors.", LEMMA: "torsdag"}, + {ORTH: "fre.", LEMMA: "fredag"}, + {ORTH: "lør.", LEMMA: "lørdag"}, + {ORTH: "Jan.", LEMMA: "januar"}, + {ORTH: "Febr.", LEMMA: "februar"}, + {ORTH: "Feb.", LEMMA: "februar"}, + {ORTH: "Mar.", LEMMA: "marts"}, + {ORTH: "Apr.", LEMMA: "april"}, + {ORTH: "Jun.", LEMMA: "juni"}, 
+ {ORTH: "Aug.", LEMMA: "august"}, + {ORTH: "Sept.", LEMMA: "september"}, + {ORTH: "Sep.", LEMMA: "september"}, + {ORTH: "Okt.", LEMMA: "oktober"}, + {ORTH: "Nov.", LEMMA: "november"}, + {ORTH: "Dec.", LEMMA: "december"}, + {ORTH: "Man.", LEMMA: "mandag"}, + {ORTH: "Tirs.", LEMMA: "tirsdag"}, + {ORTH: "Ons.", LEMMA: "onsdag"}, + {ORTH: "Fre.", LEMMA: "fredag"}, + {ORTH: "Lør.", LEMMA: "lørdag"}]: _exc[exc_data[ORTH]] = [exc_data] for orth in [ - "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.", - "if.", "iflg.", "m.a.o.", "mht.", "min.", "osv.", "pga.", "resp.", "self.", - "t.o.m.", "vha.", ""]: + "A.D.", "A/S", "aarh.", "ac.", "adj.", "adr.", "adsk.", "adv.", "afb.", + "afd.", "afg.", "afk.", "afs.", "aht.", "alg.", "alk.", "alm.", "amer.", + "ang.", "ank.", "anl.", "anv.", "arb.", "arr.", "att.", "B.C.", "bd.", + "bdt.", "beg.", "begr.", "beh.", "bet.", "bev.", "bhk.", "bib.", + "bibl.", "bidr.", "bildl.", "bill.", "bio.", "biol.", "bk.", "BK.", + "bl.", "bl.a.", "borgm.", "bot.", "Boul.", "br.", "brolægn.", "bto.", + "bygn.", "ca.", "cand.", "Chr.", "d.", "d.d.", "d.m.", "d.s.", "d.s.s.", + "d.y.", "d.å.", "d.æ.", "da.", "dagl.", "dat.", "dav.", "def.", "dek.", + "dep.", "desl.", "diam.", "dir.", "disp.", "distr.", "div.", "dkr.", + "dl.", "do.", "dobb.", "Dr.", "dr.h.c", "Dronn.", "ds.", "dvs.", "e.b.", + "e.l.", "e.o.", "e.v.t.", "eftf.", "eftm.", "eg.", "egl.", "eks.", + "eksam.", "ekskl.", "eksp.", "ekspl.", "el.", "el.lign.", "emer.", + "endv.", "eng.", "enk.", "etc.", "etym.", "eur.", "evt.", "exam.", "f.", + "f.eks.", "f.m.", "f.n.", "f.o.", "f.o.m.", "f.s.v.", "f.t.", "f.v.t.", + "f.å.", "fa.", "fakt.", "fam.", "fem.", "ff.", "fg.", "fhv.", "fig.", + "filol.", "filos.", "fl.", "flg.", "fm.", "fmd.", "fol.", "forb.", + "foreg.", "foren.", "forf.", "fork.", "form.", "forr.", "fors.", + "forsk.", "forts.", "fr.", "fr.u.", "frk.", "fsva.", "fuldm.", "fung.", + "fx.", "fys.", "fær.", "g.d.", "g.m.", "gd.", "gdr.", "genuds.", "gl.", + 
"gn.", "gns.", "gr.", "grdl.", "gross.", "h.a.", "h.c.", "H.K.H.", + "H.M.", "hdl.", "henv.", "Hf.", "hhv.", "hj.hj.", "hj.spl.", "hort.", + "hosp.", "hpl.", "Hr.", "hr.", "hrs.", "hum.", "hvp.", "i/s", "I/S", + "i.e.", "ib.", "id.", "if.", "iflg.", "ifm.", "ift.", "iht.", "ill.", + "indb.", "indreg.", "inf.", "ing.", "inh.", "inj.", "inkl.", "insp.", + "instr.", "isl.", "istf.", "it.", "ital.", "iv.", "jap.", "jf.", "jfr.", + "jnr.", "j.nr.", "jr.", "jur.", "jvf.", "K.", "kap.", "kat.", "kbh.", + "kem.", "kgl.", "kl.", "kld.", "knsp.", "komm.", "kons.", "korr.", + "kp.", "Kprs.", "kr.", "kst.", "kt.", "ktr.", "kv.", "kvt.", "l.", + "L.A.", "l.c.", "lab.", "lat.", "lb.m.", "lb.nr.", "lejl.", "lgd.", + "lic.", "lign.", "lin.", "ling.merc.", "litt.", "Ll.", "loc.cit.", + "lok.", "lrs.", "ltr.", "m/s", "M/S", "m.a.o.", "m.fl.", "m.m.", "m.v.", + "m.v.h.", "Mag.", "maks.", "md.", "mdr.", "mdtl.", "mezz.", "mfl.", + "m.h.p.", "m.h.t", "mht.", "mik.", "min.", "mio.", "modt.", "Mr.", + "mrk.", "mul.", "mv.", "n.br.", "n.f.", "nat.", "nb.", "Ndr.", + "nedenst.", "nl.", "nr.", "Nr.", "nto.", "nuv.", "o/m", "o.a.", "o.fl.", + "o.h.", "o.l.", "o.lign.", "o.m.a.", "o.s.fr.", "obl.", "obs.", + "odont.", "oecon.", "off.", "ofl.", "omg.", "omkr.", "omr.", "omtr.", + "opg.", "opl.", "opr.", "org.", "orig.", "osv.", "ovenst.", "overs.", + "ovf.", "p.", "p.a.", "p.b.a", "p.b.v", "p.c.", "p.m.", "p.m.v.", + "p.n.", "p.p.", "p.p.s.", "p.s.", "p.t.", "p.v.a.", "p.v.c.", "pag.", + "par.", "Pas.", "pass.", "pcs.", "pct.", "pd.", "pens.", "pers.", + "pft.", "pg.", "pga.", "pgl.", "Ph.d.", "pinx.", "pk.", "pkt.", + "polit.", "polyt.", "pos.", "pp.", "ppm.", "pr.", "prc.", "priv.", + "prod.", "prof.", "pron.", "Prs.", "præd.", "præf.", "præt.", "psych.", + "pt.", "pæd.", "q.e.d.", "rad.", "Rcp.", "red.", "ref.", "reg.", + "regn.", "rel.", "rep.", "repr.", "resp.", "rest.", "rm.", "rtg.", + "russ.", "s.", "s.br.", "s.d.", "s.f.", "s.m.b.a.", "s.u.", "s.å.", + "sa.", "sb.", "sc.", "scient.", 
"scil.", "Sdr.", "sek.", "sekr.", + "self.", "sem.", "sen.", "shj.", "sign.", "sing.", "sj.", "skr.", + "Skt.", "slutn.", "sml.", "smp.", "sms.", "snr.", "soc.", "soc.dem.", + "sort.", "sp.", "spec.", "Spl.", "spm.", "spr.", "spsk.", "statsaut.", + "st.", "stk.", "str.", "stud.", "subj.", "subst.", "suff.", "sup.", + "suppl.", "sv.", "såk.", "sædv.", "sø.", "t/r", "t.", "t.h.", "t.o.", + "t.o.m.", "t.v.", "tab.", "tbl.", "tcp/ip", "td.", "tdl.", "tdr.", + "techn.", "tekn.", "temp.", "th.", "theol.", "ti.", "tidl.", "tilf.", + "tilh.", "till.", "tilsv.", "tjg.", "tkr.", "tlf.", "tlgr.", "to.", + "tr.", "trp.", "tsk.", "tv.", "ty.", "u/b", "udb.", "udbet.", "ugtl.", + "undt.", "v.", "v.f.", "var.", "vb.", "vedk.", "vedl.", "vedr.", + "vejl.", "Vg.", "vh.", "vha.", "vs.", "vsa.", "vær.", "zool.", "ø.lgd.", + "øv.", "øvr.", "årg.", "årh.", ""]: _exc[orth] = [{ORTH: orth}] +# Dates +for h in range(1, 31 + 1): + for period in ["."]: + _exc["%d%s" % (h, period)] = [ + {ORTH: "%d." % h}] + +_custom_base_exc = { + "i.": [ + {ORTH: "i", LEMMA: "i", NORM: "i"}, + {ORTH: ".", TAG: PUNCT}] +} +_exc.update(_custom_base_exc) + TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/nl/examples.py b/spacy/lang/nl/examples.py new file mode 100644 index 000000000..6a1290728 --- /dev/null +++ b/spacy/lang/nl/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.nl.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +examples = [ + "Apple overweegt om voor 1 miljard een U.K. 
startup te kopen", + "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten", + "San Francisco overweegt robots op voetpaden te verbieden", + "Londen is een grote stad in het Verenigd Koninkrijk" +] diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py new file mode 100644 index 000000000..898b48348 --- /dev/null +++ b/spacy/lang/ru/__init__.py @@ -0,0 +1,38 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .norm_exceptions import NORM_EXCEPTIONS +from .lex_attrs import LEX_ATTRS +from .tag_map import TAG_MAP +from .lemmatizer import RussianLemmatizer + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...util import update_exc, add_lookups +from ...language import Language +from ...attrs import LANG, LIKE_NUM, NORM + + +class RussianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: 'ru' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], + BASE_NORMS, NORM_EXCEPTIONS) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = STOP_WORDS + tag_map = TAG_MAP + + @classmethod + def create_lemmatizer(cls, nlp=None): + return RussianLemmatizer() + + +class Russian(Language): + lang = 'ru' + Defaults = RussianDefaults + + +__all__ = ['Russian'] diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py new file mode 100644 index 000000000..8ea6255d7 --- /dev/null +++ b/spacy/lang/ru/lemmatizer.py @@ -0,0 +1,237 @@ +# coding: utf8 +from ...symbols import ( + ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS +) +from ...lemmatizer import Lemmatizer + + +class RussianLemmatizer(Lemmatizer): + _morph = None + + def __init__(self): + super(RussianLemmatizer, self).__init__() 
+ try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + 'The Russian lemmatizer requires the pymorphy2 library: ' + 'try to fix it with "pip install pymorphy2==0.8"') + + if RussianLemmatizer._morph is None: + RussianLemmatizer._morph = MorphAnalyzer() + + def __call__(self, string, univ_pos, morphology=None): + univ_pos = self.normalize_univ_pos(univ_pos) + if univ_pos == 'PUNCT': + return [PUNCT_RULES.get(string, string)] + + if univ_pos not in ('ADJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PROPN', 'VERB'): + # Skip unchangeable pos + return [string.lower()] + + analyses = self._morph.parse(string) + filtered_analyses = [] + for analysis in analyses: + if not analysis.is_known: + # Skip suggested parse variant for unknown word for pymorphy + continue + analysis_pos, _ = oc2ud(str(analysis.tag)) + if analysis_pos == univ_pos \ + or (analysis_pos in ('NOUN', 'PROPN') and univ_pos in ('NOUN', 'PROPN')): + filtered_analyses.append(analysis) + + if not len(filtered_analyses): + return [string.lower()] + if morphology is None or (len(morphology) == 1 and POS in morphology): + return list(set([analysis.normal_form for analysis in filtered_analyses])) + + if univ_pos in ('ADJ', 'DET', 'NOUN', 'PROPN'): + features_to_compare = ['Case', 'Number', 'Gender'] + elif univ_pos == 'NUM': + features_to_compare = ['Case', 'Gender'] + elif univ_pos == 'PRON': + features_to_compare = ['Case', 'Number', 'Gender', 'Person'] + else: # VERB + features_to_compare = ['Aspect', 'Gender', 'Mood', 'Number', 'Tense', 'VerbForm', 'Voice'] + + analyses, filtered_analyses = filtered_analyses, [] + for analysis in analyses: + _, analysis_morph = oc2ud(str(analysis.tag)) + for feature in features_to_compare: + if (feature in morphology and feature in analysis_morph + and morphology[feature] != analysis_morph[feature]): + break + else: + filtered_analyses.append(analysis) + + if not len(filtered_analyses): + return [string.lower()] + return list(set([analysis.normal_form 
for analysis in filtered_analyses])) + + @staticmethod + def normalize_univ_pos(univ_pos): + if isinstance(univ_pos, str): + return univ_pos.upper() + + symbols_to_str = { + ADJ: 'ADJ', + DET: 'DET', + NOUN: 'NOUN', + NUM: 'NUM', + PRON: 'PRON', + PROPN: 'PROPN', + PUNCT: 'PUNCT', + VERB: 'VERB' + } + if univ_pos in symbols_to_str: + return symbols_to_str[univ_pos] + return None + + def is_base_form(self, univ_pos, morphology=None): + # TODO + raise NotImplementedError + + def det(self, string, morphology=None): + return self(string, 'det', morphology) + + def num(self, string, morphology=None): + return self(string, 'num', morphology) + + def pron(self, string, morphology=None): + return self(string, 'pron', morphology) + + def lookup(self, string): + analyses = self._morph.parse(string) + if len(analyses) == 1: + return analyses[0].normal_form + return string + + +def oc2ud(oc_tag): + gram_map = { + '_POS': { + 'ADJF': 'ADJ', + 'ADJS': 'ADJ', + 'ADVB': 'ADV', + 'Apro': 'DET', + 'COMP': 'ADJ', # Can also be an ADV - unchangeable + 'CONJ': 'CCONJ', # Can also be a SCONJ - both unchangeable ones + 'GRND': 'VERB', + 'INFN': 'VERB', + 'INTJ': 'INTJ', + 'NOUN': 'NOUN', + 'NPRO': 'PRON', + 'NUMR': 'NUM', + 'NUMB': 'NUM', + 'PNCT': 'PUNCT', + 'PRCL': 'PART', + 'PREP': 'ADP', + 'PRTF': 'VERB', + 'PRTS': 'VERB', + 'VERB': 'VERB', + }, + 'Animacy': { + 'anim': 'Anim', + 'inan': 'Inan', + }, + 'Aspect': { + 'impf': 'Imp', + 'perf': 'Perf', + }, + 'Case': { + 'ablt': 'Ins', + 'accs': 'Acc', + 'datv': 'Dat', + 'gen1': 'Gen', + 'gen2': 'Gen', + 'gent': 'Gen', + 'loc2': 'Loc', + 'loct': 'Loc', + 'nomn': 'Nom', + 'voct': 'Voc', + }, + 'Degree': { + 'COMP': 'Cmp', + 'Supr': 'Sup', + }, + 'Gender': { + 'femn': 'Fem', + 'masc': 'Masc', + 'neut': 'Neut', + }, + 'Mood': { + 'impr': 'Imp', + 'indc': 'Ind', + }, + 'Number': { + 'plur': 'Plur', + 'sing': 'Sing', + }, + 'NumForm': { + 'NUMB': 'Digit', + }, + 'Person': { + '1per': '1', + '2per': '2', + '3per': '3', + 'excl': '2', + 'incl': 
'1', + }, + 'Tense': { + 'futr': 'Fut', + 'past': 'Past', + 'pres': 'Pres', + }, + 'Variant': { + 'ADJS': 'Brev', + 'PRTS': 'Brev', + }, + 'VerbForm': { + 'GRND': 'Conv', + 'INFN': 'Inf', + 'PRTF': 'Part', + 'PRTS': 'Part', + 'VERB': 'Fin', + }, + 'Voice': { + 'actv': 'Act', + 'pssv': 'Pass', + }, + 'Abbr': { + 'Abbr': 'Yes' + } + } + + pos = 'X' + morphology = dict() + unmatched = set() + + grams = oc_tag.replace(' ', ',').split(',') + for gram in grams: + match = False + for categ, gmap in sorted(gram_map.items()): + if gram in gmap: + match = True + if categ == '_POS': + pos = gmap[gram] + else: + morphology[categ] = gmap[gram] + if not match: + unmatched.add(gram) + + while len(unmatched) > 0: + gram = unmatched.pop() + if gram in ('Name', 'Patr', 'Surn', 'Geox', 'Orgn'): + pos = 'PROPN' + elif gram == 'Auxt': + pos = 'AUX' + elif gram == 'Pltm': + morphology['Number'] = 'Ptan' + + return pos, morphology + + +PUNCT_RULES = { + "«": "\"", + "»": "\"" +} diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py new file mode 100644 index 000000000..e44525743 --- /dev/null +++ b/spacy/lang/ru/lex_attrs.py @@ -0,0 +1,35 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + + +_num_words = [ + 'ноль', 'один', 'два', 'три', 'четыре', 'пять', 'шесть', 'семь', 'восемь', 'девять', + + 'десять', 'одиннадцать', 'двенадцать', 'тринадцать', 'четырнадцать', + 'пятнадцать', 'шестнадцать', 'семнадцать', 'восемнадцать', 'девятнадцать', + + 'двадцать', 'тридцать', 'сорок', 'пятьдесят', 'шестьдесят', 'семьдесят', 'восемьдесят', 'девяносто', + + 'сто', 'двести', 'триста', 'четыреста', 'пятьсот', 'шестьсот', 'семьсот', 'восемьсот', 'девятьсот', + + 'тысяча', 'миллион', 'миллиард', 'триллион', 'квадриллион', 'квинтиллион'] + + +def like_num(text): + text = text.replace(',', '').replace('.', '') + if text.isdigit(): + return True + if text.count('/') == 1: + num, denom = text.split('/') + if num.isdigit() and denom.isdigit(): + 
return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = { + LIKE_NUM: like_num +} diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py new file mode 100644 index 000000000..53f48a8d8 --- /dev/null +++ b/spacy/lang/ru/norm_exceptions.py @@ -0,0 +1,24 @@ +# coding: utf8 +from __future__ import unicode_literals + + +_exc = { + # Slang + 'прив': 'привет', + 'ща': 'сейчас', + 'спс': 'спасибо', + 'пжлст': 'пожалуйста', + 'плиз': 'пожалуйста', + 'лан': 'ладно', + 'ясн': 'ясно', + 'всм': 'всмысле', + 'хош': 'хочешь', + 'оч': 'очень' +} + + +NORM_EXCEPTIONS = {} + +for string, norm in _exc.items(): + NORM_EXCEPTIONS[string] = norm + NORM_EXCEPTIONS[string.title()] = norm diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py new file mode 100644 index 000000000..ddb28af86 --- /dev/null +++ b/spacy/lang/ru/stop_words.py @@ -0,0 +1,54 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +а + +будем будет будете будешь буду будут будучи будь будьте бы был была были было +быть + +в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею +всея всю вся вы + +да для до + +его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею + +же + +за + +и из или им ими имъ их + +к как кем ко когда кого ком кому комья которая которого которое которой котором +которому которою которую которые который которым которыми которых кто + +меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего +моей моем моём моему моею можем может можете можешь мои мой моим моими моих +мочь мою моя мы + +на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим +нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но + +о об один одна одни одним одними одних одно одного одной одном одному одною +одну он она оне они оно от + +по при + +с сам сама сами самим самими самих само самого самом самому саму свое своё +своего 
своей своем своём своему своею свои свой своим своими своих свою своя +себе себя собой собою + +та так такая такие таким такими таких такого такое такой таком такому такою +такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому +тот тою ту ты + +у уже + +чего чем чём чему что чтобы + +эта эти этим этими этих это этого этой этом этому этот этою эту + +я +""".split()) \ No newline at end of file diff --git a/spacy/lang/ru/tag_map.py b/spacy/lang/ru/tag_map.py new file mode 100644 index 000000000..1369a9dbf --- /dev/null +++ b/spacy/lang/ru/tag_map.py @@ -0,0 +1,731 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ( + POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ, CCONJ +) + +TAG_MAP = { + 'ADJ__Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing': {POS: ADJ, 'Animacy': 'Anim', 'Case': 'Acc', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Animacy=Anim|Case=Acc|Degree=Pos|Number=Plur': {POS: ADJ, 'Animacy': 'Anim', 'Case': 'Acc', 'Degree': 'Pos', 'Number': 'Plur'}, + 'ADJ__Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing': {POS: ADJ, 'Animacy': 'Anim', 'Case': 'Acc', 'Degree': 'Sup', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Animacy=Anim|Case=Nom|Degree=Pos|Number=Plur': {POS: ADJ, 'Animacy': 'Anim', 'Case': 'Nom', 'Degree': 'Pos', 'Number': 'Plur'}, + 'ADJ__Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing': {POS: ADJ, 'Animacy': 'Inan', 'Case': 'Acc', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Animacy=Inan|Case=Acc|Degree=Pos|Gender=Neut|Number=Sing': {POS: ADJ, 'Animacy': 'Inan', 'Case': 'Acc', 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Animacy=Inan|Case=Acc|Degree=Pos|Number=Plur': {POS: ADJ, 'Animacy': 'Inan', 'Case': 'Acc', 'Degree': 'Pos', 'Number': 'Plur'}, + 'ADJ__Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing': {POS: ADJ, 'Animacy': 'Inan', 'Case': 'Acc', 
'Degree': 'Sup', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Animacy=Inan|Case=Acc|Degree=Sup|Number=Plur': {POS: ADJ, 'Animacy': 'Inan', 'Case': 'Acc', 'Degree': 'Sup', 'Number': 'Plur'}, + 'ADJ__Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing': {POS: ADJ, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Animacy=Inan|Case=Nom|Degree=Pos|Gender=Fem|Number=Sing': {POS: ADJ, 'Animacy': 'Inan', 'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Acc|Degree=Pos|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Acc', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Acc|Degree=Pos|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Acc', 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Acc|Degree=Sup|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Acc', 'Degree': 'Sup', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Acc|Degree=Sup|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Acc', 'Degree': 'Sup', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Dat|Degree=Pos|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Dat', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Dat|Degree=Pos|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Dat', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Dat|Degree=Pos|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Dat', 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Dat|Degree=Pos|Number=Plur': {POS: ADJ, 'Case': 'Dat', 'Degree': 'Pos', 'Number': 'Plur'}, + 'ADJ__Case=Dat|Degree=Sup|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Dat', 'Degree': 'Sup', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Dat|Degree=Sup|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Dat', 'Degree': 'Sup', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Dat|Degree=Sup|Number=Plur': {POS: ADJ, 'Case': 'Dat', 'Degree': 'Sup', 'Number': 'Plur'}, + 'ADJ__Case=Gen|Degree=Pos|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Gen', 'Degree': 'Pos', 
'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|Variant=Short': {POS: ADJ, 'Case': 'Gen', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing', 'Variant': 'Short'}, + 'ADJ__Case=Gen|Degree=Pos|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Gen', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Gen|Degree=Pos|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Gen', 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Gen|Degree=Pos|Number=Plur': {POS: ADJ, 'Case': 'Gen', 'Degree': 'Pos', 'Number': 'Plur'}, + 'ADJ__Case=Gen|Degree=Sup|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Gen', 'Degree': 'Sup', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Gen|Degree=Sup|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Gen', 'Degree': 'Sup', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Gen|Degree=Sup|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Gen', 'Degree': 'Sup', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Gen|Degree=Sup|Number=Plur': {POS: ADJ, 'Case': 'Gen', 'Degree': 'Sup', 'Number': 'Plur'}, + 'ADJ__Case=Ins|Degree=Pos|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Ins', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Ins|Degree=Pos|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Ins', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Ins|Degree=Pos|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Ins', 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Ins|Degree=Pos|Number=Plur': {POS: ADJ, 'Case': 'Ins', 'Degree': 'Pos', 'Number': 'Plur'}, + 'ADJ__Case=Ins|Degree=Sup|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Ins', 'Degree': 'Sup', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Ins|Degree=Sup|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Ins', 'Degree': 'Sup', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Ins|Degree=Sup|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Ins', 'Degree': 'Sup', 'Gender': 'Neut', 'Number': 'Sing'}, + 
'ADJ__Case=Ins|Degree=Sup|Number=Plur': {POS: ADJ, 'Case': 'Ins', 'Degree': 'Sup', 'Number': 'Plur'}, + 'ADJ__Case=Loc|Degree=Pos|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Loc', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Loc|Degree=Pos|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Loc', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Loc|Degree=Pos|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Loc', 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Loc|Degree=Pos|Number=Plur': {POS: ADJ, 'Case': 'Loc', 'Degree': 'Pos', 'Number': 'Plur'}, + 'ADJ__Case=Loc|Degree=Sup|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Loc', 'Degree': 'Sup', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Loc|Degree=Sup|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Loc', 'Degree': 'Sup', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Loc|Degree=Sup|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Loc', 'Degree': 'Sup', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Loc|Degree=Sup|Number=Plur': {POS: ADJ, 'Case': 'Loc', 'Degree': 'Sup', 'Number': 'Plur'}, + 'ADJ__Case=Nom|Degree=Pos|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Nom|Degree=Pos|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Nom|Degree=Pos|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing'}, + 'ADJ__Case=Nom|Degree=Pos|Number=Plur': {POS: ADJ, 'Case': 'Nom', 'Degree': 'Pos', 'Number': 'Plur'}, + 'ADJ__Case=Nom|Degree=Sup|Gender=Fem|Number=Sing': {POS: ADJ, 'Case': 'Nom', 'Degree': 'Sup', 'Gender': 'Fem', 'Number': 'Sing'}, + 'ADJ__Case=Nom|Degree=Sup|Gender=Masc|Number=Sing': {POS: ADJ, 'Case': 'Nom', 'Degree': 'Sup', 'Gender': 'Masc', 'Number': 'Sing'}, + 'ADJ__Case=Nom|Degree=Sup|Gender=Neut|Number=Sing': {POS: ADJ, 'Case': 'Nom', 'Degree': 'Sup', 'Gender': 'Neut', 
'Number': 'Sing'}, + 'ADJ__Case=Nom|Degree=Sup|Number=Plur': {POS: ADJ, 'Case': 'Nom', 'Degree': 'Sup', 'Number': 'Plur'}, + 'ADJ__Degree=Cmp': {POS: ADJ, 'Degree': 'Cmp'}, + 'ADJ__Degree=Pos': {POS: ADJ, 'Degree': 'Pos'}, + 'ADJ__Degree=Pos|Gender=Fem|Number=Sing|Variant=Short': {POS: ADJ, 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing', 'Variant': 'Short'}, + 'ADJ__Degree=Pos|Gender=Masc|Number=Sing|Variant=Short': {POS: ADJ, 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing', 'Variant': 'Short'}, + 'ADJ__Degree=Pos|Gender=Neut|Number=Sing|Variant=Short': {POS: ADJ, 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing', 'Variant': 'Short'}, + 'ADJ__Degree=Pos|Number=Plur|Variant=Short': {POS: ADJ, 'Degree': 'Pos', 'Number': 'Plur', 'Variant': 'Short'}, + 'ADJ__Foreign=Yes': {POS: ADJ, 'Foreign': 'Yes'}, + 'ADJ___': {POS: ADJ}, + 'ADP___': {POS: ADP}, + 'ADV__Degree=Cmp': {POS: ADV, 'Degree': 'Cmp'}, + 'ADV__Degree=Pos': {POS: ADV, 'Degree': 'Pos'}, + 'ADV__Polarity=Neg': {POS: ADV, 'Polarity': 'Neg'}, + 'AUX__Aspect=Imp|Case=Loc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Case=Nom|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Gender': 'Fem', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 
'Gender': 'Masc', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Gender=Neut|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Gender': 'Neut', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Mood=Imp|Number=Plur|Person=2|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Mood': 'Imp', 'Number': 'Plur', 'Person': '2', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Mood=Imp|Number=Sing|Person=2|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Mood': 'Imp', 'Number': 'Sing', 'Person': '2', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '2', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '2', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 
'AUX__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|Tense=Pres|VerbForm=Conv|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'Tense': 'Pres', 'VerbForm': 'Conv', 'Voice': 'Act'}, + 'AUX__Aspect=Imp|VerbForm=Inf|Voice=Act': {POS: AUX, 'Aspect': 'Imp', 'VerbForm': 'Inf', 'Voice': 'Act'}, + 'CCONJ___': {POS: CCONJ}, + 'DET__Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing': {POS: DET, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'DET__Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing': {POS: DET, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'DET__Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing': {POS: DET, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing'}, + 'DET__Animacy=Inan|Case=Gen|Number=Plur': {POS: DET, 'Animacy': 'Inan', 'Case': 'Gen', 'Number': 'Plur'}, + 'DET__Case=Acc|Degree=Pos|Number=Plur': {POS: DET, 'Case': 'Acc', 'Degree': 'Pos', 'Number': 'Plur'}, + 'DET__Case=Acc|Gender=Fem|Number=Sing': {POS: DET, 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'DET__Case=Acc|Gender=Masc|Number=Sing': {POS: DET, 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'DET__Case=Acc|Gender=Neut|Number=Sing': {POS: DET, 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'DET__Case=Acc|Number=Plur': {POS: DET, 'Case': 'Acc', 'Number': 'Plur'}, + 'DET__Case=Dat|Gender=Fem|Number=Sing': {POS: DET, 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing'}, + 'DET__Case=Dat|Gender=Masc|Number=Plur': {POS: DET, 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Plur'}, + 'DET__Case=Dat|Gender=Masc|Number=Sing': {POS: DET, 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing'}, + 'DET__Case=Dat|Gender=Neut|Number=Sing': {POS: DET, 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing'}, + 'DET__Case=Dat|Number=Plur': {POS: DET, 'Case': 'Dat', 'Number': 'Plur'}, 
+ 'DET__Case=Gen|Gender=Fem|Number=Sing': {POS: DET, 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing'}, + 'DET__Case=Gen|Gender=Masc|Number=Sing': {POS: DET, 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}, + 'DET__Case=Gen|Gender=Neut|Number=Sing': {POS: DET, 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing'}, + 'DET__Case=Gen|Number=Plur': {POS: DET, 'Case': 'Gen', 'Number': 'Plur'}, + 'DET__Case=Ins|Gender=Fem|Number=Sing': {POS: DET, 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing'}, + 'DET__Case=Ins|Gender=Masc|Number=Sing': {POS: DET, 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing'}, + 'DET__Case=Ins|Gender=Neut|Number=Sing': {POS: DET, 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing'}, + 'DET__Case=Ins|Number=Plur': {POS: DET, 'Case': 'Ins', 'Number': 'Plur'}, + 'DET__Case=Loc|Gender=Fem|Number=Sing': {POS: DET, 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'DET__Case=Loc|Gender=Masc|Number=Sing': {POS: DET, 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'DET__Case=Loc|Gender=Neut|Number=Sing': {POS: DET, 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'DET__Case=Loc|Number=Plur': {POS: DET, 'Case': 'Loc', 'Number': 'Plur'}, + 'DET__Case=Nom|Gender=Fem|Number=Sing': {POS: DET, 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing'}, + 'DET__Case=Nom|Gender=Masc|Number=Plur': {POS: DET, 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Plur'}, + 'DET__Case=Nom|Gender=Masc|Number=Sing': {POS: DET, 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}, + 'DET__Case=Nom|Gender=Neut|Number=Sing': {POS: DET, 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing'}, + 'DET__Case=Nom|Number=Plur': {POS: DET, 'Case': 'Nom', 'Number': 'Plur'}, + 'DET__Gender=Masc|Number=Sing': {POS: DET, 'Gender': 'Masc', 'Number': 'Sing'}, + 'INTJ___': {POS: INTJ}, + 'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 
'Anim', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Acc|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Acc|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Acc|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Acc|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Acc|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Acc', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Dat|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Dat|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Dat|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Dat|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Dat|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Dat|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Dat|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Dat', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Gen|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Gen|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing'}, + 
'NOUN__Animacy=Anim|Case=Gen|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Gen|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Gen|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Gen|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Gen', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Ins|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Ins|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Ins|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Ins|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Ins|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Ins|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Ins|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Ins', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Loc|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Loc|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Loc|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Loc', 
'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Loc|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Loc|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Loc|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Loc|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Loc', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Nom|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Nom|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Anim|Case=Nom|Number=Plur': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Nom', 'Number': 'Plur'}, + 'NOUN__Animacy=Anim|Case=Voc|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Anim', 'Case': 'Voc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Acc|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Acc|Gender=Masc|Number=Plur': {POS: NOUN, 
'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Acc|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Acc|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Acc', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Dat|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Dat|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Dat|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Dat|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Dat|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Dat|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Dat|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Dat', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Gen|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Plur'}, + 
'NOUN__Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Gen|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Gen|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Gen', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Ins|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Ins|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Ins|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Ins|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Ins|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Ins|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Ins', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Loc|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Loc|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Loc', 
'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Loc|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Loc|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Loc', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Nom|Gender=Fem|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Nom|Gender=Masc|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Nom|Gender=Neut|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Case=Nom|Number=Plur': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Nom', 'Number': 'Plur'}, + 'NOUN__Animacy=Inan|Case=Par|Gender=Masc|Number=Sing': {POS: NOUN, 'Animacy': 'Inan', 'Case': 'Par', 'Gender': 'Masc', 'Number': 'Sing'}, + 'NOUN__Animacy=Inan|Gender=Fem': {POS: NOUN, 'Animacy': 'Inan', 'Gender': 'Fem'}, + 'NOUN__Animacy=Inan|Gender=Masc': {POS: NOUN, 'Animacy': 'Inan', 'Gender': 'Masc'}, + 'NOUN__Animacy=Inan|Gender=Neut': {POS: NOUN, 'Animacy': 'Inan', 'Gender': 'Neut'}, + 'NOUN__Case=Gen|Degree=Pos|Gender=Fem|Number=Sing': {POS: NOUN, 'Case': 'Gen', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'NOUN__Foreign=Yes': {POS: NOUN, 'Foreign': 'Yes'}, + 'NOUN___': {POS: NOUN}, + 
'NUM__Animacy=Anim|Case=Acc': {POS: NUM, 'Animacy': 'Anim', 'Case': 'Acc'}, + 'NUM__Animacy=Anim|Case=Acc|Gender=Fem': {POS: NUM, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Fem'}, + 'NUM__Animacy=Anim|Case=Acc|Gender=Masc': {POS: NUM, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Masc'}, + 'NUM__Animacy=Inan|Case=Acc': {POS: NUM, 'Animacy': 'Inan', 'Case': 'Acc'}, + 'NUM__Animacy=Inan|Case=Acc|Gender=Fem': {POS: NUM, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem'}, + 'NUM__Animacy=Inan|Case=Acc|Gender=Masc': {POS: NUM, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Masc'}, + 'NUM__Case=Acc': {POS: NUM, 'Case': 'Acc'}, + 'NUM__Case=Acc|Gender=Fem': {POS: NUM, 'Case': 'Acc', 'Gender': 'Fem'}, + 'NUM__Case=Acc|Gender=Masc': {POS: NUM, 'Case': 'Acc', 'Gender': 'Masc'}, + 'NUM__Case=Acc|Gender=Neut': {POS: NUM, 'Case': 'Acc', 'Gender': 'Neut'}, + 'NUM__Case=Dat': {POS: NUM, 'Case': 'Dat'}, + 'NUM__Case=Dat|Gender=Fem': {POS: NUM, 'Case': 'Dat', 'Gender': 'Fem'}, + 'NUM__Case=Dat|Gender=Masc': {POS: NUM, 'Case': 'Dat', 'Gender': 'Masc'}, + 'NUM__Case=Dat|Gender=Neut': {POS: NUM, 'Case': 'Dat', 'Gender': 'Neut'}, + 'NUM__Case=Gen': {POS: NUM, 'Case': 'Gen'}, + 'NUM__Case=Gen|Gender=Fem': {POS: NUM, 'Case': 'Gen', 'Gender': 'Fem'}, + 'NUM__Case=Gen|Gender=Masc': {POS: NUM, 'Case': 'Gen', 'Gender': 'Masc'}, + 'NUM__Case=Gen|Gender=Neut': {POS: NUM, 'Case': 'Gen', 'Gender': 'Neut'}, + 'NUM__Case=Ins': {POS: NUM, 'Case': 'Ins'}, + 'NUM__Case=Ins|Gender=Fem': {POS: NUM, 'Case': 'Ins', 'Gender': 'Fem'}, + 'NUM__Case=Ins|Gender=Masc': {POS: NUM, 'Case': 'Ins', 'Gender': 'Masc'}, + 'NUM__Case=Ins|Gender=Neut': {POS: NUM, 'Case': 'Ins', 'Gender': 'Neut'}, + 'NUM__Case=Loc': {POS: NUM, 'Case': 'Loc'}, + 'NUM__Case=Loc|Gender=Fem': {POS: NUM, 'Case': 'Loc', 'Gender': 'Fem'}, + 'NUM__Case=Loc|Gender=Masc': {POS: NUM, 'Case': 'Loc', 'Gender': 'Masc'}, + 'NUM__Case=Loc|Gender=Neut': {POS: NUM, 'Case': 'Loc', 'Gender': 'Neut'}, + 'NUM__Case=Nom': {POS: NUM, 'Case': 'Nom'}, + 
'NUM__Case=Nom|Gender=Fem': {POS: NUM, 'Case': 'Nom', 'Gender': 'Fem'}, + 'NUM__Case=Nom|Gender=Masc': {POS: NUM, 'Case': 'Nom', 'Gender': 'Masc'}, + 'NUM__Case=Nom|Gender=Neut': {POS: NUM, 'Case': 'Nom', 'Gender': 'Neut'}, + 'NUM___': {POS: NUM}, + 'PART__Mood=Cnd': {POS: PART, 'Mood': 'Cnd'}, + 'PART__Polarity=Neg': {POS: PART, 'Polarity': 'Neg'}, + 'PART___': {POS: PART}, + 'PRON__Animacy=Anim|Case=Acc|Gender=Masc|Number=Plur': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PRON__Animacy=Anim|Case=Acc|Number=Plur': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Acc', 'Number': 'Plur'}, + 'PRON__Animacy=Anim|Case=Dat|Gender=Masc|Number=Sing': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PRON__Animacy=Anim|Case=Dat|Number=Plur': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Dat', 'Number': 'Plur'}, + 'PRON__Animacy=Anim|Case=Gen|Number=Plur': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Gen', 'Number': 'Plur'}, + 'PRON__Animacy=Anim|Case=Ins|Gender=Masc|Number=Sing': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PRON__Animacy=Anim|Case=Ins|Number=Plur': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Ins', 'Number': 'Plur'}, + 'PRON__Animacy=Anim|Case=Loc|Number=Plur': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Loc', 'Number': 'Plur'}, + 'PRON__Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PRON__Animacy=Anim|Case=Nom|Number=Plur': {POS: PRON, 'Animacy': 'Anim', 'Case': 'Nom', 'Number': 'Plur'}, + 'PRON__Animacy=Anim|Gender=Masc|Number=Plur': {POS: PRON, 'Animacy': 'Anim', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PRON__Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PRON__Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing'}, + 
'PRON__Animacy=Inan|Case=Dat|Gender=Neut|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PRON__Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PRON__Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PRON__Animacy=Inan|Case=Ins|Gender=Fem|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PRON__Animacy=Inan|Case=Ins|Gender=Neut|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PRON__Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PRON__Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PRON__Animacy=Inan|Gender=Neut|Number=Sing': {POS: PRON, 'Animacy': 'Inan', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PRON__Case=Acc': {POS: PRON, 'Case': 'Acc'}, + 'PRON__Case=Acc|Gender=Fem|Number=Sing|Person=3': {POS: PRON, 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Acc|Gender=Masc|Number=Sing|Person=3': {POS: PRON, 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Acc|Gender=Neut|Number=Sing|Person=3': {POS: PRON, 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Acc|Number=Plur|Person=1': {POS: PRON, 'Case': 'Acc', 'Number': 'Plur', 'Person': '1'}, + 'PRON__Case=Acc|Number=Plur|Person=2': {POS: PRON, 'Case': 'Acc', 'Number': 'Plur', 'Person': '2'}, + 'PRON__Case=Acc|Number=Plur|Person=3': {POS: PRON, 'Case': 'Acc', 'Number': 'Plur', 'Person': '3'}, + 'PRON__Case=Acc|Number=Sing|Person=1': {POS: PRON, 'Case': 'Acc', 'Number': 'Sing', 'Person': '1'}, + 'PRON__Case=Acc|Number=Sing|Person=2': {POS: PRON, 
'Case': 'Acc', 'Number': 'Sing', 'Person': '2'}, + 'PRON__Case=Dat': {POS: PRON, 'Case': 'Dat'}, + 'PRON__Case=Dat|Gender=Fem|Number=Sing|Person=3': {POS: PRON, 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Dat|Gender=Masc|Number=Sing|Person=3': {POS: PRON, 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Dat|Gender=Neut|Number=Sing|Person=3': {POS: PRON, 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Dat|Number=Plur|Person=1': {POS: PRON, 'Case': 'Dat', 'Number': 'Plur', 'Person': '1'}, + 'PRON__Case=Dat|Number=Plur|Person=2': {POS: PRON, 'Case': 'Dat', 'Number': 'Plur', 'Person': '2'}, + 'PRON__Case=Dat|Number=Plur|Person=3': {POS: PRON, 'Case': 'Dat', 'Number': 'Plur', 'Person': '3'}, + 'PRON__Case=Dat|Number=Sing|Person=1': {POS: PRON, 'Case': 'Dat', 'Number': 'Sing', 'Person': '1'}, + 'PRON__Case=Dat|Number=Sing|Person=2': {POS: PRON, 'Case': 'Dat', 'Number': 'Sing', 'Person': '2'}, + 'PRON__Case=Gen': {POS: PRON, 'Case': 'Gen'}, + 'PRON__Case=Gen|Gender=Fem|Number=Sing|Person=3': {POS: PRON, 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Gen|Gender=Masc|Number=Sing|Person=3': {POS: PRON, 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Gen|Gender=Neut|Number=Sing|Person=3': {POS: PRON, 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Gen|Number=Plur|Person=1': {POS: PRON, 'Case': 'Gen', 'Number': 'Plur', 'Person': '1'}, + 'PRON__Case=Gen|Number=Plur|Person=2': {POS: PRON, 'Case': 'Gen', 'Number': 'Plur', 'Person': '2'}, + 'PRON__Case=Gen|Number=Plur|Person=3': {POS: PRON, 'Case': 'Gen', 'Number': 'Plur', 'Person': '3'}, + 'PRON__Case=Gen|Number=Sing|Person=1': {POS: PRON, 'Case': 'Gen', 'Number': 'Sing', 'Person': '1'}, + 'PRON__Case=Gen|Number=Sing|Person=2': {POS: PRON, 'Case': 'Gen', 'Number': 'Sing', 'Person': '2'}, + 'PRON__Case=Ins': {POS: PRON, 'Case': 
'Ins'}, + 'PRON__Case=Ins|Gender=Fem|Number=Sing|Person=3': {POS: PRON, 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Ins|Gender=Masc|Number=Sing|Person=3': {POS: PRON, 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Ins|Gender=Neut|Number=Sing|Person=3': {POS: PRON, 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Ins|Number=Plur|Person=1': {POS: PRON, 'Case': 'Ins', 'Number': 'Plur', 'Person': '1'}, + 'PRON__Case=Ins|Number=Plur|Person=2': {POS: PRON, 'Case': 'Ins', 'Number': 'Plur', 'Person': '2'}, + 'PRON__Case=Ins|Number=Plur|Person=3': {POS: PRON, 'Case': 'Ins', 'Number': 'Plur', 'Person': '3'}, + 'PRON__Case=Ins|Number=Sing|Person=1': {POS: PRON, 'Case': 'Ins', 'Number': 'Sing', 'Person': '1'}, + 'PRON__Case=Ins|Number=Sing|Person=2': {POS: PRON, 'Case': 'Ins', 'Number': 'Sing', 'Person': '2'}, + 'PRON__Case=Loc': {POS: PRON, 'Case': 'Loc'}, + 'PRON__Case=Loc|Gender=Fem|Number=Sing|Person=3': {POS: PRON, 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Loc|Gender=Masc|Number=Sing|Person=3': {POS: PRON, 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Loc|Gender=Neut|Number=Sing|Person=3': {POS: PRON, 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Loc|Number=Plur|Person=1': {POS: PRON, 'Case': 'Loc', 'Number': 'Plur', 'Person': '1'}, + 'PRON__Case=Loc|Number=Plur|Person=2': {POS: PRON, 'Case': 'Loc', 'Number': 'Plur', 'Person': '2'}, + 'PRON__Case=Loc|Number=Plur|Person=3': {POS: PRON, 'Case': 'Loc', 'Number': 'Plur', 'Person': '3'}, + 'PRON__Case=Loc|Number=Sing|Person=1': {POS: PRON, 'Case': 'Loc', 'Number': 'Sing', 'Person': '1'}, + 'PRON__Case=Loc|Number=Sing|Person=2': {POS: PRON, 'Case': 'Loc', 'Number': 'Sing', 'Person': '2'}, + 'PRON__Case=Nom': {POS: PRON, 'Case': 'Nom'}, + 'PRON__Case=Nom|Gender=Fem|Number=Sing|Person=3': {POS: PRON, 'Case': 'Nom', 
'Gender': 'Fem', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Nom|Gender=Masc|Number=Sing|Person=3': {POS: PRON, 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Nom|Gender=Neut|Number=Sing|Person=3': {POS: PRON, 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Person': '3'}, + 'PRON__Case=Nom|Number=Plur|Person=1': {POS: PRON, 'Case': 'Nom', 'Number': 'Plur', 'Person': '1'}, + 'PRON__Case=Nom|Number=Plur|Person=2': {POS: PRON, 'Case': 'Nom', 'Number': 'Plur', 'Person': '2'}, + 'PRON__Case=Nom|Number=Plur|Person=3': {POS: PRON, 'Case': 'Nom', 'Number': 'Plur', 'Person': '3'}, + 'PRON__Case=Nom|Number=Sing|Person=1': {POS: PRON, 'Case': 'Nom', 'Number': 'Sing', 'Person': '1'}, + 'PRON__Case=Nom|Number=Sing|Person=2': {POS: PRON, 'Case': 'Nom', 'Number': 'Sing', 'Person': '2'}, + 'PRON__Number=Sing|Person=1': {POS: PRON, 'Number': 'Sing', 'Person': '1'}, + 'PRON___': {POS: PRON}, + 'PROPN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Acc|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Acc|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Acc|Gender=Neut|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Dat|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Dat|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Dat|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 
'Anim', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Dat|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Dat|Gender=Neut|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Gen|Foreign=Yes|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Gen', 'Foreign': 'Yes', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Gen|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Gen|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Gen|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Ins|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Ins|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Ins|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Ins|Gender=Neut|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Loc|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Loc|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Loc|Gender=Masc|Number=Sing': {POS: PROPN, 
'Animacy': 'Anim', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Nom|Foreign=Yes|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Nom', 'Foreign': 'Yes', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Case=Nom|Gender=Neut|Number=Plur': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Plur'}, + 'PROPN__Animacy=Anim|Case=Voc|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Case': 'Voc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Anim|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Anim', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Acc|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Acc|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Acc|Gender=Neut|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing': {POS: PROPN, 'Animacy': 
'Inan', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Acc|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Acc', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Dat|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Dat|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Dat|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Dat|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Dat|Gender=Neut|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Dat|Gender=Neut|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Dat|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Dat', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Gen|Foreign=Yes|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Gen', 'Foreign': 'Yes', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Gen|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Gen|Gender=Neut|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 
'Plur'}, + 'PROPN__Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Gen|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Gen', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Ins|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Ins|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Ins|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Ins|Gender=Neut|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Ins|Gender=Neut|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Ins|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Ins', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Loc|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Loc|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Loc|Gender=Neut|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing': {POS: PROPN, 
'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Loc|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Loc', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Nom|Foreign=Yes|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Foreign': 'Yes', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Nom|Foreign=Yes|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Foreign': 'Yes', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Nom|Foreign=Yes|Gender=Neut|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Foreign': 'Yes', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Nom|Gender=Fem|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Nom|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Nom|Gender=Neut|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Case=Nom|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Nom', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Case=Par|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Case': 'Par', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Gender=Fem': {POS: PROPN, 'Animacy': 'Inan', 'Gender': 'Fem'}, + 'PROPN__Animacy=Inan|Gender=Masc': {POS: PROPN, 'Animacy': 'Inan', 'Gender': 'Masc'}, + 
'PROPN__Animacy=Inan|Gender=Masc|Number=Plur': {POS: PROPN, 'Animacy': 'Inan', 'Gender': 'Masc', 'Number': 'Plur'}, + 'PROPN__Animacy=Inan|Gender=Masc|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Animacy=Inan|Gender=Neut|Number=Sing': {POS: PROPN, 'Animacy': 'Inan', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Case=Acc|Degree=Pos|Gender=Fem|Number=Sing': {POS: PROPN, 'Case': 'Acc', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Case=Dat|Degree=Pos|Gender=Masc|Number=Sing': {POS: PROPN, 'Case': 'Dat', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Case=Ins|Degree=Pos|Gender=Fem|Number=Sing': {POS: PROPN, 'Case': 'Ins', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Case=Ins|Degree=Pos|Number=Plur': {POS: PROPN, 'Case': 'Ins', 'Degree': 'Pos', 'Number': 'Plur'}, + 'PROPN__Case=Nom|Degree=Pos|Gender=Fem|Number=Sing': {POS: PROPN, 'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, + 'PROPN__Case=Nom|Degree=Pos|Gender=Masc|Number=Sing': {POS: PROPN, 'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}, + 'PROPN__Case=Nom|Degree=Pos|Gender=Neut|Number=Sing': {POS: PROPN, 'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing'}, + 'PROPN__Case=Nom|Degree=Pos|Number=Plur': {POS: PROPN, 'Case': 'Nom', 'Degree': 'Pos', 'Number': 'Plur'}, + 'PROPN__Degree=Pos|Gender=Neut|Number=Sing|Variant=Short': {POS: PROPN, 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing', 'Variant': 'Short'}, + 'PROPN__Degree=Pos|Number=Plur|Variant=Short': {POS: PROPN, 'Degree': 'Pos', 'Number': 'Plur', 'Variant': 'Short'}, + 'PROPN__Foreign=Yes': {POS: PROPN, 'Foreign': 'Yes'}, + 'PROPN__Number=Sing': {POS: PROPN, 'Number': 'Sing'}, + 'PROPN___': {POS: PROPN}, + 'PUNCT___': {POS: PUNCT}, + 'SCONJ__Mood=Cnd': {POS: SCONJ, 'Mood': 'Cnd'}, + 'SCONJ___': {POS: SCONJ}, + 'SYM___': {POS: SYM}, + 
'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 
'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Anim|Aspect=Imp|Case=Acc|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Animacy=Anim|Aspect=Perf|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Anim|Aspect=Perf|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Anim|Aspect=Perf|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Animacy=Anim|Aspect=Perf|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Perf', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Anim|Aspect=Perf|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Perf', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Anim|Aspect=Perf|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Anim', 'Aspect': 'Perf', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 
'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 
'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Inan|Aspect=Imp|Case=Acc|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Imp', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Animacy=Inan|Aspect=Perf|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Animacy=Inan|Aspect=Perf|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Inan|Aspect=Perf|Case=Acc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Animacy=Inan|Aspect=Perf|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Perf', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 
'VERB__Animacy=Inan|Aspect=Perf|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Perf', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Animacy=Inan|Aspect=Perf|Case=Acc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Animacy': 'Inan', 'Aspect': 'Perf', 'Case': 'Acc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 
'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Acc|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 
'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 
'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Dat|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Dat|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Dat|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Dat|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Dat|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Dat|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Dat|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Dat', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: 
VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 
'VERB__Aspect=Imp|Case=Gen|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Gen|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Gen|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Gen|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Gen|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 
'VERB__Aspect=Imp|Case=Gen|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Gen|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Gen|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 
'VERB__Aspect=Imp|Case=Ins|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Ins|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Ins|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 
'VERB__Aspect=Imp|Case=Ins|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Ins|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Ins|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Ins|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Ins', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 
'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Loc|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 
'VERB__Aspect=Imp|Case=Loc|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Loc|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Loc|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Loc|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Loc', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 
'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Masc|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 
'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Nom|Gender=Neut|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Case=Nom|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Act'}, + 
'VERB__Aspect=Imp|Case=Nom|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Case=Nom|Number=Plur|Tense=Pres|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Pres', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Fem', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Fem', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Fem', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Gender=Fem|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Gender=Fem|Number=Sing|Tense=Pres|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Pres', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Masc', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Masc', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 
'VERB__Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Masc', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Gender=Masc|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Gender=Masc|Number=Sing|Tense=Pres|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Pres', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Gender=Neut|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Neut', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Gender=Neut|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Neut', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Gender=Neut|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Neut', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Gender=Neut|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Gender=Neut|Number=Sing|Tense=Pres|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Pres', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Mood=Imp|Number=Plur|Person=2|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Imp', 'Number': 'Plur', 'Person': '2', 'VerbForm': 
'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Mood=Imp|Number=Plur|Person=2|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Imp', 'Number': 'Plur', 'Person': '2', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Mood=Imp|Number=Sing|Person=2|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Imp', 'Number': 'Sing', 'Person': '2', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Mood=Imp|Number=Sing|Person=2|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Imp', 'Number': 'Sing', 'Person': '2', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '2', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Tense=Pres|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '2', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 
'Mood': 'Ind', 'Number': 'Plur', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '2', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '2', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 
'VERB__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Number=Plur|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Number': 'Plur', 'Tense': 'Past', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Number=Plur|Tense=Pres|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Number': 'Plur', 'Tense': 'Pres', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|Tense=Past|VerbForm=Conv|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Tense': 'Past', 'VerbForm': 'Conv', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Tense=Pres|VerbForm=Conv|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'Tense': 'Pres', 'VerbForm': 'Conv', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|Tense=Pres|VerbForm=Conv|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'Tense': 'Pres', 'VerbForm': 'Conv', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|Tense=Pres|VerbForm=Conv|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'Tense': 'Pres', 'VerbForm': 'Conv', 'Voice': 'Pass'}, + 'VERB__Aspect=Imp|VerbForm=Inf|Voice=Act': {POS: VERB, 'Aspect': 'Imp', 'VerbForm': 'Inf', 'Voice': 'Act'}, + 'VERB__Aspect=Imp|VerbForm=Inf|Voice=Mid': {POS: VERB, 'Aspect': 'Imp', 'VerbForm': 'Inf', 'Voice': 'Mid'}, + 'VERB__Aspect=Imp|VerbForm=Inf|Voice=Pass': {POS: VERB, 'Aspect': 'Imp', 'VerbForm': 'Inf', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Acc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Acc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 
'VERB__Aspect=Perf|Case=Acc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Acc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Acc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Acc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Dat|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Dat|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Dat|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Dat|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Dat|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 
'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Dat|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Dat|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Dat|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Dat|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Dat|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Dat|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Dat|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Dat', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Gen|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Gen|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 
'VERB__Aspect=Perf|Case=Gen|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Gen|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Gen|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Gen|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Gen|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Gen|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Gen|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Gen|Number=Plur|Tense=Fut|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Fut', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Gen|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 
'VERB__Aspect=Perf|Case=Gen|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Gen|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Gen', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Ins|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Ins|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Ins|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Ins|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Ins|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Ins|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Ins|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 
'VERB__Aspect=Perf|Case=Ins|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Ins|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Ins|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Ins|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Ins|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Ins', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Loc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Loc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Loc|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Loc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 
'VERB__Aspect=Perf|Case=Loc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Loc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Loc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Loc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Loc|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Loc', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Nom|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 
'VERB__Aspect=Perf|Case=Nom|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Nom|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Nom|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Nom|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Nom|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Nom|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Case=Nom|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Nom|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 
'Act'}, + 'VERB__Aspect=Perf|Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Case': 'Nom', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Gender': 'Fem', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Gender': 'Fem', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Gender=Fem|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Gender': 'Masc', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Gender': 'Masc', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Gender=Masc|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Gender': 'Masc', 'Number': 'Sing', 'Tense': 'Past', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Gender=Neut|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Gender': 'Neut', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 
'VERB__Aspect=Perf|Gender=Neut|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Gender': 'Neut', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Gender=Neut|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Gender': 'Neut', 'Number': 'Sing', 'Tense': 'Past', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Mood=Imp|Number=Plur|Person=1|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'Plur', 'Person': '1', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Mood=Imp|Number=Plur|Person=2|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'Plur', 'Person': '2', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Mood=Imp|Number=Plur|Person=2|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'Plur', 'Person': '2', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Mood=Imp|Number=Sing|Person=2|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'Sing', 'Person': '2', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Mood=Imp|Number=Sing|Person=2|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'Sing', 'Person': '2', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Tense=Fut|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '1', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Tense=Fut|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '1', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Tense=Fut|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Plur', 
'Person': '2', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Tense=Fut|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '2', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '3', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '3', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Plur', 'Person': '3', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Plur', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Tense=Fut|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Tense=Fut|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Tense=Fut|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '2', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 
'VERB__Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Tense=Fut|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '2', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Fut', 'VerbForm': 'Fin', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|Number=Plur|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass': {POS: VERB, 'Aspect': 'Perf', 'Number': 'Plur', 'Tense': 'Past', 'Variant': 'Short', 'VerbForm': 'Part', 'Voice': 'Pass'}, + 'VERB__Aspect=Perf|Tense=Past|VerbForm=Conv|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'Tense': 'Past', 'VerbForm': 'Conv', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|Tense=Past|VerbForm=Conv|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'Tense': 'Past', 'VerbForm': 'Conv', 'Voice': 'Mid'}, + 'VERB__Aspect=Perf|VerbForm=Inf|Voice=Act': {POS: VERB, 'Aspect': 'Perf', 'VerbForm': 'Inf', 'Voice': 'Act'}, + 'VERB__Aspect=Perf|VerbForm=Inf|Voice=Mid': {POS: VERB, 'Aspect': 'Perf', 'VerbForm': 'Inf', 'Voice': 'Mid'}, + 'VERB__Voice=Act': {POS: VERB, 'Voice': 'Act'}, + 'VERB___': {POS: VERB}, + 'X__Foreign=Yes': {POS: X, 'Foreign': 'Yes'}, + 'X___': {POS: X}, +} diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py new file mode 100644 index 000000000..b9a8e1e6e --- /dev/null +++ b/spacy/lang/ru/tokenizer_exceptions.py @@ -0,0 +1,68 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, LEMMA, NORM + + +_exc = {} + +_abbrev_exc = [ + # Weekdays abbreviations + {ORTH: "пн", LEMMA: "понедельник", NORM: "понедельник"}, + {ORTH: "вт", LEMMA: "вторник", NORM: "вторник"}, + 
{ORTH: "ср", LEMMA: "среда", NORM: "среда"}, + {ORTH: "чт", LEMMA: "четверг", NORM: "четверг"}, + {ORTH: "чтв", LEMMA: "четверг", NORM: "четверг"}, + {ORTH: "пт", LEMMA: "пятница", NORM: "пятница"}, + {ORTH: "сб", LEMMA: "суббота", NORM: "суббота"}, + {ORTH: "сбт", LEMMA: "суббота", NORM: "суббота"}, + {ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"}, + {ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"}, + {ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"}, + + # Months abbreviations + {ORTH: "янв", LEMMA: "январь", NORM: "январь"}, + {ORTH: "фев", LEMMA: "февраль", NORM: "февраль"}, + {ORTH: "февр", LEMMA: "февраль", NORM: "февраль"}, + {ORTH: "мар", LEMMA: "март", NORM: "март"}, + # {ORTH: "март", LEMMA: "март", NORM: "март"}, + {ORTH: "мрт", LEMMA: "март", NORM: "март"}, + {ORTH: "апр", LEMMA: "апрель", NORM: "апрель"}, + # {ORTH: "май", LEMMA: "май", NORM: "май"}, + {ORTH: "июн", LEMMA: "июнь", NORM: "июнь"}, + # {ORTH: "июнь", LEMMA: "июнь", NORM: "июнь"}, + {ORTH: "июл", LEMMA: "июль", NORM: "июль"}, + # {ORTH: "июль", LEMMA: "июль", NORM: "июль"}, + {ORTH: "авг", LEMMA: "август", NORM: "август"}, + {ORTH: "сен", LEMMA: "сентябрь", NORM: "сентябрь"}, + {ORTH: "сент", LEMMA: "сентябрь", NORM: "сентябрь"}, + {ORTH: "окт", LEMMA: "октябрь", NORM: "октябрь"}, + {ORTH: "октб", LEMMA: "октябрь", NORM: "октябрь"}, + {ORTH: "ноя", LEMMA: "ноябрь", NORM: "ноябрь"}, + {ORTH: "нояб", LEMMA: "ноябрь", NORM: "ноябрь"}, + {ORTH: "нбр", LEMMA: "ноябрь", NORM: "ноябрь"}, + {ORTH: "дек", LEMMA: "декабрь", NORM: "декабрь"}, +] + + +for abbrev_desc in _abbrev_exc: + abbrev = abbrev_desc[ORTH] + for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): + _exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] + _exc[orth + '.'] = [{ORTH: orth + '.', LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] + + +_slang_exc = [ + {ORTH: '2к15', LEMMA: '2015', NORM: '2015'}, + {ORTH: '2к16', LEMMA: '2016', NORM: '2016'}, + {ORTH: '2к17', 
LEMMA: '2017', NORM: '2017'}, + {ORTH: '2к18', LEMMA: '2018', NORM: '2018'}, + {ORTH: '2к19', LEMMA: '2019', NORM: '2019'}, + {ORTH: '2к20', LEMMA: '2020', NORM: '2020'}, +] + +for slang_desc in _slang_exc: + _exc[slang_desc[ORTH]] = [slang_desc] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/language.py b/spacy/language.py index cacce85c7..ec0c5d68f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -147,7 +147,7 @@ class Language(object): self._meta.setdefault('lang', self.vocab.lang) self._meta.setdefault('name', 'model') self._meta.setdefault('version', '0.0.0') - self._meta.setdefault('spacy_version', about.__version__) + self._meta.setdefault('spacy_version', '>={}'.format(about.__version__)) self._meta.setdefault('description', '') self._meta.setdefault('author', '') self._meta.setdefault('email', '') @@ -260,7 +260,7 @@ class Language(object): elif before and before in self.pipe_names: self.pipeline.insert(self.pipe_names.index(before), pipe) elif after and after in self.pipe_names: - self.pipeline.insert(self.pipe_names.index(after), pipe) + self.pipeline.insert(self.pipe_names.index(after) + 1, pipe) else: msg = "Can't find '{}' in pipeline. Available names: {}" unfound = before or after @@ -502,19 +502,19 @@ class Language(object): pass def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000, - disable=[]): + disable=[], cleanup=False): """Process texts as a stream, and yield `Doc` objects in order. - Supports GIL-free multi-threading. texts (iterator): A sequence of texts to process. as_tuples (bool): If set to True, inputs should be a sequence of (text, context) tuples. Output will then be a sequence of (doc, context) tuples. Defaults to False. - n_threads (int): The number of worker threads to use. If -1, OpenMP - will decide how many to use at run time. Default is 2. + n_threads (int): Currently inactive. batch_size (int): The number of texts to buffer. disable (list): Names of the pipeline components to disable. 
+ cleanup (bool): If True, unneeded strings are freed, + to control memory use. Experimental. YIELDS (Doc): Documents in the order of the original text. EXAMPLE: @@ -547,24 +547,27 @@ class Language(object): # in the string store. recent_refs = weakref.WeakSet() old_refs = weakref.WeakSet() - # If there is anything that we have inside — after iterations we should - # carefully get it back. - original_strings_data = list(self.vocab.strings) + # Keep track of the original string data, so that if we flush old strings, + # we can recover the original ones. However, we only want to do this if we're + # really adding strings, to save up-front costs. + original_strings_data = None nr_seen = 0 for doc in docs: yield doc - recent_refs.add(doc) - if nr_seen < 10000: - old_refs.add(doc) - nr_seen += 1 - elif len(old_refs) == 0: - self.vocab.strings._cleanup_stale_strings() - nr_seen = 0 - # We can't know which strings from the last batch have really expired. - # So we don't erase the strings — we just extend with the original - # content. - for string in original_strings_data: - self.vocab.strings.add(string) + if cleanup: + recent_refs.add(doc) + if nr_seen < 10000: + old_refs.add(doc) + nr_seen += 1 + elif len(old_refs) == 0: + old_refs, recent_refs = recent_refs, old_refs + if original_strings_data is None: + original_strings_data = list(self.vocab.strings) + else: + keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data) + self.vocab._reset_cache(keys, strings) + self.tokenizer._reset_cache(keys) + nr_seen = 0 def to_disk(self, path, disable=tuple()): """Save the current state to a directory. 
If a model is loaded, this diff --git a/spacy/strings.pyx b/spacy/strings.pyx index f4e047118..649bd43a4 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -249,20 +249,37 @@ cdef class StringStore: for string in strings: self.add(string) - def _cleanup_stale_strings(self): + def _cleanup_stale_strings(self, excepted): + """ + excepted (list): Strings that should not be removed. + RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places + """ if self.hits.size() == 0: # If we don't have any hits, just skip cleanup return cdef vector[hash_t] tmp + dropped_strings = [] + dropped_keys = [] for i in range(self.keys.size()): key = self.keys[i] - if self.hits.count(key) != 0: + # Here we cannot use __getitem__ because it also set hit. + utf8str = self._map.get(key) + value = decode_Utf8Str(utf8str) + if self.hits.count(key) != 0 or value in excepted: tmp.push_back(key) + else: + dropped_keys.append(key) + dropped_strings.append(value) self.keys.swap(tmp) + strings = list(self) + self._reset_and_load(strings) + # Here we have strings but hits to it should be reseted self.hits.clear() + return dropped_keys, dropped_strings + cdef const Utf8Str* intern_unicode(self, unicode py_string): # 0 means missing, but we don't bother offsetting the index. cdef bytes byte_string = py_string.encode('utf8') diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 404f1bc90..cace1a832 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,4 +1,6 @@ # coding: utf-8 +# cython: profile=True +# cython: infer_types=True """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. 
@@ -7,6 +9,8 @@ from __future__ import unicode_literals from copy import copy +from ..tokens.doc cimport Doc + DELIMITER = '||' @@ -111,17 +115,18 @@ def projectivize(heads, labels): return proj_heads, deco_labels -def deprojectivize(tokens): +cpdef deprojectivize(Doc doc): # Reattach arcs with decorated labels (following HEAD scheme). For each # decorated arc X||Y, search top-down, left-to-right, breadth-first until # hitting a Y then make this the new head. - for token in tokens: - if is_decorated(token.dep_): - newlabel, headlabel = decompose(token.dep_) - newhead = _find_new_head(token, headlabel) - token.head = newhead - token.dep_ = newlabel - return tokens + for i in range(doc.length): + label = doc.vocab.strings[doc.c[i].dep] + if DELIMITER in label: + new_label, head_label = label.split(DELIMITER) + new_head = _find_new_head(doc[i], head_label) + doc[i].head = new_head + doc.c[i].dep = doc.vocab.strings.add(new_label) + return doc def _decorate(heads, proj_heads, labels): diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 2d1b03514..001d6f292 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -15,7 +15,7 @@ from .. 
import util # here if it's using spaCy's tokenizer (not a different library) # TODO: re-implement generic tokenizer tests _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id', - 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] + 'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'xx'] _models = {'en': ['en_core_web_sm'], 'de': ['de_core_news_md'], 'fr': ['fr_core_news_sm'], @@ -40,6 +40,12 @@ def FR(request): return load_test_model(request.param) +@pytest.fixture() +def RU(request): + pymorphy = pytest.importorskip('pymorphy2') + return util.get_lang_class('ru')() + + #@pytest.fixture(params=_languages) #def tokenizer(request): #lang = util.get_lang_class(request.param) @@ -137,6 +143,12 @@ def th_tokenizer(): return util.get_lang_class('th').Defaults.create_tokenizer() +@pytest.fixture +def ru_tokenizer(): + pymorphy = pytest.importorskip('pymorphy2') + return util.get_lang_class('ru').Defaults.create_tokenizer() + + @pytest.fixture def stringstore(): return StringStore() diff --git a/spacy/tests/gold/test_biluo.py b/spacy/tests/gold/test_biluo.py index a1aa91cf0..b89dd46b8 100644 --- a/spacy/tests/gold/test_biluo.py +++ b/spacy/tests/gold/test_biluo.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from ...gold import biluo_tags_from_offsets +from ...gold import biluo_tags_from_offsets, offsets_from_biluo_tags from ...tokens.doc import Doc import pytest @@ -41,3 +41,14 @@ def test_gold_biluo_misalign(en_vocab): entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')] tags = biluo_tags_from_offsets(doc, entities) assert tags == ['O', 'O', 'O', '-', '-', '-'] + + +def test_roundtrip_offsets_biluo_conversion(en_tokenizer): + text = "I flew to Silicon Valley via London." 
+ biluo_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O'] + offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')] + doc = en_tokenizer(text) + biluo_tags_converted = biluo_tags_from_offsets(doc, offsets) + assert biluo_tags_converted == biluo_tags + offsets_converted = offsets_from_biluo_tags(doc, biluo_tags) + assert offsets_converted == offsets diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index d89fafd2c..e3ad6fcb4 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -3,13 +3,37 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["ca.", "m.a.o.", "Jan.", "Dec."]) +@pytest.mark.parametrize('text', + ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."]) def test_da_tokenizer_handles_abbr(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 1 +@pytest.mark.parametrize('text', ["Jul.", "jul.", "Tor.", "Tors."]) +def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text): + tokens = da_tokenizer(text) + assert len(tokens) == 2 + +@pytest.mark.parametrize('text', ["1.", "10.", "31."]) +def test_da_tokenizer_handles_dates(da_tokenizer, text): + tokens = da_tokenizer(text) + assert len(tokens) == 1 + def test_da_tokenizer_handles_exc_in_text(da_tokenizer): text = "Det er bl.a. ikke meningen" tokens = da_tokenizer(text) assert len(tokens) == 5 assert tokens[2].text == "bl.a." + +def test_da_tokenizer_handles_custom_base_exc(da_tokenizer): + text = "Her er noget du kan kigge i." + tokens = da_tokenizer(text) + assert len(tokens) == 8 + assert tokens[6].text == "i" + assert tokens[7].text == "." 
+ +@pytest.mark.parametrize('text,norm', + [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")]) +def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm): + tokens = da_tokenizer(text) + assert tokens[0].norm_ == norm diff --git a/spacy/tests/lang/fr/test_lemmatization.py b/spacy/tests/lang/fr/test_lemmatization.py index bcd8d4600..15da5b7d4 100644 --- a/spacy/tests/lang/fr/test_lemmatization.py +++ b/spacy/tests/lang/fr/test_lemmatization.py @@ -21,7 +21,7 @@ def test_lemmatizer_noun_verb_2(FR): @pytest.mark.models('fr') @pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN") -def test_lemmatizer_noun(model): +def test_lemmatizer_noun(FR): tokens = FR("il y a des Costaricienne.") assert tokens[4].lemma_ == "Costaricain" diff --git a/spacy/tests/lang/ru/__init__.py b/spacy/tests/lang/ru/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py new file mode 100644 index 000000000..2d9dd8b85 --- /dev/null +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from ....tokens.doc import Doc + + +@pytest.fixture +def ru_lemmatizer(RU): + return RU.Defaults.create_lemmatizer() + + +@pytest.mark.models('ru') +def test_doc_lemmatization(RU): + doc = Doc(RU.vocab, words=['мама', 'мыла', 'раму']) + doc[0].tag_ = 'NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing' + doc[1].tag_ = 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act' + doc[2].tag_ = 'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing' + + lemmas = [token.lemma_ for token in doc] + assert lemmas == ['мама', 'мыть', 'рама'] + + +@pytest.mark.models('ru') +@pytest.mark.parametrize('text,lemmas', [('гвоздики', ['гвоздик', 'гвоздика']), + ('люди', ['человек']), + ('реки', ['река']), + ('кольцо', ['кольцо']), + ('пепперони', ['пепперони'])]) +def 
test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas): + assert sorted(ru_lemmatizer.noun(text)) == lemmas + + +@pytest.mark.models('ru') +@pytest.mark.parametrize('text,pos,morphology,lemma', [('рой', 'NOUN', None, 'рой'), + ('рой', 'VERB', None, 'рыть'), + ('клей', 'NOUN', None, 'клей'), + ('клей', 'VERB', None, 'клеить'), + ('три', 'NUM', None, 'три'), + ('кос', 'NOUN', {'Number': 'Sing'}, 'кос'), + ('кос', 'NOUN', {'Number': 'Plur'}, 'коса'), + ('кос', 'ADJ', None, 'косой'), + ('потом', 'NOUN', None, 'пот'), + ('потом', 'ADV', None, 'потом') + ]) +def test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma): + assert ru_lemmatizer(text, pos, morphology) == [lemma] + + +@pytest.mark.models('ru') +@pytest.mark.parametrize('text,morphology,lemma', [('гвоздики', {'Gender': 'Fem'}, 'гвоздика'), + ('гвоздики', {'Gender': 'Masc'}, 'гвоздик'), + ('вина', {'Gender': 'Fem'}, 'вина'), + ('вина', {'Gender': 'Neut'}, 'вино') + ]) +def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma): + assert ru_lemmatizer.noun(text, morphology) == [lemma] + + +@pytest.mark.models('ru') +def test_ru_lemmatizer_punct(ru_lemmatizer): + assert ru_lemmatizer.punct('«') == ['"'] + assert ru_lemmatizer.punct('»') == ['"'] + + +# @pytest.mark.models('ru') +# def test_ru_lemmatizer_lemma_assignment(RU): +# text = "А роза упала на лапу Азора." 
+# doc = RU.make_doc(text) +# RU.tagger(doc) +# assert all(t.lemma_ != '' for t in doc) diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py new file mode 100644 index 000000000..1c4d55d2d --- /dev/null +++ b/spacy/tests/lang/ru/test_tokenizer.py @@ -0,0 +1,128 @@ +# coding: utf-8 +"""Test that open, closed and paired punctuation is split off correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +PUNCT_OPEN = ['(', '[', '{', '*'] +PUNCT_CLOSE = [')', ']', '}', '*'] +PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] + + +@pytest.mark.parametrize('text', ["(", "((", "<"]) +def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text): + tokens = ru_tokenizer(text) + assert len(tokens) == len(text) + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('text', ["Привет"]) +def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text): + tokens = ru_tokenizer(punct + text) + assert len(tokens) == 2 + assert tokens[0].text == punct + assert tokens[1].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('text', ["Привет"]) +def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text): + tokens = ru_tokenizer(text + punct) + assert len(tokens) == 2 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('punct_add', ["`"]) +@pytest.mark.parametrize('text', ["Привет"]) +def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text): + tokens = ru_tokenizer(punct + punct_add + text) + assert len(tokens) == 3 + assert tokens[0].text == punct + assert tokens[1].text == punct_add + assert tokens[2].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('punct_add', ["'"]) +@pytest.mark.parametrize('text', ["Привет"]) +def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, 
punct_add, text): + tokens = ru_tokenizer(text + punct + punct_add) + assert len(tokens) == 3 + assert tokens[0].text == text + assert tokens[1].text == punct + assert tokens[2].text == punct_add + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('text', ["Привет"]) +def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text): + tokens = ru_tokenizer(punct + punct + punct + text) + assert len(tokens) == 4 + assert tokens[0].text == punct + assert tokens[3].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('text', ["Привет"]) +def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text): + tokens = ru_tokenizer(text + punct + punct + punct) + assert len(tokens) == 4 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize('text', ["'Тест"]) +def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text): + tokens = ru_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == "'" + + +@pytest.mark.parametrize('text', ["Тест''"]) +def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text): + tokens = ru_tokenizer(text) + assert len(tokens) == 2 + tokens_punct = ru_tokenizer("''") + assert len(tokens_punct) == 1 + + +@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) +@pytest.mark.parametrize('text', ["Тест"]) +def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open, + punct_close, text): + tokens = ru_tokenizer(punct_open + text + punct_close) + assert len(tokens) == 3 + assert tokens[0].text == punct_open + assert tokens[1].text == text + assert tokens[2].text == punct_close + + +@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) +@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")]) +@pytest.mark.parametrize('text', ["Тест"]) +def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close, + punct_open2, punct_close2, text): + tokens = 
ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) + assert len(tokens) == 5 + assert tokens[0].text == punct_open2 + assert tokens[1].text == punct_open + assert tokens[2].text == text + assert tokens[3].text == punct_close + assert tokens[4].text == punct_close2 + + +@pytest.mark.parametrize('text', ["Тест."]) +def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text): + tokens = ru_tokenizer(text) + assert tokens[1].text == "." + + +def test_ru_tokenizer_splits_bracket_period(ru_tokenizer): + text = "(Раз, два, три, проверка)." + tokens = ru_tokenizer(text) + assert tokens[len(tokens) - 1].text == "." diff --git a/spacy/tests/lang/ru/test_tokenizer_exc.py b/spacy/tests/lang/ru/test_tokenizer_exc.py new file mode 100644 index 000000000..554036537 --- /dev/null +++ b/spacy/tests/lang/ru/test_tokenizer_exc.py @@ -0,0 +1,16 @@ +# coding: utf-8 +"""Test that tokenizer exceptions are parsed correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text,norms', [("пн.", ["понедельник"]), + ("пт.", ["пятница"]), + ("дек.", ["декабрь"])]) +def test_ru_tokenizer_abbrev_exceptions(ru_tokenizer, text, norms): + tokens = ru_tokenizer(text) + assert len(tokens) == 1 + assert [token.norm_ for token in tokens] == norms diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index dbcde3e5e..c0165d004 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -100,3 +100,10 @@ def test_disable_pipes_context(nlp, name): with nlp.disable_pipes(name): assert not nlp.has_pipe(name) assert nlp.has_pipe(name) + + +@pytest.mark.parametrize('n_pipes', [100]) +def test_add_lots_of_pipes(nlp, n_pipes): + for i in range(n_pipes): + nlp.add_pipe(lambda doc: doc, name='pipe_%d' % i) + assert len(nlp.pipe_names) == n_pipes diff --git a/spacy/tests/regression/test_issue1207.py b/spacy/tests/regression/test_issue1207.py new file 
mode 100644 index 000000000..f8a53e05c --- /dev/null +++ b/spacy/tests/regression/test_issue1207.py @@ -0,0 +1,13 @@ +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.models('en') +def test_issue1207(EN): + text = 'Employees are recruiting talented staffers from overseas.' + doc = EN(text) + + assert [i.text for i in doc.noun_chunks] == ['Employees', 'talented staffers'] + sent = list(doc.sents)[0] + assert [i.text for i in sent.noun_chunks] == ['Employees', 'talented staffers'] diff --git a/spacy/tests/regression/test_issue1494.py b/spacy/tests/regression/test_issue1494.py new file mode 100644 index 000000000..693e81e81 --- /dev/null +++ b/spacy/tests/regression/test_issue1494.py @@ -0,0 +1,39 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +import re + +from ...lang.en import English +from ...tokenizer import Tokenizer + + +def test_issue1494(): + infix_re = re.compile(r'''[^a-z]''') + text_to_tokenize1 = 'token 123test' + expected_tokens1 = ['token', '1', '2', '3', 'test'] + + text_to_tokenize2 = 'token 1test' + expected_tokens2 = ['token', '1test'] + + text_to_tokenize3 = 'hello...test' + expected_tokens3 = ['hello', '.', '.', '.', 'test'] + + def my_tokenizer(nlp): + return Tokenizer(nlp.vocab, + {}, + infix_finditer=infix_re.finditer + ) + + nlp = English() + + nlp.tokenizer = my_tokenizer(nlp) + + tokenized_words1 = [token.text for token in nlp(text_to_tokenize1)] + assert tokenized_words1 == expected_tokens1 + + tokenized_words2 = [token.text for token in nlp(text_to_tokenize2)] + assert tokenized_words2 == expected_tokens2 + + tokenized_words3 = [token.text for token in nlp(text_to_tokenize3)] + assert tokenized_words3 == expected_tokens3 diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py index d9ba1ac97..71702a6d4 100644 --- a/spacy/tests/regression/test_issue1506.py +++ b/spacy/tests/regression/test_issue1506.py @@ -1,6 +1,8 @@ # coding: utf8 from 
__future__ import unicode_literals +import gc + from ...lang.en import English @@ -9,14 +11,25 @@ def test_issue1506(): def string_generator(): for _ in range(10001): - yield "It's sentence produced by that bug." + yield u"It's sentence produced by that bug." for _ in range(10001): - yield "I erase lemmas." + yield u"I erase some hbdsaj lemmas." for _ in range(10001): - yield "It's sentence produced by that bug." + yield u"I erase lemmas." + + for _ in range(10001): + yield u"It's sentence produced by that bug." + + for _ in range(10001): + yield u"It's sentence produced by that bug." + + for i, d in enumerate(nlp.pipe(string_generator())): + # Cleanup has to run more than once to actually clean up the data: + # the first run only marks strings as «not hit». + if i == 10000 or i == 20000 or i == 30000: + gc.collect() - for d in nlp.pipe(string_generator()): for t in d: str(t.lemma_) diff --git a/spacy/tests/regression/test_issue1612.py b/spacy/tests/regression/test_issue1612.py new file mode 100644 index 000000000..6cae17e77 --- /dev/null +++ b/spacy/tests/regression/test_issue1612.py @@ -0,0 +1,8 @@ +# coding: utf8 +from __future__ import unicode_literals + + +def test_issue1612(en_tokenizer): + doc = en_tokenizer('The black cat purrs.') + span = doc[1: 3] + assert span.orth_ == span.text diff --git a/spacy/tests/regression/test_issue1654.py b/spacy/tests/regression/test_issue1654.py new file mode 100644 index 000000000..531c00757 --- /dev/null +++ b/spacy/tests/regression/test_issue1654.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from ...language import Language +from ...vocab import Vocab + + +def test_issue1654(): + nlp = Language(Vocab()) + assert not nlp.pipeline + nlp.add_pipe(lambda doc: doc, name='1') + nlp.add_pipe(lambda doc: doc, name='2', after='1') + nlp.add_pipe(lambda doc: doc, name='3', after='2') + assert nlp.pipe_names == ['1', '2', '3'] + + nlp2 = Language(Vocab()) + assert not
nlp2.pipeline + nlp2.add_pipe(lambda doc: doc, name='3') + nlp2.add_pipe(lambda doc: doc, name='2', before='3') + nlp2.add_pipe(lambda doc: doc, name='1', before='2') + assert nlp2.pipe_names == ['1', '2', '3'] diff --git a/spacy/tests/regression/test_issue910.py b/spacy/tests/regression/test_issue910.py index b35ce94bc..94a2562fd 100644 --- a/spacy/tests/regression/test_issue910.py +++ b/spacy/tests/regression/test_issue910.py @@ -1,4 +1,6 @@ +# coding: utf8 from __future__ import unicode_literals + import json import random import contextlib @@ -6,7 +8,7 @@ import shutil import pytest import tempfile from pathlib import Path - +from thinc.neural.optimizers import Adam from ...gold import GoldParse from ...pipeline import EntityRecognizer diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 66ecd8a8e..69a6fd38e 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -48,15 +48,17 @@ def test_displacy_parse_deps(en_vocab): """Test that deps and tags on a Doc are converted into displaCy's format.""" words = ["This", "is", "a", "sentence"] heads = [1, 0, 1, -2] + pos = ['DET', 'VERB', 'DET', 'NOUN'] tags = ['DT', 'VBZ', 'DT', 'NN'] deps = ['nsubj', 'ROOT', 'det', 'attr'] - doc = get_doc(en_vocab, words=words, heads=heads, tags=tags, deps=deps) + doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, + deps=deps) deps = parse_deps(doc) assert isinstance(deps, dict) - assert deps['words'] == [{'text': 'This', 'tag': 'DT'}, - {'text': 'is', 'tag': 'VBZ'}, - {'text': 'a', 'tag': 'DT'}, - {'text': 'sentence', 'tag': 'NN'}] + assert deps['words'] == [{'text': 'This', 'tag': 'DET'}, + {'text': 'is', 'tag': 'VERB'}, + {'text': 'a', 'tag': 'DET'}, + {'text': 'sentence', 'tag': 'NOUN'}] assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}, {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'}, {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}] diff --git a/spacy/tokenizer.pyx 
b/spacy/tokenizer.pyx index 095fbf4ad..8483b22b3 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -133,6 +133,10 @@ cdef class Tokenizer: for text in texts: yield self(text) + def _reset_cache(self, keys): + for k in keys: + del self._cache[k] + cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: @@ -238,14 +242,17 @@ cdef class Tokenizer: # let's say we have dyn-o-mite-dave - the regex finds the # start and end positions of the hyphens start = 0 + start_before_infixes = start for match in matches: infix_start = match.start() infix_end = match.end() - if infix_start == start: + + if infix_start == start_before_infixes: continue - span = string[start:infix_start] - tokens.push_back(self.vocab.get(tokens.mem, span), False) + if infix_start != start: + span = string[start:infix_start] + tokens.push_back(self.vocab.get(tokens.mem, span), False) if infix_start != infix_end: # If infix_start != infix_end, it means the infix diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 68617bb5e..4900a363d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,6 +1,7 @@ # coding: utf8 # cython: infer_types=True # cython: bounds_check=False +# cython: profile=True from __future__ import unicode_literals cimport cython @@ -543,8 +544,6 @@ cdef class Doc: assert t.lex.orth != 0 t.spacy = has_space self.length += 1 - # Set morphological attributes, e.g. 
by lemma, if possible - self.vocab.morphology.assign_untagged(t) return t.idx + t.lex.length + t.spacy @cython.boundscheck(False) @@ -569,7 +568,6 @@ cdef class Doc: """ cdef int i, j cdef attr_id_t feature - cdef np.ndarray[attr_t, ndim=1] attr_ids cdef np.ndarray[attr_t, ndim=2] output # Handle scalar/list inputs of strings/ints for py_attr_ids if not hasattr(py_attr_ids, '__iter__') \ @@ -581,12 +579,17 @@ cdef class Doc: for id_ in py_attr_ids] # Make an array from the attributes --- otherwise our inner loop is # Python dict iteration. - attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) + cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype='i') output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) + c_output = output.data + c_attr_ids = attr_ids.data + cdef TokenC* token + cdef int nr_attr = attr_ids.shape[0] for i in range(self.length): - for j, feature in enumerate(attr_ids): - output[i, j] = get_token_attr(&self.c[i], feature) + token = &self.c[i] + for j in range(nr_attr): + c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j]) # Handle 1d case return output if len(attr_ids) >= 2 else output.reshape((self.length,)) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 4056ef615..f09dfd134 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -370,7 +370,7 @@ cdef class Span: spans = [] cdef attr_t label for start, end, label in self.doc.noun_chunks_iterator(self): - spans.append(Span(self, start, end, label=label)) + spans.append(Span(self.doc, start, end, label=label)) for span in spans: yield span @@ -527,7 +527,7 @@ cdef class Span: RETURNS (unicode): The span's text.""" def __get__(self): - return ''.join([t.orth_ for t in self]).strip() + return self.text property lemma_: """RETURNS (unicode): The span's lemma.""" diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 6715c5098..c0ff8c845 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -257,7 
+257,11 @@ cdef class Token: inflectional suffixes. """ def __get__(self): - return self.c.lemma + if self.c.lemma == 0: + lemma = self.vocab.morphology.lemmatizer.lookup(self.orth_) + return self.vocab.strings.add(lemma) + else: + return self.c.lemma def __set__(self, attr_t lemma): self.c.lemma = lemma @@ -724,7 +728,10 @@ cdef class Token: with no inflectional suffixes. """ def __get__(self): - return self.vocab.strings[self.c.lemma] + if self.c.lemma == 0: + return self.vocab.morphology.lemmatizer.lookup(self.orth_) + else: + return self.vocab.strings[self.c.lemma] def __set__(self, unicode lemma_): self.c.lemma = self.vocab.strings.add(lemma_) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 438a08ca6..34ad4eb67 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -467,6 +467,13 @@ cdef class Vocab: self._by_orth.set(lexeme.orth, lexeme) self.length += 1 + def _reset_cache(self, keys, strings): + for k in keys: + del self._by_hash[k] + + if len(strings) != 0: + self._by_orth = PreshMap() + def pickle_vocab(vocab): sstore = vocab.strings diff --git a/website/api/_top-level/_spacy.jade b/website/api/_top-level/_spacy.jade index a51b429af..1eb2e299a 100644 --- a/website/api/_top-level/_spacy.jade +++ b/website/api/_top-level/_spacy.jade @@ -110,7 +110,7 @@ p | information about your installation, models and local setup from within | spaCy. To get the model meta data as a dictionary instead, you can | use the #[code meta] attribute on your #[code nlp] object with a - | loaded model, e.g. #[code nlp['meta']]. + | loaded model, e.g. #[code nlp.meta]. +aside-code("Example"). spacy.info() diff --git a/website/api/goldparse.jade b/website/api/goldparse.jade index c27badee9..9fb47ccc4 100644 --- a/website/api/goldparse.jade +++ b/website/api/goldparse.jade @@ -123,7 +123,7 @@ p p | Returns a list of unicode strings, describing the tags.
Each tag string - | will be of the form either #[code ""], #[code "O"] or + | will be of the form of either #[code ""], #[code "O"] or | #[code "{action}-{label}"], where action is one of #[code "B"], | #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"] | is used where the entity offsets don't align with the tokenization in the @@ -135,9 +135,9 @@ p +aside-code("Example"). from spacy.gold import biluo_tags_from_offsets - text = 'I like London.' - entities = [(len('I like '), len('I like London'), 'LOC')] - doc = tokenizer(text) + + doc = nlp('I like London.') + entities = [(7, 13, 'LOC')] tags = biluo_tags_from_offsets(doc, entities) assert tags == ['O', 'O', 'U-LOC', 'O'] @@ -163,5 +163,3 @@ p +cell | Unicode strings, describing the | #[+a("/api/annotation#biluo") BILUO] tags. - - diff --git a/website/assets/js/vendor/prism.min.js b/website/assets/js/vendor/prism.min.js index 64ea8de94..97448931d 100644 --- a/website/assets/js/vendor/prism.min.js +++ b/website/assets/js/vendor/prism.min.js @@ -16,7 +16,7 @@ Prism.languages.json={property:/".*?"(?=\s*:)/gi,string:/"(?!:)(\\?[^"])*?"(?!:) !function(a){var 
e=/\\([^a-z()[\]]|[a-z\*]+)/i,n={"equation-command":{pattern:e,alias:"regex"}};a.languages.latex={comment:/%.*/m,cdata:{pattern:/(\\begin\{((?:verbatim|lstlisting)\*?)\})([\w\W]*?)(?=\\end\{\2\})/,lookbehind:!0},equation:[{pattern:/\$(?:\\?[\w\W])*?\$|\\\((?:\\?[\w\W])*?\\\)|\\\[(?:\\?[\w\W])*?\\\]/,inside:n,alias:"string"},{pattern:/(\\begin\{((?:equation|math|eqnarray|align|multline|gather)\*?)\})([\w\W]*?)(?=\\end\{\2\})/,lookbehind:!0,inside:n,alias:"string"}],keyword:{pattern:/(\\(?:begin|end|ref|cite|label|usepackage|documentclass)(?:\[[^\]]+\])?\{)[^}]+(?=\})/,lookbehind:!0},url:{pattern:/(\\url\{)[^}]+(?=\})/,lookbehind:!0},headline:{pattern:/(\\(?:part|chapter|section|subsection|frametitle|subsubsection|paragraph|subparagraph|subsubparagraph|subsubsubparagraph)\*?(?:\[[^\]]+\])?\{)[^}]+(?=\}(?:\[[^\]]+\])?)/,lookbehind:!0,alias:"class-name"},"function":{pattern:e,alias:"selector"},punctuation:/[[\]{}&]/}}(Prism); Prism.languages.makefile={comment:{pattern:/(^|[^\\])#(?:\\(?:\r\n|[\s\S])|.)*/,lookbehind:!0},string:/(["'])(?:\\(?:\r\n|[\s\S])|(?!\1)[^\\\r\n])*\1/,builtin:/\.[A-Z][^:#=\s]+(?=\s*:(?!=))/,symbol:{pattern:/^[^:=\r\n]+(?=\s*:(?!=))/m,inside:{variable:/\$+(?:[^(){}:#=\s]+|(?=[({]))/}},variable:/\$+(?:[^(){}:#=\s]+|\([@*%<^+?][DF]\)|(?=[({]))/,keyword:[/-include\b|\b(?:define|else|endef|endif|export|ifn?def|ifn?eq|include|override|private|sinclude|undefine|unexport|vpath)\b/,{pattern:/(\()(?:addsuffix|abspath|and|basename|call|dir|error|eval|file|filter(?:-out)?|findstring|firstword|flavor|foreach|guile|if|info|join|lastword|load|notdir|or|origin|patsubst|realpath|shell|sort|strip|subst|suffix|value|warning|wildcard|word(?:s|list)?)(?=[ \t])/,lookbehind:!0}],operator:/(?:::|[?:+!])?=|[|@]/,punctuation:/[:;(){}]/}; Prism.languages.markdown=Prism.languages.extend("markup",{}),Prism.languages.insertBefore("markdown","prolog",{blockquote:{pattern:/^>(?:[\t ]*>)*/m,alias:"punctuation"},code:[{pattern:/^(?: 
{4}|\t).+/m,alias:"keyword"},{pattern:/``.+?``|`[^`\n]+`/,alias:"keyword"}],title:[{pattern:/\w+.*(?:\r?\n|\r)(?:==+|--+)/,alias:"important",inside:{punctuation:/==+$|--+$/}},{pattern:/(^\s*)#+.+/m,lookbehind:!0,alias:"important",inside:{punctuation:/^#+|#+$/}}],hr:{pattern:/(^\s*)([*-])([\t ]*\2){2,}(?=\s*$)/m,lookbehind:!0,alias:"punctuation"},list:{pattern:/(^\s*)(?:[*+-]|\d+\.)(?=[\t ].)/m,lookbehind:!0,alias:"punctuation"},"url-reference":{pattern:/!?\[[^\]]+\]:[\t ]+(?:\S+|<(?:\\.|[^>\\])+>)(?:[\t ]+(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\((?:\\.|[^)\\])*\)))?/,inside:{variable:{pattern:/^(!?\[)[^\]]+/,lookbehind:!0},string:/(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\((?:\\.|[^)\\])*\))$/,punctuation:/^[\[\]!:]|[<>]/},alias:"url"},bold:{pattern:/(^|[^\\])(\*\*|__)(?:(?:\r?\n|\r)(?!\r?\n|\r)|.)+?\2/,lookbehind:!0,inside:{punctuation:/^\*\*|^__|\*\*$|__$/}},italic:{pattern:/(^|[^\\])([*_])(?:(?:\r?\n|\r)(?!\r?\n|\r)|.)+?\2/,lookbehind:!0,inside:{punctuation:/^[*_]|[*_]$/}},url:{pattern:/!?\[[^\]]+\](?:\([^\s)]+(?:[\t ]+"(?:\\.|[^"\\])*")?\)| ?\[[^\]\n]*\])/,inside:{variable:{pattern:/(!?\[)[^\]]+(?=\]$)/,lookbehind:!0},string:{pattern:/"(?:\\.|[^"\\])*"(?=\)$)/}}}}),Prism.languages.markdown.bold.inside.url=Prism.util.clone(Prism.languages.markdown.url),Prism.languages.markdown.italic.inside.url=Prism.util.clone(Prism.languages.markdown.url),Prism.languages.markdown.bold.inside.italic=Prism.util.clone(Prism.languages.markdown.italic),Prism.languages.markdown.italic.inside.bold=Prism.util.clone(Prism.languages.markdown.bold); -Prism.languages.python={"triple-quoted-string":{pattern:/"""[\s\S]+?"""|'''[\s\S]+?'''/,alias:"string"},comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},string:/("|')(?:\\?.)*?\1/,"function":{pattern:/((?:^|\s)def[ 
\t]+)[a-zA-Z_][a-zA-Z0-9_]*(?=\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)[a-z0-9_]+/i,lookbehind:!0},keyword:/\b(?:as|assert|async|await|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|pass|print|raise|return|try|while|with|yield)\b/,"boolean":/\b(?:True|False)\b/,number:/\b-?(?:0[bo])?(?:(?:\d|0x[\da-f])[\da-f]*\.?\d*|\.\d+)(?:e[+-]?\d+)?j?\b/i,operator:/[-+%=]=?|!=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]|\b(?:or|and|not)\b/,punctuation:/[{}[\];(),.:]/,"constant":/\b[A-Z_]{2,}\b/}; +Prism.languages.python={"triple-quoted-string":{pattern:/"""[\s\S]+?"""|'''[\s\S]+?'''/,alias:"string"},comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},string:/("|')(?:\\?.)*?\1/,"function":{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_][a-zA-Z0-9_]*(?=\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)[a-z0-9_]+/i,lookbehind:!0},keyword:/\b(?:as|assert|async|await|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|pass|print|raise|return|try|while|with|yield)\b/,"boolean":/\b(?:True|False|None)\b/,number:/\b-?(?:0[bo])?(?:(?:\d|0x[\da-f])[\da-f]*\.?\d*|\.\d+)(?:e[+-]?\d+)?j?\b/i,operator:/[-+%=]=?|!=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]|\b(?:or|and|not)\b/,punctuation:/[{}[\];(),.:]/,"constant":/\b[A-Z_]{2,}\b/}; Prism.languages.rest={table:[{pattern:/(\s*)(?:\+[=-]+)+\+(?:\r?\n|\r)(?:\1(?:[+|].+)+[+|](?:\r?\n|\r))+\1(?:\+[=-]+)+\+/,lookbehind:!0,inside:{punctuation:/\||(?:\+[=-]+)+\+/}},{pattern:/(\s*)(?:=+ +)+=+((?:\r?\n|\r)\1.+)+(?:\r?\n|\r)\1(?:=+ +)+=+(?=(?:\r?\n|\r){2}|\s*$)/,lookbehind:!0,inside:{punctuation:/[=-]+/}}],"substitution-def":{pattern:/(^\s*\.\. )\|(?:[^|\s](?:[^|]*[^|\s])?)\| [^:]+::/m,lookbehind:!0,inside:{substitution:{pattern:/^\|(?:[^|\s]|[^|\s][^|]*[^|\s])\|/,alias:"attr-value",inside:{punctuation:/^\||\|$/}},directive:{pattern:/( +)[^:]+::/,lookbehind:!0,alias:"function",inside:{punctuation:/::$/}}}},"link-target":[{pattern:/(^\s*\.\. 
)\[[^\]]+\]/m,lookbehind:!0,alias:"string",inside:{punctuation:/^\[|\]$/}},{pattern:/(^\s*\.\. )_(?:`[^`]+`|(?:[^:\\]|\\.)+):/m,lookbehind:!0,alias:"string",inside:{punctuation:/^_|:$/}}],directive:{pattern:/(^\s*\.\. )[^:]+::/m,lookbehind:!0,alias:"function",inside:{punctuation:/::$/}},comment:{pattern:/(^\s*\.\.)(?:(?: .+)?(?:(?:\r?\n|\r).+)+| .+)(?=(?:\r?\n|\r){2}|$)/m,lookbehind:!0},title:[{pattern:/^(([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\2+)(?:\r?\n|\r).+(?:\r?\n|\r)\1$/m,inside:{punctuation:/^[!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]+|[!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]+$/,important:/.+/}},{pattern:/(^|(?:\r?\n|\r){2}).+(?:\r?\n|\r)([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\2+(?=\r?\n|\r|$)/,lookbehind:!0,inside:{punctuation:/[!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]+$/,important:/.+/}}],hr:{pattern:/((?:\r?\n|\r){2})([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\2{3,}(?=(?:\r?\n|\r){2})/,lookbehind:!0,alias:"punctuation"},field:{pattern:/(^\s*):[^:\r\n]+:(?= )/m,lookbehind:!0,alias:"attr-name"},"command-line-option":{pattern:/(^\s*)(?:[+-][a-z\d]|(?:\-\-|\/)[a-z\d-]+)(?:[ =](?:[a-z][a-z\d_-]*|<[^<>]+>))?(?:, (?:[+-][a-z\d]|(?:\-\-|\/)[a-z\d-]+)(?:[ =](?:[a-z][a-z\d_-]*|<[^<>]+>))?)*(?=(?:\r?\n|\r)? 
{2,}\S)/im,lookbehind:!0,alias:"symbol"},"literal-block":{pattern:/::(?:\r?\n|\r){2}([ \t]+).+(?:(?:\r?\n|\r)\1.+)*/,inside:{"literal-block-punctuation":{pattern:/^::/,alias:"punctuation"}}},"quoted-literal-block":{pattern:/::(?:\r?\n|\r){2}([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]).*(?:(?:\r?\n|\r)\1.*)*/,inside:{"literal-block-punctuation":{pattern:/^(?:::|([!"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~])\1*)/m,alias:"punctuation"}}},"list-bullet":{pattern:/(^\s*)(?:[*+\-•‣⁃]|\(?(?:\d+|[a-z]|[ivxdclm]+)\)|(?:\d+|[a-z]|[ivxdclm]+)\.)(?= )/im,lookbehind:!0,alias:"punctuation"},"doctest-block":{pattern:/(^\s*)>>> .+(?:(?:\r?\n|\r).+)*/m,lookbehind:!0,inside:{punctuation:/^>>>/}},inline:[{pattern:/(^|[\s\-:\/'"<(\[{])(?::[^:]+:`.*?`|`.*?`:[^:]+:|(\*\*?|``?|\|)(?!\s).*?[^\s]\2(?=[\s\-.,:;!?\\\/'")\]}]|$))/m,lookbehind:!0,inside:{bold:{pattern:/(^\*\*).+(?=\*\*$)/,lookbehind:!0},italic:{pattern:/(^\*).+(?=\*$)/,lookbehind:!0},"inline-literal":{pattern:/(^``).+(?=``$)/,lookbehind:!0,alias:"symbol"},role:{pattern:/^:[^:]+:|:[^:]+:$/,alias:"function",inside:{punctuation:/^:|:$/}},"interpreted-text":{pattern:/(^`).+(?=`$)/,lookbehind:!0,alias:"attr-value"},substitution:{pattern:/(^\|).+(?=\|$)/,lookbehind:!0,alias:"attr-value"},punctuation:/\*\*?|``?|\|/}}],link:[{pattern:/\[[^\]]+\]_(?=[\s\-.,:;!?\\\/'")\]}]|$)/,alias:"string",inside:{punctuation:/^\[|\]_$/}},{pattern:/(?:\b[a-z\d](?:[_.:+]?[a-z\d]+)*_?_|`[^`]+`_?_|_`[^`]+`)(?=[\s\-.,:;!?\\\/'")\]}]|$)/i,alias:"string",inside:{punctuation:/^_?`|`$|`?_?_$/}}],punctuation:{pattern:/(^\s*)(?:\|(?= |$)|(?:---?|—|\.\.|__)(?= )|\.\.$)/m,lookbehind:!0}}; !function(e){e.languages.sass=e.languages.extend("css",{comment:{pattern:/^([ \t]*)\/[\/*].*(?:(?:\r?\n|\r)\1[ \t]+.+)*/m,lookbehind:!0}}),e.languages.insertBefore("sass","atrule",{"atrule-line":{pattern:/^(?:[ \t]*)[@+=].+/m,inside:{atrule:/(?:@[\w-]+|[+=])/m}}}),delete e.languages.sass.atrule;var 
a=/((\$[-_\w]+)|(#\{\$[-_\w]+\}))/i,t=[/[+*\/%]|[=!]=|<=?|>=?|\b(?:and|or|not)\b/,{pattern:/(\s+)-(?=\s)/,lookbehind:!0}];e.languages.insertBefore("sass","property",{"variable-line":{pattern:/^[ \t]*\$.+/m,inside:{punctuation:/:/,variable:a,operator:t}},"property-line":{pattern:/^[ \t]*(?:[^:\s]+ *:.*|:[^:\s]+.*)/m,inside:{property:[/[^:\s]+(?=\s*:)/,{pattern:/(:)[^:\s]+/,lookbehind:!0}],punctuation:/:/,variable:a,operator:t,important:e.languages.sass.important}}}),delete e.languages.sass.property,delete e.languages.sass.important,delete e.languages.sass.selector,e.languages.insertBefore("sass","punctuation",{selector:{pattern:/([ \t]*)\S(?:,?[^,\r\n]+)*(?:,(?:\r?\n|\r)\1[ \t]+\S(?:,?[^,\r\n]+)*)*/,lookbehind:!0}})}(Prism); Prism.languages.scss=Prism.languages.extend("css",{comment:{pattern:/(^|[^\\])(?:\/\*[\w\W]*?\*\/|\/\/.*)/,lookbehind:!0},atrule:{pattern:/@[\w-]+(?:\([^()]+\)|[^(])*?(?=\s+[{;])/,inside:{rule:/@[\w-]+/}},url:/(?:[-a-z]+-)*url(?=\()/i,selector:{pattern:/(?=\S)[^@;\{\}\(\)]?([^@;\{\}\(\)]|&|#\{\$[-_\w]+\})+(?=\s*\{(\}|\s|[^\}]+(:|\{)[^\}]+))/m,inside:{placeholder:/%[-_\w]+/}}}),Prism.languages.insertBefore("scss","atrule",{keyword:[/@(?:if|else(?: if)?|for|each|while|import|extend|debug|warn|mixin|include|function|return|content)/i,{pattern:/( +)(?:from|through)(?= )/,lookbehind:!0}]}),Prism.languages.insertBefore("scss","property",{variable:/\$[-_\w]+|#\{\$[-_\w]+\}/}),Prism.languages.insertBefore("scss","function",{placeholder:{pattern:/%[-_\w]+/,alias:"selector"},statement:/\B!(?:default|optional)\b/i,"boolean":/\b(?:true|false)\b/,"null":/\bnull\b/,operator:{pattern:/(\s)(?:[-+*\/%]|[=!]=|<=?|>=?|and|or|not)(?=\s)/,lookbehind:!0}}),Prism.languages.scss.atrule.inside.rest=Prism.util.clone(Prism.languages.scss); diff --git a/website/index.jade b/website/index.jade index 8a77ae5fe..cd8eecfa9 100644 --- a/website/index.jade +++ b/website/index.jade @@ -60,8 +60,8 @@ include _includes/_mixins # Load English tokenizer, tagger, parser, NER and word 
vectors nlp = spacy.load('en') - # Process a document, of any size - text = open('war_and_peace.txt').read() + # Process whole documents + text = open('customer_feedback_627.txt').read() doc = nlp(text) # Find named entities, phrases and concepts diff --git a/website/usage/_linguistic-features/_dependency-parse.jade b/website/usage/_linguistic-features/_dependency-parse.jade index b86ce5d0f..0b6cc53ba 100644 --- a/website/usage/_linguistic-features/_dependency-parse.jade +++ b/website/usage/_linguistic-features/_dependency-parse.jade @@ -29,7 +29,7 @@ p | #[strong Text:] The original noun chunk text.#[br] | #[strong Root text:] The original text of the word connecting the noun | chunk to the rest of the parse.#[br] - | #[strong Root dep:] Dependcy relation connecting the root to its head.#[br] + | #[strong Root dep:] Dependency relation connecting the root to its head.#[br] | #[strong Root head text:] The text of the root token's head.#[br] +table(["Text", "root.text", "root.dep_", "root.head.text"]) diff --git a/website/usage/_linguistic-features/_rule-based-matching.jade b/website/usage/_linguistic-features/_rule-based-matching.jade index aa81106e6..d2bead022 100644 --- a/website/usage/_linguistic-features/_rule-based-matching.jade +++ b/website/usage/_linguistic-features/_rule-based-matching.jade @@ -354,7 +354,8 @@ p # append mock entity for match in displaCy style to matched_sents # get the match span by ofsetting the start and end of the span with the # start and end of the sentence in the doc - match_ents = [{'start': span.start-sent.start, 'end': span.end-sent.start, + match_ents = [{'start': span.start_char - sent.start_char, + 'end': span.end_char - sent.start_char, 'label': 'MATCH'}] matched_sents.append({'text': sent.text, 'ents': match_ents }) diff --git a/website/usage/_models/_production.jade b/website/usage/_models/_production.jade index 43f4b1ba9..1c64b37e5 100644 --- a/website/usage/_models/_production.jade +++ 
b/website/usage/_models/_production.jade @@ -33,7 +33,7 @@ p +code("requirements.txt", "text"). spacy>=2.0.0,<3.0.0 - -e #{gh("spacy-models")}/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#en_core_web_sm + #{gh("spacy-models")}/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#en_core_web_sm p | Specifying #[code #egg=] with the package name tells pip diff --git a/website/usage/_processing-pipelines/_custom-components.jade b/website/usage/_processing-pipelines/_custom-components.jade index 71e88ca3d..7603297a5 100644 --- a/website/usage/_processing-pipelines/_custom-components.jade +++ b/website/usage/_processing-pipelines/_custom-components.jade @@ -53,11 +53,11 @@ p class MyComponent(object): name = 'print_info' - def __init__(vocab, short_limit=10): - self.vocab = nlp.vocab + def __init__(self, vocab, short_limit=10): + self.vocab = vocab self.short_limit = short_limit - def __call__(doc): + def __call__(self, doc): if len(doc) < self.short_limit: print("This is a pretty short document.") return doc @@ -100,7 +100,10 @@ p | Set a default value for an attribute, which can be overwritten | manually at any time. Attribute extensions work like "normal" | variables and are the quickest way to store arbitrary information - | on a #[code Doc], #[code Span] or #[code Token]. + | on a #[code Doc], #[code Span] or #[code Token]. Attribute defaults + | behave just like argument defaults + | #[+a("http://docs.python-guide.org/en/latest/writing/gotchas/#mutable-default-arguments") in Python functions], + | and should not be used for mutable values like dictionaries or lists. +code-wrapper +code.
diff --git a/website/usage/_processing-pipelines/_extensions.jade b/website/usage/_processing-pipelines/_extensions.jade index fb46fe330..f0bc4c249 100644 --- a/website/usage/_processing-pipelines/_extensions.jade +++ b/website/usage/_processing-pipelines/_extensions.jade @@ -36,6 +36,24 @@ p if token.text in ('apple', 'orange'): token._.set('is_fruit', True) + +item + | When using #[strong mutable values] like dictionaries or lists as + | the #[code default] argument, keep in mind that they behave just like + | mutable default arguments + | #[+a("http://docs.python-guide.org/en/latest/writing/gotchas/#mutable-default-arguments") in Python functions]. + | This can easily cause unintended results, like the same value being + | set on #[em all] objects instead of only one particular instance. + | In most cases, it's better to use #[strong getters and setters], and + | only set the #[code default] for boolean or string values. + + +code-wrapper + +code-new. + Doc.set_extension('fruits', getter=get_fruits, setter=set_fruits) + + +code-old. 
+ Doc.set_extension('fruits', default={}) + doc._.fruits['apple'] = u'🍎' # all docs now have {'apple': u'🍎'} + +item | Always add your custom attributes to the #[strong global] #[code Doc] | #[code Token] or #[code Span] objects, not a particular instance of diff --git a/website/usage/_spacy-101/_lightning-tour.jade b/website/usage/_spacy-101/_lightning-tour.jade index 493c15d35..f8dcc7ec5 100644 --- a/website/usage/_spacy-101/_lightning-tour.jade +++ b/website/usage/_spacy-101/_lightning-tour.jade @@ -82,7 +82,7 @@ p unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 ' +infobox - | #[+label-inline API:] #[+api("stringstore") #[code stringstore]] + | #[+label-inline API:] #[+api("stringstore") #[code StringStore]] | #[+label-inline Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101] +h(3, "lightning-tour-entities") Recognise and update named entities @@ -102,6 +102,28 @@ p +infobox | #[+label-inline Usage:] #[+a("/usage/linguistic-features#named-entities") Named entity recognition] ++h(3, "lightning-tour-training") Train and update neural network models + +tag-model + ++code. 
+ import spacy + import random + + nlp = spacy.load('en') + train_data = [("Uber blew through $1 million", {'entities': [(0, 4, 'ORG')]})] + + with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'ner']): + optimizer = nlp.begin_training() + for i in range(10): + random.shuffle(train_data) + for text, annotations in train_data: + nlp.update([text], [annotations], sgd=optimizer) + nlp.to_disk('/model') + ++infobox + | #[+label-inline API:] #[+api("language#update") #[code Language.update]] + | #[+label-inline Usage:] #[+a("/usage/training") Training spaCy's statistical models] + +h(3, "lightning-tour-displacy") Visualize a dependency parse and named entities in your browser +tag-model("dependency parse", "NER") +tag-new(2) @@ -183,11 +205,11 @@ p from spacy.vocab import Vocab nlp = spacy.load('en') - moby_dick = open('moby_dick.txt', 'r').read() - doc = nlp(moby_dick) - doc.to_disk('/moby_dick.bin') + customer_feedback = open('customer_feedback_627.txt').read() + doc = nlp(customer_feedback) + doc.to_disk('/tmp/customer_feedback_627.bin') - new_doc = Doc(Vocab()).from_disk('/moby_dick.bin') + new_doc = Doc(Vocab()).from_disk('/tmp/customer_feedback_627.bin') +infobox | #[+label-inline API:] #[+api("language") #[code Language]], @@ -210,7 +232,8 @@ p pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']] matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o" matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji - matches = nlp(LOTS_OF TEXT) + text = open('customer_feedback_627.txt').read() + matches = nlp(text) +infobox | #[+label-inline API:] #[+api("matcher") #[code Matcher]] diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade index 734495c6e..7e3f7b0e9 100644 --- a/website/usage/_vectors-similarity/_basics.jade +++ b/website/usage/_vectors-similarity/_basics.jade @@ -113,7 +113,7 @@ p p | Interestingly, "man bites dog" and "man
dog bites" are seen as slightly | more similar than "man bites dog" and "dog bites man". This may be a - | conincidence – or the result of "man" being interpreted as both sentence's + | coincidence – or the result of "man" being interpreted as both sentence's | subject. +table diff --git a/website/usage/resources.jade b/website/usage/resources.jade index d6afcd82f..536a92cf8 100644 --- a/website/usage/resources.jade +++ b/website/usage/resources.jade @@ -22,6 +22,12 @@ include ../_includes/_mixins +card("textacy", "https://github.com/chartbeat-labs/textacy", "Burton DeWilde", "github") | Higher-level NLP built on spaCy. + +card("mordecai", "https://github.com/openeventdata/mordecai", "Andy Halterman", "github") + | Full text geoparsing using spaCy, Geonames and Keras. + + +card("kindred", "https://github.com/jakelever/kindred", "Jake Lever", "github") + | Biomedical relation extraction using spaCy. + +card("spacyr", "https://github.com/kbenoit/spacyr", "Kenneth Benoit", "github") | An R wrapper for spaCy. @@ -55,6 +61,14 @@ include ../_includes/_mixins | Pipeline component for emoji handling and adding emoji meta data | to #[code Doc], #[code Token] and #[code Span] attributes. + +card("spacy_hunspell", "https://github.com/tokestermw/spacy_hunspell", "Motoki Wu", "github") + | Add spellchecking and spelling suggestions to your spaCy pipeline + | using Hunspell. + + +card("spacy_cld", "https://github.com/nickdavidhaynes/spacy-cld", "Nicholas D Haynes", "github") + | Add language detection to your spaCy pipeline using Compact + | Language Detector 2 via PYCLD2. + .u-text-right +button("https://github.com/topics/spacy-extension?o=desc&s=stars", false, "primary", "small") See more extensions on GitHub