diff --git a/.appveyor.yml b/.appveyor.yml index 4dd7b0a31..a379cdd31 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1 +1,55 @@ +environment: + + matrix: + + # For Python versions available on Appveyor, see + # http://www.appveyor.com/docs/installed-software#python + # The list here is complete (excluding Python 2.6, which + # isn't covered by this document) at the time of writing. + + - PYTHON: "C:\\Python27" + #- PYTHON: "C:\\Python33" + #- PYTHON: "C:\\Python34" + #- PYTHON: "C:\\Python35" + #- PYTHON: "C:\\Python27-x64" + #- PYTHON: "C:\\Python33-x64" + #- DISTUTILS_USE_SDK: "1" + #- PYTHON: "C:\\Python34-x64" + #- DISTUTILS_USE_SDK: "1" + #- PYTHON: "C:\\Python35-x64" + - PYTHON: "C:\\Python36-x64" + +install: + # We need wheel installed to build wheels + - "%PYTHON%\\python.exe -m pip install wheel" + - "%PYTHON%\\python.exe -m pip install cython" + - "%PYTHON%\\python.exe -m pip install -r requirements.txt" + - "%PYTHON%\\python.exe -m pip install -e ." + build: off + +test_script: + # Put your test command here. + # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4, + # you can remove "build.cmd" from the front of the command, as it's + # only needed to support those cases. + # Note that you must use the environment variable %PYTHON% to refer to + # the interpreter you're using - Appveyor does not do anything special + # to put the Python version you want to use on PATH. + - "%PYTHON%\\python.exe -m pytest spacy/" + +after_test: + # This step builds your wheels. + # Again, you only need build.cmd if you're building C extensions for + # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct + # interpreter + - "%PYTHON%\\python.exe setup.py bdist_wheel" + +artifacts: + # bdist_wheel puts your built wheel in the dist directory + - path: dist\* + +#on_success: +# You can use this step to upload your artifacts to a public website. +# See Appveyor's documentation for more details. Or you can simply +# access your wheels from the Appveyor "artifacts" tab for your build. diff --git a/.buildkite/sdist.yml b/.buildkite/sdist.yml new file mode 100644 index 000000000..9b94e3752 --- /dev/null +++ b/.buildkite/sdist.yml @@ -0,0 +1,11 @@ +steps: + - + command: "fab env clean make test sdist" + label: ":dizzy: :python:" + artifact_paths: "dist/*.tar.gz" + - wait + - trigger: "spacy-sdist-against-models" + label: ":dizzy: :hammer:" + build: + env: + SPACY_VERSION: "{$SPACY_VERSION}" diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index c915d48bf..f34603065 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,8 +87,8 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [x] I am signing on behalf of myself as an individual and no other person - or entity, including my employer, has or will have rights with respect my + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my contributions. 
* [ ] I am signing on behalf of my employer or a legal entity and I have the @@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Shuvanon Razik | +| Name | | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | 3/12/2017 | -| GitHub username | shuvanon | +| Date | | +| GitHub username | | | Website (optional) | | diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e97a7ea16..ec11b78bd 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,20 +1,19 @@ - + ## Description - - + +### Types of change + -## Types of changes - -- [ ] **Bug fix** (non-breaking change fixing an issue) -- [ ] **New feature** (non-breaking change adding functionality to spaCy) -- [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality) -- [ ] **Documentation** (addition to documentation of spaCy) - -## Checklist: - -- [ ] My change requires a change to spaCy's documentation. -- [ ] I have updated the documentation accordingly. -- [ ] I have added tests to cover my changes. -- [ ] All new and existing tests passed. +## Checklist + +- [ ] I have submitted the spaCy Contributor Agreement. +- [ ] I ran the tests, and all new and existing tests passed. +- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. diff --git a/.github/contributors/demfier.md b/.github/contributors/demfier.md new file mode 100644 index 000000000..1a730fc78 --- /dev/null +++ b/.github/contributors/demfier.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Gaurav Sahu | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-18 | +| GitHub username | demfier | +| Website (optional) | | diff --git a/.github/contributors/honnibal.md b/.github/contributors/honnibal.md new file mode 100644 index 000000000..3a700b7dd --- /dev/null +++ b/.github/contributors/honnibal.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matthew Honnibal | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017-10-18 | +| GitHub username | honnibal | +| Website (optional) | https://explosion.ai | diff --git a/.github/contributors/ines.md b/.github/contributors/ines.md new file mode 100644 index 000000000..5cd57b07e --- /dev/null +++ b/.github/contributors/ines.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ines Montani | +| Company name (if applicable) | Explosion AI | +| Title or role (if applicable) | Founder | +| Date | 2017/10/18 | +| GitHub username | ines | +| Website (optional) | https://explosion.ai | diff --git a/.github/contributors/jerbob92.md b/.github/contributors/jerbob92.md new file mode 100644 index 000000000..bb0430d14 --- /dev/null +++ b/.github/contributors/jerbob92.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jeroen Bobbeldijk | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22-10-2017 | +| GitHub username | jerbob92 | +| Website (optional) | | diff --git a/.github/contributors/johnhaley81.md b/.github/contributors/johnhaley81.md new file mode 100644 index 000000000..277b3126c --- /dev/null +++ b/.github/contributors/johnhaley81.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | John Haley | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 19/10/2017 | +| GitHub username | johnhaley81 | +| Website (optional) | | diff --git a/.github/contributors/mdcclv.md b/.github/contributors/mdcclv.md new file mode 100644 index 000000000..14ebfae26 --- /dev/null +++ b/.github/contributors/mdcclv.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------------------- | +| Name | Orion Montoya | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 04-10-2017 | +| GitHub username | mdcclv | +| Website (optional) | http://www.mdcclv.com/ | diff --git a/.github/contributors/polm.md b/.github/contributors/polm.md new file mode 100644 index 000000000..a2aa0cb65 --- /dev/null +++ b/.github/contributors/polm.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Paul McCann | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-14 | +| GitHub username | polm | +| Website (optional) | http://dampfkraft.com| diff --git a/.github/contributors/ramananbalakrishnan.md b/.github/contributors/ramananbalakrishnan.md new file mode 100644 index 000000000..804c41f56 --- /dev/null +++ b/.github/contributors/ramananbalakrishnan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ramanan Balakrishnan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-19 | +| GitHub username | ramananbalakrishnan | +| Website (optional) | | diff --git a/.github/contributors/shuvanon.md b/.github/contributors/shuvanon.md new file mode 100644 index 000000000..82d02d8d2 --- /dev/null +++ b/.github/contributors/shuvanon.md @@ -0,0 +1,108 @@ + + +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Shuvanon Razik | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 3/12/2017 | +| GitHub username | shuvanon | +| Website (optional) | | diff --git a/.github/contributors/yuukos.md b/.github/contributors/yuukos.md new file mode 100644 index 000000000..aecafeecb --- /dev/null +++ b/.github/contributors/yuukos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alexey Kim | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 13-12-2017 | +| GitHub username | yuukos | +| Website (optional) | | diff --git a/.gitignore b/.gitignore index cb0a8e84e..14097dfcd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,12 @@ # spaCy spacy/data/ corpora/ -models/ +/models/ keys/ # Website website/www/ website/_deploy.sh -website/package.json -website/announcement.jade website/.gitignore # Cython / C extensions diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b64dc8db3..edd1ed30d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -3,6 +3,8 @@ This is a list of everyone who has made significant contributions to spaCy, in alphabetical order. Thanks a lot for the great work! * Adam Bittlingmayer, [@bittlingmayer](https://github.com/bittlingmayer) +* Alexey Kim, [@yuukos](https://github.com/yuukos) +* Alexis Eidelman, [@AlexisEidelman](https://github.com/AlexisEidelman) * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) * Aniruddha Adhikary [@aniruddha-adhikary](https://github.com/aniruddha-adhikary) @@ -16,6 +18,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo) * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi) * Eric Zhao, [@ericzhao28](https://github.com/ericzhao28) +* Francisco Aranda, [@frascuchon](https://github.com/frascuchon) * Greg Baker, [@solresol](https://github.com/solresol) * Grégory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard) * György Orosz, [@oroszgy](https://github.com/oroszgy) @@ -24,6 +27,9 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Ines Montani, [@ines](https://github.com/ines) * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading) * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan) +* Jim Geovedi, [@geovedi](https://github.com/geovedi) +* Jim Regan, [@jimregan](https://github.com/jimregan) +* Jeffrey Gerard, [@IamJeffG](https://github.com/IamJeffG) * Jordan Suchow, [@suchow](https://github.com/suchow) * Josh Reeter, [@jreeter](https://github.com/jreeter) * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) @@ -38,6 +44,8 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Michael Wallin, [@wallinm1](https://github.com/wallinm1) * Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) +* Orion Montoya, [@mdcclv](https://github.com/mdcclv) +* Paul O'Leary McCann, [@polm](https://github.com/polm) * Pokey Rule, [@pokey](https://github.com/pokey) * Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202) * Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort) @@ -45,12 +53,18 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Sam Bozek, [@sambozek](https://github.com/sambozek) * Sasho Savkov, [@savkov](https://github.com/savkov) * Shuvanon Razik, [@shuvanon](https://github.com/shuvanon) +* Swier, [@swierh](https://github.com/swierh) * Thomas Tanon, [@Tpt](https://github.com/Tpt) * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues) +* Vimos Tan, [@Vimos](https://github.com/Vimos) * Vsevolod Solovyov, [@vsolovyov](https://github.com/vsolovyov) * 
Wah Loon Keng, [@kengz](https://github.com/kengz) +* Wannaphong Phatthiyaphaibun, [@wannaphongcom](https://github.com/wannaphongcom) * Willem van Hage, [@wrvhage](https://github.com/wrvhage) * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker) +* Yam, [@hscspring](https://github.com/hscspring) * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) * Yasuaki Uechi, [@uetchy](https://github.com/uetchy) +* Yu-chun Huang, [@galaxyh](https://github.com/galaxyh) * Yubing Dong, [@tomtung](https://github.com/tomtung) +* Yuval Pinter, [@yuvalpinter](https://github.com/yuvalpinter) diff --git a/README.rst b/README.rst index 244308473..a503abbc0 100644 --- a/README.rst +++ b/README.rst @@ -1,15 +1,16 @@ spaCy: Industrial-strength NLP ****************************** -spaCy is a library for advanced natural language processing in Python and -Cython. spaCy is built on the very latest research, but it isn't researchware. -It was designed from day one to be used in real products. spaCy currently supports -English, German, French and Spanish, as well as tokenization for Italian, -Portuguese, Dutch, Swedish, Finnish, Norwegian, Danish, Hungarian, Polish, -Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software, -released under the MIT license. +spaCy is a library for advanced Natural Language Processing in Python and Cython. +It's built on the very latest research, and was designed from day one to be +used in real products. spaCy comes with +`pre-trained statistical models `_ and word +vectors, and currently supports tokenization for **20+ languages**. It features +the **fastest syntactic parser** in the world, convolutional **neural network models** +for tagging, parsing and **named entity recognition** and easy **deep learning** +integration. It's commercial open-source software, released under the MIT license. -💫 **Version 1.8 out now!** `Read the release notes here. `_ +💫 **Version 2.0 out now!** `Check out the new features here. `_ .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy @@ -38,68 +39,72 @@ released under the MIT license. 📖 Documentation ================ -=================== === -`Usage Workflows`_ How to use spaCy and its features. -`API Reference`_ The detailed reference for spaCy's API. -`Troubleshooting`_ Common problems and solutions for beginners. -`Tutorials`_ End-to-end examples, with code you can modify and run. -`Showcase & Demos`_ Demos, libraries and products from the spaCy community. -`Contribute`_ How to contribute to the spaCy project and code base. -=================== === +=================== === +`spaCy 101`_ New to spaCy? Here's everything you need to know! +`Usage Guides`_ How to use spaCy and its features. +`New in v2.0`_ New features, backwards incompatibilities and migration guide. +`API Reference`_ The detailed reference for spaCy's API. +`Models`_ Download statistical language models for spaCy. +`Resources`_ Libraries, extensions, demos, books and courses. +`Changelog`_ Changes and version history. +`Contribute`_ How to contribute to the spaCy project and code base. +=================== === -.. _Usage Workflows: https://spacy.io/docs/usage/ -.. _API Reference: https://spacy.io/docs/api/ -.. _Troubleshooting: https://spacy.io/docs/usage/troubleshooting -.. _Tutorials: https://spacy.io/docs/usage/tutorials -.. _Showcase & Demos: https://spacy.io/docs/usage/showcase +.. _spaCy 101: https://alpha.spacy.io/usage/spacy-101 +.. 
_New in v2.0: https://alpha.spacy.io/usage/v2#migrating +.. _Usage Guides: https://alpha.spacy.io/usage/ +.. _API Reference: https://alpha.spacy.io/api/ +.. _Models: https://alpha.spacy.io/models +.. _Resources: https://alpha.spacy.io/usage/resources +.. _Changelog: https://alpha.spacy.io/usage/#changelog .. _Contribute: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md 💬 Where to ask questions ========================== +The spaCy project is maintained by `@honnibal `_ +and `@ines `_. Please understand that we won't be able +to provide individual support via email. We also believe that help is much more +valuable if it's shared publicly, so that more people can benefit from it. + ====================== === -**Bug reports** `GitHub issue tracker`_ -**Usage questions** `StackOverflow`_, `Gitter chat`_, `Reddit user group`_ -**General discussion** `Gitter chat`_, `Reddit user group`_ -**Commercial support** contact@explosion.ai +**Bug Reports** `GitHub Issue Tracker`_ +**Usage Questions** `StackOverflow`_, `Gitter Chat`_, `Reddit User Group`_ +**General Discussion** `Gitter Chat`_, `Reddit User Group`_ ====================== === -.. _GitHub issue tracker: https://github.com/explosion/spaCy/issues +.. _GitHub Issue Tracker: https://github.com/explosion/spaCy/issues .. _StackOverflow: http://stackoverflow.com/questions/tagged/spacy -.. _Gitter chat: https://gitter.im/explosion/spaCy -.. _Reddit user group: https://www.reddit.com/r/spacynlp +.. _Gitter Chat: https://gitter.im/explosion/spaCy +.. _Reddit User Group: https://www.reddit.com/r/spacynlp Features ======== -* Non-destructive **tokenization** -* Syntax-driven sentence segmentation -* Pre-trained **word vectors** -* Part-of-speech tagging +* **Fastest syntactic parser** in the world * **Named entity** recognition -* Labelled dependency parsing -* Convenient string-to-int mapping -* Export to numpy data arrays -* GIL-free **multi-threading** -* Efficient binary serialization +* Non-destructive **tokenization** +* Support for **20+ languages** +* Pre-trained `statistical models `_ and word vectors * Easy **deep learning** integration -* Statistical models for **English**, **German**, **French** and **Spanish** +* Part-of-speech tagging +* Labelled dependency parsing +* Syntax-driven sentence segmentation +* Built in **visualizers** for syntax and NER +* Convenient string-to-hash mapping +* Export to numpy data arrays +* Efficient binary serialization +* Easy **model packaging** and deployment * State-of-the-art speed * Robust, rigorously evaluated accuracy -See `facts, figures and benchmarks `_. +📖 **For more details, see the** `facts, figures and benchmarks `_. -Top Performance ---------------- +Install spaCy +============= -* Fastest in the world: <50ms per document. No faster system has ever been - announced. -* Accuracy within 1% of the current state of the art on all tasks performed - (parsing, named entity recognition, part-of-speech tagging). The only more - accurate systems are an order of magnitude slower or more. - -Supports --------- +For detailed installation instructions, see +the `documentation `_. ==================== === **Operating system** macOS / OS X, Linux, Windows (Cygwin, MinGW, Visual Studio) @@ -110,12 +115,6 @@ Supports .. _pip: https://pypi.python.org/pypi/spacy .. _conda: https://anaconda.org/conda-forge/spacy -Install spaCy -============= - -Installation requires a working build environment. See notes on Ubuntu, -macOS/OS X and Windows for details. 
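
Before the individual install routes below, here is a minimal sketch of what the features listed above look like in code; it assumes the small English model ``en_core_web_sm`` has already been downloaded:

.. code:: python

    import spacy

    # Assumes the model was installed via: python -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Apple is looking at buying a U.K. startup for $1 billion.')

    # Tokenization, part-of-speech tags and syntactic dependencies
    for token in doc:
        print(token.text, token.pos_, token.dep_, token.head.text)

    # Named entity recognition
    for ent in doc.ents:
        print(ent.text, ent.label_)

A single call to ``nlp`` runs the whole pipeline (tokenizer, tagger, parser and entity recognizer) over the text.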
- pip --- @@ -123,7 +122,7 @@ Using pip, spaCy releases are currently only available as source packages. .. code:: bash - pip install -U spacy + pip install spacy When using pip it is generally recommended to install packages in a ``virtualenv`` to avoid modifying system state: @@ -149,25 +148,41 @@ For the feedstock including the build recipe and configuration, check out `this repository `_. Improvements and pull requests to the recipe and setup are always appreciated. +Updating spaCy +-------------- + +Some updates to spaCy may require downloading new statistical models. If you're +running spaCy v2.0 or higher, you can use the ``validate`` command to check if +your installed models are compatible and if not, print details on how to update +them: + +.. code:: bash + + pip install -U spacy + spacy validate + +If you've trained your own models, keep in mind that your training and runtime +inputs must match. After updating spaCy, we recommend **retraining your models** +with the new version. + +📖 **For details on upgrading from spaCy 1.x to spaCy 2.x, see the** +`migration guide `_. + Download models =============== As of v1.7.0, models for spaCy can be installed as **Python packages**. This means that they're a component of your application, just like any -other module. They're versioned and can be defined as a dependency in your -``requirements.txt``. Models can be installed from a download URL or -a local directory, manually or via pip. Their data can be located anywhere on -your file system. To make a model available to spaCy, all you need to do is -create a "shortcut link", an internal alias that tells spaCy where to find the -data files for a specific model name. +other module. Models can be installed using spaCy's ``download`` command, +or manually by pointing pip to a path or URL. ======================= === -`spaCy Models`_ Available models, latest releases and direct download. +`Available Models`_ Detailed model descriptions, accuracy figures and benchmarks. `Models Documentation`_ Detailed usage instructions. ======================= === -.. _spaCy Models: https://github.com/explosion/spacy-models/releases/ -.. _Models Documentation: https://spacy.io/docs/usage/models +.. _Available Models: https://alpha.spacy.io/models +.. _Models Documentation: https://alpha.spacy.io/docs/usage/models .. code:: bash @@ -175,17 +190,10 @@ data files for a specific model name. python -m spacy download en # download best-matching version of specific model for your spaCy installation - python -m spacy download en_core_web_md + python -m spacy download en_core_web_lg # pip install .tar.gz archive from path or URL - pip install /Users/you/en_core_web_md-1.2.0.tar.gz - pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-1.2.0/en_core_web_md-1.2.0.tar.gz - - # set up shortcut link to load installed package as "en_default" - python -m spacy link en_core_web_md en_default - - # set up shortcut link to load local model as "my_amazing_model" - python -m spacy link /Users/you/data my_amazing_model + pip install /Users/you/en_core_web_sm-2.0.0.tar.gz Loading and using models ------------------------ @@ -199,24 +207,24 @@ To load a model, use ``spacy.load()`` with the model's shortcut link: doc = nlp(u'This is a sentence.') If you've installed a model via pip, you can also ``import`` it directly and -then call its ``load()`` method with no arguments. This should also work for -older models in previous versions of spaCy. +then call its ``load()`` method: .. 
code:: python

     import spacy
-    import en_core_web_md
+    import en_core_web_sm

-    nlp = en_core_web_md.load()
+    nlp = en_core_web_sm.load()
     doc = nlp(u'This is a sentence.')

-📖 **For more info and examples, check out the** `models documentation `_.
+📖 **For more info and examples, check out the**
+`models documentation `_.

 Support for older versions
 --------------------------

-If you're using an older version (v1.6.0 or below), you can still download and
-install the old models from within spaCy using ``python -m spacy.en.download all``
+If you're using an older version (``v1.6.0`` or below), you can still download
+and install the old models from within spaCy using ``python -m spacy.en.download all``
 or ``python -m spacy.de.download all``. The ``.tar.gz`` archives are also
 `attached to the v1.6.0 release `_.
 To download and install the models manually, unpack the archive, drop the
@@ -248,11 +256,13 @@ details.

     pip install -r requirements.txt
     pip install -e .

-Compared to regular install via pip `requirements.txt `_
+Compared to regular install via pip, `requirements.txt `_
 additionally installs developer dependencies such as Cython.

-
 Instead of the above verbose commands, you can also use the following
-`Fabric `_ commands:
+`Fabric `_ commands. All commands assume that your
+``virtualenv`` is located in a directory ``.env``. If you're using a different
+directory, you can change it via the environment variable ``VENV_DIR``, for
+example ``VENV_DIR=".custom-env" fab clean make``.

 ============= ===
 ``fab env``   Create ``virtualenv`` and delete previous one, if it exists.
 ``fab make``  Compile the source.
 ``fab clean`` Remove Python and Cython cache files.
 ``fab test``  Run basic tests, aborting after first failure.
 ============= ===

-All commands assume that your ``virtualenv`` is located in a directory ``.env``.
-If you're using a different directory, you can change it via the environment
-variable ``VENV_DIR``, for example:
-
-..
code:: bash - - VENV_DIR=".custom-env" fab clean make - Ubuntu ------ @@ -310,76 +312,4 @@ and ``--model`` are optional and enable additional tests: # make sure you are using recent pytest version python -m pip install -U pytest - python -m pytest - -🛠 Changelog -============ - -=========== ============== =========== -Version Date Description -=========== ============== =========== -`v1.8.2`_ ``2017-04-26`` French model and small improvements -`v1.8.1`_ ``2017-04-23`` Saving, loading and training bug fixes -`v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading -`v1.7.5`_ ``2017-04-07`` Bug fixes and new CLI commands -`v1.7.3`_ ``2017-03-26`` Alpha support for Hebrew, new CLI commands and bug fixes -`v1.7.2`_ ``2017-03-20`` Small fixes to beam parser and model linking -`v1.7.1`_ ``2017-03-19`` Fix data download for system installation -`v1.7.0`_ ``2017-03-18`` New 50 MB model, CLI, better downloads and lots of bug fixes -`v1.6.0`_ ``2017-01-16`` Improvements to tokenizer and tests -`v1.5.0`_ ``2016-12-27`` Alpha support for Swedish and Hungarian -`v1.4.0`_ ``2016-12-18`` Improved language data and alpha Dutch support -`v1.3.0`_ ``2016-12-03`` Improve API consistency -`v1.2.0`_ ``2016-11-04`` Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese -`v1.1.0`_ ``2016-10-23`` Bug fixes and adjustments -`v1.0.0`_ ``2016-10-18`` Support for deep learning workflows and entity-aware rule matcher -`v0.101.0`_ ``2016-05-10`` Fixed German model -`v0.100.7`_ ``2016-05-05`` German support -`v0.100.6`_ ``2016-03-08`` Add support for GloVe vectors -`v0.100.5`_ ``2016-02-07`` Fix incorrect use of header file -`v0.100.4`_ ``2016-02-07`` Fix OSX problem introduced in 0.100.3 -`v0.100.3`_ ``2016-02-06`` Multi-threading, faster loading and bugfixes -`v0.100.2`_ ``2016-01-21`` Fix data version lock -`v0.100.1`_ ``2016-01-21`` Fix install for OSX -`v0.100`_ ``2016-01-19`` Revise setup.py, better model downloads, bug fixes -`v0.99`_ ``2015-11-08`` Improve span merging, internal refactoring -`v0.98`_ ``2015-11-03`` Smaller package, bug fixes -`v0.97`_ ``2015-10-23`` Load the StringStore from a json list, instead of a text file -`v0.96`_ ``2015-10-19`` Hotfix to .merge method -`v0.95`_ ``2015-10-18`` Bug fixes -`v0.94`_ ``2015-10-09`` Fix memory and parse errors -`v0.93`_ ``2015-09-22`` Bug fixes to word vectors -=========== ============== =========== - -.. _v1.8.2: https://github.com/explosion/spaCy/releases/tag/v1.8.2 -.. _v1.8.1: https://github.com/explosion/spaCy/releases/tag/v1.8.1 -.. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 -.. _v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5 -.. _v1.7.3: https://github.com/explosion/spaCy/releases/tag/v1.7.3 -.. _v1.7.2: https://github.com/explosion/spaCy/releases/tag/v1.7.2 -.. _v1.7.1: https://github.com/explosion/spaCy/releases/tag/v1.7.1 -.. _v1.7.0: https://github.com/explosion/spaCy/releases/tag/v1.7.0 -.. _v1.6.0: https://github.com/explosion/spaCy/releases/tag/v1.6.0 -.. _v1.5.0: https://github.com/explosion/spaCy/releases/tag/v1.5.0 -.. _v1.4.0: https://github.com/explosion/spaCy/releases/tag/v1.4.0 -.. _v1.3.0: https://github.com/explosion/spaCy/releases/tag/v1.3.0 -.. _v1.2.0: https://github.com/explosion/spaCy/releases/tag/v1.2.0 -.. _v1.1.0: https://github.com/explosion/spaCy/releases/tag/v1.1.0 -.. _v1.0.0: https://github.com/explosion/spaCy/releases/tag/v1.0.0 -.. _v0.101.0: https://github.com/explosion/spaCy/releases/tag/0.101.0 -.. 
_v0.100.7: https://github.com/explosion/spaCy/releases/tag/0.100.7 -.. _v0.100.6: https://github.com/explosion/spaCy/releases/tag/0.100.6 -.. _v0.100.5: https://github.com/explosion/spaCy/releases/tag/0.100.5 -.. _v0.100.4: https://github.com/explosion/spaCy/releases/tag/0.100.4 -.. _v0.100.3: https://github.com/explosion/spaCy/releases/tag/0.100.3 -.. _v0.100.2: https://github.com/explosion/spaCy/releases/tag/0.100.2 -.. _v0.100.1: https://github.com/explosion/spaCy/releases/tag/0.100.1 -.. _v0.100: https://github.com/explosion/spaCy/releases/tag/0.100 -.. _v0.99: https://github.com/explosion/spaCy/releases/tag/0.99 -.. _v0.98: https://github.com/explosion/spaCy/releases/tag/0.98 -.. _v0.97: https://github.com/explosion/spaCy/releases/tag/0.97 -.. _v0.96: https://github.com/explosion/spaCy/releases/tag/0.96 -.. _v0.95: https://github.com/explosion/spaCy/releases/tag/0.95 -.. _v0.94: https://github.com/explosion/spaCy/releases/tag/0.94 -.. _v0.93: https://github.com/explosion/spaCy/releases/tag/0.93 diff --git a/bin/get_freqs.py b/bin/get_freqs.py deleted file mode 100755 index 54d90ef8c..000000000 --- a/bin/get_freqs.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python - -from __future__ import unicode_literals, print_function - -import plac -import joblib -from os import path -import os -import bz2 -import ujson -from preshed.counter import PreshCounter -from joblib import Parallel, delayed -import io - -from spacy.en import English -from spacy.strings import StringStore -from spacy.attrs import ORTH -from spacy.tokenizer import Tokenizer -from spacy.vocab import Vocab - - -def iter_comments(loc): - with bz2.BZ2File(loc) as file_: - for line in file_: - yield ujson.loads(line) - - -def count_freqs(input_loc, output_loc): - print(output_loc) - vocab = English.default_vocab(get_lex_attr=None) - tokenizer = Tokenizer.from_dir(vocab, - path.join(English.default_data_dir(), 'tokenizer')) - - counts = PreshCounter() - for json_comment in iter_comments(input_loc): - doc = tokenizer(json_comment['body']) - doc.count_by(ORTH, counts=counts) - - with io.open(output_loc, 'w', 'utf8') as file_: - for orth, freq in counts: - string = tokenizer.vocab.strings[orth] - if not string.isspace(): - file_.write('%d\t%s\n' % (freq, string)) - - -def parallelize(func, iterator, n_jobs): - Parallel(n_jobs=n_jobs)(delayed(func)(*item) for item in iterator) - - -def merge_counts(locs, out_loc): - string_map = StringStore() - counts = PreshCounter() - for loc in locs: - with io.open(loc, 'r', encoding='utf8') as file_: - for line in file_: - freq, word = line.strip().split('\t', 1) - orth = string_map[word] - counts.inc(orth, int(freq)) - with io.open(out_loc, 'w', encoding='utf8') as file_: - for orth, count in counts: - string = string_map[orth] - file_.write('%d\t%s\n' % (count, string)) - - -@plac.annotations( - input_loc=("Location of input file list"), - freqs_dir=("Directory for frequency files"), - output_loc=("Location for output file"), - n_jobs=("Number of workers", "option", "n", int), - skip_existing=("Skip inputs where an output file exists", "flag", "s", bool), -) -def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False): - tasks = [] - outputs = [] - for input_path in open(input_loc): - input_path = input_path.strip() - if not input_path: - continue - filename = input_path.split('/')[-1] - output_path = path.join(freqs_dir, filename.replace('bz2', 'freq')) - outputs.append(output_path) - if not path.exists(output_path) or not skip_existing: - tasks.append((input_path, 
output_path)) - - if tasks: - parallelize(count_freqs, tasks, n_jobs) - - print("Merge") - merge_counts(outputs, output_loc) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/munge_ewtb.py b/bin/munge_ewtb.py deleted file mode 100755 index 4e21ceb07..000000000 --- a/bin/munge_ewtb.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python -from __future__ import unicode_literals - -from xml.etree import cElementTree as ElementTree -import json -import re - -import plac -from pathlib import Path -from os import path - - -escaped_tokens = { - '-LRB-': '(', - '-RRB-': ')', - '-LSB-': '[', - '-RSB-': ']', - '-LCB-': '{', - '-RCB-': '}', -} - -def read_parses(parse_loc): - offset = 0 - doc = [] - for parse in open(str(parse_loc) + '.dep').read().strip().split('\n\n'): - parse = _adjust_token_ids(parse, offset) - offset += len(parse.split('\n')) - doc.append(parse) - return doc - -def _adjust_token_ids(parse, offset): - output = [] - for line in parse.split('\n'): - pieces = line.split() - pieces[0] = str(int(pieces[0]) + offset) - pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0' - output.append('\t'.join(pieces)) - return '\n'.join(output) - - -def _fmt_doc(filename, paras): - return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]} - - -def _fmt_para(raw, sents): - return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]} - - -def _fmt_sent(sent): - return { - 'tokens': [_fmt_token(*t.split()) for t in sent.strip().split('\n')], - 'brackets': []} - - -def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3): - head = int(head) - 1 - id_ = int(id_) - 1 - head = (head - id_) if head != -1 else 0 - return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head} - - -tags_re = re.compile(r'<[\w\?/][^>]+>') -def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'): - ewtb_dir = Path(ewtb_dir) - out_dir = Path(out_dir) - if not out_dir.exists(): - out_dir.mkdir() - for genre_dir in ewtb_dir.joinpath('data').iterdir(): - #if 'answers' in str(genre_dir): continue - parse_dir = genre_dir.joinpath('penntree') - docs = [] - for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir(): - filename = source_loc.parts[-1].replace('.sgm.sgm', '') - filename = filename.replace('.xml', '') - filename = filename.replace('.txt', '') - parse_loc = parse_dir.joinpath(filename + '.xml.tree') - parses = read_parses(parse_loc) - source = source_loc.open().read().strip() - if 'answers' in str(genre_dir): - source = tags_re.sub('', source).strip() - docs.append(_fmt_doc(filename, [[source, parses]])) - - out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json') - with open(str(out_loc), 'w') as out_file: - out_file.write(json.dumps(docs, indent=4)) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/ner_tag.py b/bin/ner_tag.py deleted file mode 100644 index f990f21a1..000000000 --- a/bin/ner_tag.py +++ /dev/null @@ -1,32 +0,0 @@ -import io -import plac - -from spacy.en import English - - -def main(text_loc): - with io.open(text_loc, 'r', encoding='utf8') as file_: - text = file_.read() - NLU = English() - for paragraph in text.split('\n\n'): - tokens = NLU(paragraph) - - ent_starts = {} - ent_ends = {} - for span in tokens.ents: - ent_starts[span.start] = span.label_ - ent_ends[span.end] = span.label_ - - output = [] - for token in tokens: - if token.i in ent_starts: - output.append('<%s>' % ent_starts[token.i]) - output.append(token.orth_) - if (token.i+1) in ent_ends: - output.append('' 
% ent_ends[token.i+1]) - output.append('\n\n') - print ' '.join(output) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/parser/conll_train.py b/bin/parser/conll_train.py deleted file mode 100755 index 8075dcd8a..000000000 --- a/bin/parser/conll_train.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals - -import os -from os import path -import shutil -import io -import random -import time -import gzip - -import plac -import cProfile -import pstats - -import spacy.util -from spacy.en import English -from spacy.gold import GoldParse - -from spacy.syntax.util import Config -from spacy.syntax.arc_eager import ArcEager -from spacy.syntax.parser import Parser -from spacy.scorer import Scorer -from spacy.tagger import Tagger - -# Last updated for spaCy v0.97 - - -def read_conll(file_): - """Read a standard CoNLL/MALT-style format""" - sents = [] - for sent_str in file_.read().strip().split('\n\n'): - ids = [] - words = [] - heads = [] - labels = [] - tags = [] - for i, line in enumerate(sent_str.split('\n')): - word, pos_string, head_idx, label = _parse_line(line) - words.append(word) - if head_idx < 0: - head_idx = i - ids.append(i) - heads.append(head_idx) - labels.append(label) - tags.append(pos_string) - text = ' '.join(words) - annot = (ids, words, tags, heads, labels, ['O'] * len(ids)) - sents.append((None, [(annot, [])])) - return sents - - -def _parse_line(line): - pieces = line.split() - if len(pieces) == 4: - word, pos, head_idx, label = pieces - head_idx = int(head_idx) - elif len(pieces) == 15: - id_ = int(pieces[0].split('_')[-1]) - word = pieces[1] - pos = pieces[4] - head_idx = int(pieces[8])-1 - label = pieces[10] - else: - id_ = int(pieces[0].split('_')[-1]) - word = pieces[1] - pos = pieces[4] - head_idx = int(pieces[6])-1 - label = pieces[7] - if head_idx == 0: - label = 'ROOT' - return word, pos, head_idx, label - - -def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.parser(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=False) - scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct')) - - -def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, - gold_preproc=False, force_gold=False): - dep_model_dir = path.join(model_dir, 'deps') - pos_model_dir = path.join(model_dir, 'pos') - if path.exists(dep_model_dir): - shutil.rmtree(dep_model_dir) - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) - os.mkdir(dep_model_dir) - os.mkdir(pos_model_dir) - - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=ArcEager.get_labels(gold_tuples)) - - nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) - nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) - nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) - - print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") - for itn in range(n_iter): - scorer = Scorer() - loss = 0 - for _, sents in gold_tuples: - for annot_tuples, _ in sents: - if len(annot_tuples[1]) == 1: - continue - - score_model(scorer, nlp, None, annot_tuples, verbose=False) - - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=True) - if not gold.is_projective: - raise Exception( - "Non-projective sentence in training, after we should " - 
"have enforced projectivity: %s" % annot_tuples - ) - - loss += nlp.parser.train(tokens, gold) - nlp.tagger.train(tokens, gold.tags) - random.shuffle(gold_tuples) - print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, - scorer.tags_acc, scorer.token_acc)) - print('end training') - nlp.end_training(model_dir) - print('done') - - -@plac.annotations( - train_loc=("Location of CoNLL 09 formatted training file"), - dev_loc=("Location of CoNLL 09 formatted development file"), - model_dir=("Location of output model directory"), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - n_iter=("Number of training iterations", "option", "i", int), -) -def main(train_loc, dev_loc, model_dir, n_iter=15): - with io.open(train_loc, 'r', encoding='utf8') as file_: - train_sents = read_conll(file_) - if not eval_only: - train(English, train_sents, model_dir, n_iter=n_iter) - nlp = English(data_dir=model_dir) - dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8')) - scorer = Scorer() - for _, sents in dev_sents: - for annot_tuples, _ in sents: - score_model(scorer, nlp, None, annot_tuples) - print('TOK', 100-scorer.token_acc) - print('POS', scorer.tags_acc) - print('UAS', scorer.uas) - print('LAS', scorer.las) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/parser/train.py b/bin/parser/train.py deleted file mode 100755 index 26b545b6d..000000000 --- a/bin/parser/train.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals -from __future__ import print_function - -import os -from os import path -import shutil -import io -import random - -import plac -import re - -import spacy.util - -from spacy.syntax.util import Config -from spacy.gold import read_json_file -from spacy.gold import GoldParse -from spacy.gold import merge_sents - -from spacy.scorer import Scorer - -from spacy.syntax.arc_eager import ArcEager -from spacy.syntax.ner import BiluoPushDown -from spacy.tagger import Tagger -from spacy.syntax.parser import Parser -from spacy.syntax.nonproj import PseudoProjectivity - - -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c == ' ': - return '\n' - elif c == '\n': - return ' ' - elif c in ['.', "'", "!", "?"]: - return '' - else: - return c.lower() - - -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return ''.join(_corrupt(c, noise_level) for c in orig) - - -def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - else: - tokens = nlp.tokenizer(raw_text) - nlp.tagger(tokens) - nlp.entity(tokens) - nlp.parser(tokens) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=verbose) - - -def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg, - n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0): - print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %") - format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' - with Language.train(model_dir, train_data, - tagger_cfg, parser_cfg, entity_cfg) as trainer: - loss = 0 - for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc, - augment_data=None)): - for doc, gold in epoch: - 
trainer.update(doc, gold) - dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc) - print(format_str.format(itn, trainer.nlp.parser.model.nr_weight, - trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores)) - - -def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, - beam_width=None, cand_preproc=None): - print("Load parser", model_dir) - nlp = Language(path=model_dir) - if nlp.lang == 'de': - nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string]) - if beam_width is not None: - nlp.parser.cfg.beam_width = beam_width - scorer = Scorer() - for raw_text, sents in gold_tuples: - if gold_preproc: - raw_text = None - else: - sents = merge_sents(sents) - for annot_tuples, brackets in sents: - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.parser(tokens) - nlp.entity(tokens) - else: - tokens = nlp(raw_text) - gold = GoldParse.from_annot_tuples(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=verbose) - return scorer - - -def write_parses(Language, dev_loc, model_dir, out_loc): - nlp = Language(data_dir=model_dir) - gold_tuples = read_json_file(dev_loc) - scorer = Scorer() - out_file = io.open(out_loc, 'w', 'utf8') - for raw_text, sents in gold_tuples: - sents = _merge_sents(sents) - for annot_tuples, brackets in sents: - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.entity(tokens) - nlp.parser(tokens) - else: - tokens = nlp(raw_text) - #gold = GoldParse(tokens, annot_tuples) - #scorer.score(tokens, gold, verbose=False) - for sent in tokens.sents: - for t in sent: - if not t.is_space: - out_file.write( - '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_) - ) - out_file.write('\n') - - -@plac.annotations( - language=("The language to train", "positional", None, str, ['en','de', 'zh']), - train_loc=("Location of training file or directory"), - dev_loc=("Location of development file or directory"), - model_dir=("Location of output model directory",), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - corruption_level=("Amount of noise to add to training data", "option", "c", float), - gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), - out_loc=("Out location", "option", "o", str), - n_sents=("Number of training sentences", "option", "n", int), - n_iter=("Number of training iterations", "option", "i", int), - verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", "d", bool), - pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool), - L1=("L1 regularization penalty", "option", "L", float), -) -def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False, - L1=1e-6): - parser_cfg = dict(locals()) - tagger_cfg = dict(locals()) - entity_cfg = dict(locals()) - - lang = spacy.util.get_lang_class(language) - - parser_cfg['features'] = lang.Defaults.parser_features - entity_cfg['features'] = lang.Defaults.entity_features - - if not eval_only: - gold_train = list(read_json_file(train_loc)) - gold_dev = list(read_json_file(dev_loc)) - if n_sents > 0: - gold_train = gold_train[:n_sents] - train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg, - n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level, - n_iter=n_iter) - if 
out_loc: - write_parses(lang, dev_loc, model_dir, out_loc) - scorer = evaluate(lang, list(read_json_file(dev_loc)), - model_dir, gold_preproc=gold_preproc, verbose=verbose) - print('TOK', scorer.token_acc) - print('POS', scorer.tags_acc) - print('UAS', scorer.uas) - print('LAS', scorer.las) - - print('NER P', scorer.ents_p) - print('NER R', scorer.ents_r) - print('NER F', scorer.ents_f) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py deleted file mode 100644 index 53ef906d5..000000000 --- a/bin/parser/train_ud.py +++ /dev/null @@ -1,201 +0,0 @@ -from __future__ import unicode_literals, print_function -import plac -import json -import random -import pathlib - -from spacy.tokens import Doc -from spacy.syntax.nonproj import PseudoProjectivity -from spacy.language import Language -from spacy.gold import GoldParse -from spacy.tagger import Tagger -from spacy.pipeline import DependencyParser, TokenVectorEncoder -from spacy.syntax.parser import get_templates -from spacy.syntax.arc_eager import ArcEager -from spacy.scorer import Scorer -from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP -import spacy.attrs -import io -from thinc.neural.ops import CupyOps -from thinc.neural import Model -from spacy.es import Spanish -from spacy.attrs import POS - - -from thinc.neural import Model - - -try: - import cupy - from thinc.neural.ops import CupyOps -except: - cupy = None - - -def read_conllx(loc, n=0): - with io.open(loc, 'r', encoding='utf8') as file_: - text = file_.read() - i = 0 - for sent in text.strip().split('\n\n'): - lines = sent.strip().split('\n') - if lines: - while lines[0].startswith('#'): - lines.pop(0) - tokens = [] - for line in lines: - id_, word, lemma, pos, tag, morph, head, dep, _1, \ - _2 = line.split('\t') - if '-' in id_ or '.' in id_: - continue - try: - id_ = int(id_) - 1 - head = (int(head) - 1) if head != '0' else id_ - dep = 'ROOT' if dep == 'root' else dep #'unlabelled' - tag = pos+'__'+dep+'__'+morph - Spanish.Defaults.tag_map[tag] = {POS: pos} - tokens.append((id_, word, tag, head, dep, 'O')) - except: - raise - tuples = [list(t) for t in zip(*tokens)] - yield (None, [[tuples, []]]) - i += 1 - if n >= 1 and i >= n: - break - - -def score_model(vocab, encoder, parser, Xs, ys, verbose=False): - scorer = Scorer() - correct = 0. - total = 0. 
- for doc, gold in zip(Xs, ys): - doc = Doc(vocab, words=[w.text for w in doc]) - encoder(doc) - parser(doc) - PseudoProjectivity.deprojectivize(doc) - scorer.score(doc, gold, verbose=verbose) - for token, tag in zip(doc, gold.tags): - if '_' in token.tag_: - univ_guess, _ = token.tag_.split('_', 1) - else: - univ_guess = '' - univ_truth, _ = tag.split('_', 1) - correct += univ_guess == univ_truth - total += 1 - return scorer - - -def organize_data(vocab, train_sents): - Xs = [] - ys = [] - for _, doc_sents in train_sents: - for (ids, words, tags, heads, deps, ner), _ in doc_sents: - doc = Doc(vocab, words=words) - gold = GoldParse(doc, tags=tags, heads=heads, deps=deps) - Xs.append(doc) - ys.append(gold) - return Xs, ys - - -def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None): - LangClass = spacy.util.get_lang_class(lang_name) - train_sents = list(read_conllx(train_loc)) - dev_sents = list(read_conllx(dev_loc)) - train_sents = PseudoProjectivity.preprocess_training_data(train_sents) - - actions = ArcEager.get_actions(gold_parses=train_sents) - features = get_templates('basic') - - model_dir = pathlib.Path(model_dir) - if not model_dir.exists(): - model_dir.mkdir() - if not (model_dir / 'deps').exists(): - (model_dir / 'deps').mkdir() - if not (model_dir / 'pos').exists(): - (model_dir / 'pos').mkdir() - with (model_dir / 'deps' / 'config.json').open('wb') as file_: - file_.write( - json.dumps( - {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8')) - - vocab = LangClass.Defaults.create_vocab() - if not (model_dir / 'vocab').exists(): - (model_dir / 'vocab').mkdir() - else: - if (model_dir / 'vocab' / 'strings.json').exists(): - with (model_dir / 'vocab' / 'strings.json').open() as file_: - vocab.strings.load(file_) - if (model_dir / 'vocab' / 'lexemes.bin').exists(): - vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin') - - if clusters_loc is not None: - clusters_loc = pathlib.Path(clusters_loc) - with clusters_loc.open() as file_: - for line in file_: - try: - cluster, word, freq = line.split() - except ValueError: - continue - lex = vocab[word] - lex.cluster = int(cluster[::-1], 2) - # Populate vocab - for _, doc_sents in train_sents: - for (ids, words, tags, heads, deps, ner), _ in doc_sents: - for word in words: - _ = vocab[word] - for dep in deps: - _ = vocab[dep] - for tag in tags: - _ = vocab[tag] - if vocab.morphology.tag_map: - for tag in tags: - vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]} - tagger = Tagger(vocab) - encoder = TokenVectorEncoder(vocab, width=64) - parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0) - - Xs, ys = organize_data(vocab, train_sents) - dev_Xs, dev_ys = organize_data(vocab, dev_sents) - with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer): - docs = list(Xs) - for doc in docs: - encoder(doc) - nn_loss = [0.] - def track_progress(): - with encoder.tagger.use_params(optimizer.averages): - with parser.model.use_params(optimizer.averages): - scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys) - itn = len(nn_loss) - print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc)) - nn_loss.append(0.) 
- track_progress() - trainer.each_epoch.append(track_progress) - trainer.batch_size = 24 - trainer.nb_epoch = 40 - for docs, golds in trainer.iterate(Xs, ys, progress_bar=True): - docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs] - tokvecs, upd_tokvecs = encoder.begin_update(docs) - for doc, tokvec in zip(docs, tokvecs): - doc.tensor = tokvec - d_tokvecs = parser.update(docs, golds, sgd=optimizer) - upd_tokvecs(d_tokvecs, sgd=optimizer) - encoder.update(docs, golds, sgd=optimizer) - nlp = LangClass(vocab=vocab, parser=parser) - scorer = score_model(vocab, encoder, parser, read_conllx(dev_loc)) - print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) - #nlp.end_training(model_dir) - #scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) - #print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) - - -if __name__ == '__main__': - import cProfile - import pstats - if 1: - plac.call(main) - else: - cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - s = pstats.Stats("Profile.prof") - s.strip_dirs().sort_stats("time").print_stats() - - - plac.call(main) diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py deleted file mode 100644 index f9f4eec21..000000000 --- a/bin/prepare_treebank.py +++ /dev/null @@ -1,194 +0,0 @@ -"""Convert OntoNotes into a json format. - -doc: { - id: string, - paragraphs: [{ - raw: string, - sents: [int], - tokens: [{ - start: int, - tag: string, - head: int, - dep: string}], - ner: [{ - start: int, - end: int, - label: string}], - brackets: [{ - start: int, - end: int, - label: string}]}]} - -Consumes output of spacy/munge/align_raw.py -""" -from __future__ import unicode_literals -import plac -import json -from os import path -import os -import re -import io -from collections import defaultdict - -from spacy.munge import read_ptb -from spacy.munge import read_conll -from spacy.munge import read_ner - - -def _iter_raw_files(raw_loc): - files = json.load(open(raw_loc)) - for f in files: - yield f - - -def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text): - ptb_sents = read_ptb.split(ptb_text) - dep_sents = read_conll.split(dep_text) - if len(ptb_sents) != len(dep_sents): - return None - if ner_text is not None: - ner_sents = read_ner.split(ner_text) - else: - ner_sents = [None] * len(ptb_sents) - - i = 0 - doc = {'id': file_id} - if raw_paras is None: - doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)] - #for ptb_sent, dep_sent, ner_sent in zip(ptb_sents, dep_sents, ner_sents): - # doc['paragraphs'].append(format_para(None, [ptb_sent], [dep_sent], [ner_sent])) - else: - doc['paragraphs'] = [] - for raw_sents in raw_paras: - para = format_para( - ' '.join(raw_sents).replace('', ''), - ptb_sents[i:i+len(raw_sents)], - dep_sents[i:i+len(raw_sents)], - ner_sents[i:i+len(raw_sents)]) - if para['sentences']: - doc['paragraphs'].append(para) - i += len(raw_sents) - return doc - - -def format_para(raw_text, ptb_sents, dep_sents, ner_sents): - para = {'raw': raw_text, 'sentences': []} - offset = 0 - assert len(ptb_sents) == len(dep_sents) == len(ner_sents) - for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents): - _, deps = read_conll.parse(dep_text, strip_bad_periods=True) - if deps and 'VERB' in [t['tag'] for t in deps]: - continue - if ner_text is not None: - _, ner = read_ner.parse(ner_text, strip_bad_periods=True) - else: - ner = ['-' for _ in deps] - _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True) - # 
Necessary because the ClearNLP converter deletes EDITED words. - if len(ner) != len(deps): - ner = ['-' for _ in deps] - para['sentences'].append(format_sentence(deps, ner, brackets)) - return para - - -def format_sentence(deps, ner, brackets): - sent = {'tokens': [], 'brackets': []} - for token_id, (token, token_ent) in enumerate(zip(deps, ner)): - sent['tokens'].append(format_token(token_id, token, token_ent)) - - for label, start, end in brackets: - if start != end: - sent['brackets'].append({ - 'label': label, - 'first': start, - 'last': (end-1)}) - return sent - - -def format_token(token_id, token, ner): - assert token_id == token['id'] - head = (token['head'] - token_id) if token['head'] != -1 else 0 - return { - 'id': token_id, - 'orth': token['word'], - 'tag': token['tag'], - 'head': head, - 'dep': token['dep'], - 'ner': ner} - - -def read_file(*pieces): - loc = path.join(*pieces) - if not path.exists(loc): - return None - else: - return io.open(loc, 'r', encoding='utf8').read().strip() - - -def get_file_names(section_dir, subsection): - filenames = [] - for fn in os.listdir(path.join(section_dir, subsection)): - filenames.append(fn.rsplit('.', 1)[0]) - return list(sorted(set(filenames))) - - -def read_wsj_with_source(onto_dir, raw_dir): - # Now do WSJ, with source alignment - onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj') - docs = {} - for i in range(25): - section = str(i) if i >= 10 else ('0' + str(i)) - raw_loc = path.join(raw_dir, 'wsj%s.json' % section) - for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)): - if section == '00': - j += 1 - if section == '04' and filename == '55': - continue - ptb = read_file(onto_dir, section, '%s.parse' % filename) - dep = read_file(onto_dir, section, '%s.parse.dep' % filename) - ner = read_file(onto_dir, section, '%s.name' % filename) - if ptb is not None and dep is not None: - docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner) - return docs - - -def get_doc(onto_dir, file_path, wsj_docs): - filename = file_path.rsplit('/', 1)[1] - if filename in wsj_docs: - return wsj_docs[filename] - else: - ptb = read_file(onto_dir, file_path + '.parse') - dep = read_file(onto_dir, file_path + '.parse.dep') - ner = read_file(onto_dir, file_path + '.name') - if ptb is not None and dep is not None: - return format_doc(filename, None, ptb, dep, ner) - else: - return None - - -def read_ids(loc): - return open(loc).read().strip().split('\n') - - -def main(onto_dir, raw_dir, out_dir): - wsj_docs = read_wsj_with_source(onto_dir, raw_dir) - - for partition in ('train', 'test', 'development'): - ids = read_ids(path.join(onto_dir, '%s.id' % partition)) - docs_by_genre = defaultdict(list) - for file_path in ids: - doc = get_doc(onto_dir, file_path, wsj_docs) - if doc is not None: - genre = file_path.split('/')[3] - docs_by_genre[genre].append(doc) - part_dir = path.join(out_dir, partition) - if not path.exists(part_dir): - os.mkdir(part_dir) - for genre, docs in sorted(docs_by_genre.items()): - out_loc = path.join(part_dir, genre + '.json') - with open(out_loc, 'w') as file_: - json.dump(docs, file_, indent=4) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/prepare_vecs.py b/bin/prepare_vecs.py deleted file mode 100644 index b55dafee3..000000000 --- a/bin/prepare_vecs.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Read a vector file, and prepare it as binary data, for easy consumption""" - -import plac - -from spacy.vocab import write_binary_vectors - - -def main(in_loc, out_loc): - 
write_binary_vectors(in_loc, out_loc) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/tagger/train.py b/bin/tagger/train.py deleted file mode 100755 index 9cd8cc011..000000000 --- a/bin/tagger/train.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals -from __future__ import print_function - -import os -from os import path -import shutil -import codecs -import random - -import plac -import re - -import spacy.util -from spacy.en import English - -from spacy.tagger import Tagger - -from spacy.syntax.util import Config -from spacy.gold import read_json_file -from spacy.gold import GoldParse - -from spacy.scorer import Scorer - - -def score_model(scorer, nlp, raw_text, annot_tuples): - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - else: - tokens = nlp.tokenizer(raw_text) - nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold) - - -def _merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), brackets in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) - i += len(ids) - return [(m_deps, m_brackets)] - - -def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', - seed=0, gold_preproc=False, n_sents=0, corruption_level=0, - beam_width=1, verbose=False, - use_orig_arc_eager=False): - if n_sents > 0: - gold_tuples = gold_tuples[:n_sents] - - templates = Tagger.default_templates() - nlp = Language(data_dir=model_dir, tagger=False) - nlp.tagger = Tagger.blank(nlp.vocab, templates) - - print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") - for itn in range(n_iter): - scorer = Scorer() - loss = 0 - for raw_text, sents in gold_tuples: - if gold_preproc: - raw_text = None - else: - sents = _merge_sents(sents) - for annot_tuples, ctnt in sents: - words = annot_tuples[1] - gold_tags = annot_tuples[2] - score_model(scorer, nlp, raw_text, annot_tuples) - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(words) - else: - tokens = nlp.tokenizer(raw_text) - loss += nlp.tagger.train(tokens, gold_tags) - random.shuffle(gold_tuples) - print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, - scorer.tags_acc, - scorer.token_acc)) - nlp.end_training(model_dir) - -def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, - beam_width=None): - nlp = Language(data_dir=model_dir) - if beam_width is not None: - nlp.parser.cfg.beam_width = beam_width - scorer = Scorer() - for raw_text, sents in gold_tuples: - if gold_preproc: - raw_text = None - else: - sents = _merge_sents(sents) - for annot_tuples, brackets in sents: - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.entity(tokens) - nlp.parser(tokens) - else: - tokens = nlp(raw_text, merge_mwes=False) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=verbose) - return scorer - - -def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): - nlp = Language(data_dir=model_dir) - if beam_width is not None: - nlp.parser.cfg.beam_width = beam_width - gold_tuples = read_json_file(dev_loc) - scorer = Scorer() - out_file = 
codecs.open(out_loc, 'w', 'utf8') - for raw_text, sents in gold_tuples: - sents = _merge_sents(sents) - for annot_tuples, brackets in sents: - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.entity(tokens) - nlp.parser(tokens) - else: - tokens = nlp(raw_text, merge_mwes=False) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=False) - for t in tokens: - out_file.write( - '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) - ) - return scorer - - -@plac.annotations( - train_loc=("Location of training file or directory"), - dev_loc=("Location of development file or directory"), - model_dir=("Location of output model directory",), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - corruption_level=("Amount of noise to add to training data", "option", "c", float), - gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), - out_loc=("Out location", "option", "o", str), - n_sents=("Number of training sentences", "option", "n", int), - n_iter=("Number of training iterations", "option", "i", int), - verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", "d", bool), -) -def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False): - if not eval_only: - gold_train = list(read_json_file(train_loc)) - train(English, gold_train, model_dir, - feat_set='basic' if not debug else 'debug', - gold_preproc=gold_preproc, n_sents=n_sents, - corruption_level=corruption_level, n_iter=n_iter, - verbose=verbose) - #if out_loc: - # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) - scorer = evaluate(English, list(read_json_file(dev_loc)), - model_dir, gold_preproc=gold_preproc, verbose=verbose) - print('TOK', scorer.token_acc) - print('POS', scorer.tags_acc) - print('UAS', scorer.uas) - print('LAS', scorer.las) - - print('NER P', scorer.ents_p) - print('NER R', scorer.ents_r) - print('NER F', scorer.ents_f) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/tagger/train_german_tagger.py b/bin/tagger/train_german_tagger.py deleted file mode 100644 index 4927a6e9a..000000000 --- a/bin/tagger/train_german_tagger.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals - -import os -from os import path -import shutil -import io -import random -import time -import gzip -import ujson - -import plac -import cProfile -import pstats - -import spacy.util -from spacy.de import German -from spacy.gold import GoldParse -from spacy.tagger import Tagger -from spacy.scorer import PRFScore - -from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags -from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags -from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags -from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags -from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS - - -def default_templates(): - return spacy.tagger.Tagger.default_templates() - -def default_templates_without_clusters(): - return ( - (W_orth,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_orth,), - (N2_orth,), - - 
(W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - (P1_pos, W_orth), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - - (W_flags,), - (N1_flags,), - (N2_flags,), - (P1_flags,), - (P2_flags,), - ) - - -def make_tagger(vocab, templates): - model = spacy.tagger.TaggerModel(templates) - return spacy.tagger.Tagger(vocab,model) - - -def read_conll(file_): - def sentences(): - words, tags = [], [] - for line in file_: - line = line.strip() - if line: - word, tag = line.split('\t')[1::3][:2] # get column 1 and 4 (CoNLL09) - words.append(word) - tags.append(tag) - elif words: - yield words, tags - words, tags = [], [] - if words: - yield words, tags - return [ s for s in sentences() ] - - -def score_model(score, nlp, words, gold_tags): - tokens = nlp.tokenizer.tokens_from_list(words) - assert(len(tokens) == len(gold_tags)) - nlp.tagger(tokens) - - for token, gold_tag in zip(tokens,gold_tags): - score.score_set(set([token.tag_]),set([gold_tag])) - - -def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21): - # make shuffling deterministic - random.seed(seed) - - # set up directory for model - pos_model_dir = path.join(model_dir, 'pos') - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) - os.mkdir(pos_model_dir) - - nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) - nlp.tagger = make_tagger(nlp.vocab,default_templates()) - - print("Itn.\ttrain acc %\tdev acc %") - for itn in range(n_iter): - # train on train set - #train_acc = PRFScore() - correct, total = 0., 0. - for words, gold_tags in train_sents: - tokens = nlp.tokenizer.tokens_from_list(words) - correct += nlp.tagger.train(tokens, gold_tags) - total += len(words) - train_acc = correct/total - - # test on dev set - dev_acc = PRFScore() - for words, gold_tags in dev_sents: - score_model(dev_acc, nlp, words, gold_tags) - - random.shuffle(train_sents) - print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision)) - - - print('end training') - nlp.end_training(model_dir) - print('done') - - -@plac.annotations( - train_loc=("Location of CoNLL 09 formatted training file"), - dev_loc=("Location of CoNLL 09 formatted development file"), - model_dir=("Location of output model directory"), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - n_iter=("Number of training iterations", "option", "i", int), -) -def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15): - # training - if not eval_only: - with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \ - io.open(dev_loc, 'r', encoding='utf8') as devfile_: - train_sents = read_conll(trainfile_) - dev_sents = read_conll(devfile_) - train(German, train_sents, dev_sents, model_dir, n_iter=n_iter) - - # testing - with io.open(dev_loc, 'r', encoding='utf8') as file_: - dev_sents = read_conll(file_) - nlp = German(data_dir=model_dir) - - dev_acc = PRFScore() - for words, gold_tags in dev_sents: - score_model(dev_acc, nlp, words, gold_tags) - - print('POS: %6.2f %%' % (100*dev_acc.precision)) - - -if __name__ == '__main__': - plac.call(main) diff --git a/examples/README.md b/examples/README.md index d7168f613..18a1760ec 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,20 +2,18 @@ # spaCy examples -The examples are Python scripts with well-behaved command line interfaces. For a full list of spaCy tutorials and code snippets, see the [documentation](https://spacy.io/docs/usage/tutorials). +The examples are Python scripts with well-behaved command line interfaces. 
For +more detailed usage guides, see the [documentation](https://alpha.spacy.io/usage/). -## How to run an example - -For example, to run the [`nn_text_class.py`](nn_text_class.py) script, do: +To see the available arguments, you can use the `--help` or `-h` flag: ```bash -$ python examples/nn_text_class.py -usage: nn_text_class.py [-h] [-d 3] [-H 300] [-i 5] [-w 40000] [-b 24] - [-r 0.3] [-p 1e-05] [-e 0.005] - data_dir -nn_text_class.py: error: too few arguments +$ python examples/training/train_ner.py --help ``` -You can print detailed help with the `-h` argument. - -While we try to keep the examples up to date, they are not currently exercised by the test suite, as some of them require significant data downloads or take time to train. If you find that an example is no longer running, [please tell us](https://github.com/explosion/spaCy/issues)! We know there's nothing worse than trying to figure out what you're doing wrong, and it turns out your code was never the problem. +While we try to keep the examples up to date, they are not currently exercised +by the test suite, as some of them require significant data downloads or take +time to train. If you find that an example is no longer running, +[please tell us](https://github.com/explosion/spaCy/issues)! We know there's +nothing worse than trying to figure out what you're doing wrong, and it turns +out your code was never the problem. diff --git a/examples/_handler.py b/examples/_handler.py deleted file mode 100644 index cebfe8968..000000000 --- a/examples/_handler.py +++ /dev/null @@ -1,37 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - -from math import sqrt -from numpy import dot -from numpy.linalg import norm - - -def handle_tweet(spacy, tweet_data, query): - text = tweet_data.get('text', u'') - # Twython returns either bytes or unicode, depending on tweet. 
- # ಠ_ಠ #APIshaming - try: - match_tweet(spacy, text, query) - except TypeError: - match_tweet(spacy, text.decode('utf8'), query) - - -def match_tweet(spacy, text, query): - def get_vector(word): - return spacy.vocab[word].repvec - - tweet = spacy(text) - tweet = [w.repvec for w in tweet if w.is_alpha and w.lower_ != query] - if tweet: - accept = map(get_vector, 'child classroom teach'.split()) - reject = map(get_vector, 'mouth hands giveaway'.split()) - - y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept) - n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject) - - if (y / (y + n)) >= 0.5 or True: - print(text) - - -def cos(v1, v2): - return dot(v1, v2) / (norm(v1) * norm(v2)) diff --git a/examples/chainer_sentiment.py b/examples/chainer_sentiment.py deleted file mode 100644 index 747ef508a..000000000 --- a/examples/chainer_sentiment.py +++ /dev/null @@ -1,322 +0,0 @@ -'''WIP --- Doesn't work well yet''' -import plac -import random -import six - -import cProfile -import pstats - -import pathlib -import cPickle as pickle -from itertools import izip - -import spacy - -import cytoolz -import cupy as xp -import cupy.cuda -import chainer.cuda - -import chainer.links as L -import chainer.functions as F -from chainer import Chain, Variable, report -import chainer.training -import chainer.optimizers -from chainer.training import extensions -from chainer.iterators import SerialIterator -from chainer.datasets import TupleDataset - - -class SentimentAnalyser(object): - @classmethod - def load(cls, path, nlp, max_length=100): - raise NotImplementedError - #with (path / 'config.json').open() as file_: - # model = model_from_json(file_.read()) - #with (path / 'model').open('rb') as file_: - # lstm_weights = pickle.load(file_) - #embeddings = get_embeddings(nlp.vocab) - #model.set_weights([embeddings] + lstm_weights) - #return cls(model, max_length=max_length) - - def __init__(self, model, max_length=100): - self._model = model - self.max_length = max_length - - def __call__(self, doc): - X = get_features([doc], self.max_length) - y = self._model.predict(X) - self.set_sentiment(doc, y) - - def pipe(self, docs, batch_size=1000, n_threads=2): - for minibatch in cytoolz.partition_all(batch_size, docs): - minibatch = list(minibatch) - sentences = [] - for doc in minibatch: - sentences.extend(doc.sents) - Xs = get_features(sentences, self.max_length) - ys = self._model.predict(Xs) - for sent, label in zip(sentences, ys): - sent.doc.sentiment += label - 0.5 - for doc in minibatch: - yield doc - - def set_sentiment(self, doc, y): - doc.sentiment = float(y[0]) - # Sentiment has a native slot for a single float. 
- # For arbitrary data storage, there's: - # doc.user_data['my_data'] = y - - -class Classifier(Chain): - def __init__(self, predictor): - super(Classifier, self).__init__(predictor=predictor) - - def __call__(self, x, t): - y = self.predictor(x) - loss = F.softmax_cross_entropy(y, t) - accuracy = F.accuracy(y, t) - report({'loss': loss, 'accuracy': accuracy}, self) - return loss - - -class SentimentModel(Chain): - def __init__(self, nlp, shape, **settings): - Chain.__init__(self, - embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'], - set_vectors=lambda arr: set_vectors(arr, nlp.vocab)), - encode=_Encode(shape['nr_hidden'], shape['nr_hidden']), - attend=_Attend(shape['nr_hidden'], shape['nr_hidden']), - predict=_Predict(shape['nr_hidden'], shape['nr_class'])) - self.to_gpu(0) - - def __call__(self, sentence): - return self.predict( - self.attend( - self.encode( - self.embed(sentence)))) - - -class _Embed(Chain): - def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None): - Chain.__init__(self, - embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors), - project=L.Linear(None, nr_out, nobias=True)) - self.embed.W.volatile = False - - def __call__(self, sentence): - return [self.project(self.embed(ts)) for ts in F.transpose(sentence)] - - -class _Encode(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - fwd=L.LSTM(nr_in, nr_out), - bwd=L.LSTM(nr_in, nr_out), - mix=L.Bilinear(nr_out, nr_out, nr_out)) - - def __call__(self, sentence): - self.fwd.reset_state() - fwds = map(self.fwd, sentence) - self.bwd.reset_state() - bwds = reversed(map(self.bwd, reversed(sentence))) - return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)] - - -class _Attend(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self) - - def __call__(self, sentence): - sent = sum(sentence) - return sent - - -class _Predict(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - l1=L.Linear(nr_in, nr_in), - l2=L.Linear(nr_in, nr_out)) - - def __call__(self, vector): - vector = self.l1(vector) - vector = F.elu(vector) - vector = self.l2(vector) - return vector - - -class SentenceDataset(TupleDataset): - def __init__(self, nlp, texts, labels, max_length): - self.max_length = max_length - sents, labels = self._get_labelled_sentences( - nlp.pipe(texts, batch_size=5000, n_threads=3), - labels) - TupleDataset.__init__(self, - get_features(sents, max_length), - labels) - - def __getitem__(self, index): - batches = [dataset[index] for dataset in self._datasets] - if isinstance(index, slice): - length = len(batches[0]) - returns = [tuple([batch[i] for batch in batches]) - for i in six.moves.range(length)] - return returns - else: - return tuple(batches) - - def _get_labelled_sentences(self, docs, doc_labels): - labels = [] - sentences = [] - for doc, y in izip(docs, doc_labels): - for sent in doc.sents: - sentences.append(sent) - labels.append(y) - return sentences, xp.asarray(labels, dtype='i') - - -class DocDataset(TupleDataset): - def __init__(self, nlp, texts, labels): - self.max_length = max_length - DatasetMixin.__init__(self, - get_features( - nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length), - labels) - -def read_data(data_dir, limit=0): - examples = [] - for subdir, label in (('pos', 1), ('neg', 0)): - for filename in (data_dir / subdir).iterdir(): - with filename.open() as file_: - text = file_.read() - examples.append((text, label)) - random.shuffle(examples) - if limit >= 1: - examples = examples[:limit] - return zip(*examples) # Unzips 
into two lists - - -def get_features(docs, max_length): - docs = list(docs) - Xs = xp.zeros((len(docs), max_length), dtype='i') - for i, doc in enumerate(docs): - j = 0 - for token in doc: - if token.has_vector and not token.is_punct and not token.is_space: - Xs[i, j] = token.norm - j += 1 - if j >= max_length: - break - return Xs - - -def set_vectors(vectors, vocab): - for lex in vocab: - if lex.has_vector and (lex.rank+1) < vectors.shape[0]: - lex.norm = lex.rank+1 - vectors[lex.rank + 1] = lex.vector - else: - lex.norm = 0 - return vectors - - -def train(train_texts, train_labels, dev_texts, dev_labels, - lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5, - by_sentence=True): - nlp = spacy.load('en', entity=False) - if 'nr_vector' not in lstm_shape: - lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector) - if 'nr_dim' not in lstm_shape: - lstm_shape['nr_dim'] = nlp.vocab.vectors_length - print("Make model") - model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings)) - print("Parsing texts...") - if by_sentence: - train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length']) - dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length']) - else: - train_data = DocDataset(nlp, train_texts, train_labels) - dev_data = DocDataset(nlp, dev_texts, dev_labels) - train_iter = SerialIterator(train_data, batch_size=batch_size, - shuffle=True, repeat=True) - dev_iter = SerialIterator(dev_data, batch_size=batch_size, - shuffle=False, repeat=False) - optimizer = chainer.optimizers.Adam() - optimizer.setup(model) - updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0) - trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result') - - trainer.extend(extensions.Evaluator(dev_iter, model, device=0)) - trainer.extend(extensions.LogReport()) - trainer.extend(extensions.PrintReport([ - 'epoch', 'main/accuracy', 'validation/main/accuracy'])) - trainer.extend(extensions.ProgressBar()) - - trainer.run() - - -def evaluate(model_dir, texts, labels, max_length=100): - def create_pipeline(nlp): - ''' - This could be a lambda, but named functions are easier to read in Python. 
- ''' - return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp, - max_length=max_length)] - - nlp = spacy.load('en') - nlp.pipeline = create_pipeline(nlp) - - correct = 0 - i = 0 - for doc in nlp.pipe(texts, batch_size=1000, n_threads=4): - correct += bool(doc.sentiment >= 0.5) == bool(labels[i]) - i += 1 - return float(correct) / i - - -@plac.annotations( - train_dir=("Location of training file or directory"), - dev_dir=("Location of development file or directory"), - model_dir=("Location of output model directory",), - is_runtime=("Demonstrate run-time usage", "flag", "r", bool), - nr_hidden=("Number of hidden units", "option", "H", int), - max_length=("Maximum sentence length", "option", "L", int), - dropout=("Dropout", "option", "d", float), - learn_rate=("Learn rate", "option", "e", float), - nb_epoch=("Number of training epochs", "option", "i", int), - batch_size=("Size of minibatches for training LSTM", "option", "b", int), - nr_examples=("Limit to N examples", "option", "n", int) -) -def main(model_dir, train_dir, dev_dir, - is_runtime=False, - nr_hidden=64, max_length=100, # Shape - dropout=0.5, learn_rate=0.001, # General NN config - nb_epoch=5, batch_size=32, nr_examples=-1): # Training params - model_dir = pathlib.Path(model_dir) - train_dir = pathlib.Path(train_dir) - dev_dir = pathlib.Path(dev_dir) - if is_runtime: - dev_texts, dev_labels = read_data(dev_dir) - acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length) - print(acc) - else: - print("Read data") - train_texts, train_labels = read_data(train_dir, limit=nr_examples) - dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples) - print("Using GPU 0") - #chainer.cuda.get_device(0).use() - train_labels = xp.asarray(train_labels, dtype='i') - dev_labels = xp.asarray(dev_labels, dtype='i') - lstm = train(train_texts, train_labels, dev_texts, dev_labels, - {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2, - 'nr_vector': 5000}, - {'dropout': 0.5, 'lr': learn_rate}, - {}, - nb_epoch=nb_epoch, batch_size=batch_size) - - -if __name__ == '__main__': - #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - #s = pstats.Stats("Profile.prof") - #s.strip_dirs().sort_stats("time").print_stats() - plac.call(main) diff --git a/examples/get_parse_subregions.py b/examples/get_parse_subregions.py deleted file mode 100644 index 5eb4f2c77..000000000 --- a/examples/get_parse_subregions.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Issue #252 - -Question: - -In the documents and tutorials the main thing I haven't found is examples on how to break sentences down into small sub thoughts/chunks. The noun_chunks is handy, but having examples on using the token.head to find small (near-complete) sentence chunks would be neat. - -Lets take the example sentence on https://displacy.spacy.io/displacy/index.html - -displaCy uses CSS and JavaScript to show you how computers understand language -This sentence has two main parts (XCOMP & CCOMP) according to the breakdown: - -[displaCy] uses CSS and Javascript [to + show] -& -show you how computers understand [language] -I'm assuming that we can use the token.head to build these groups. In one of your examples you had the following function. 
- -def dependency_labels_to_root(token): - '''Walk up the syntactic tree, collecting the arc labels.''' - dep_labels = [] - while token.head is not token: - dep_labels.append(token.dep) - token = token.head - return dep_labels -""" -from __future__ import print_function, unicode_literals - -# Answer: -# The easiest way is to find the head of the subtree you want, and then use the -# `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` is the -# one that does what you're asking for most directly: - -from spacy.en import English -nlp = English() - -doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language') -for word in doc: - if word.dep_ in ('xcomp', 'ccomp'): - print(''.join(w.text_with_ws for w in word.subtree)) - -# It'd probably be better for `word.subtree` to return a `Span` object instead -# of a generator over the tokens. If you want the `Span` you can get it via the -# `.right_edge` and `.left_edge` properties. The `Span` object is nice because -# you can easily get a vector, merge it, etc. - -doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language') -for word in doc: - if word.dep_ in ('xcomp', 'ccomp'): - subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] - print(subtree_span.text, '|', subtree_span.root.text) - print(subtree_span.similarity(doc)) - print(subtree_span.similarity(subtree_span.root)) - - -# You might also want to select a head, and then select a start and end position by -# walking along its children. You could then take the `.left_edge` and `.right_edge` -# of those tokens, and use it to calculate a span. - - - diff --git a/examples/information_extraction.py b/examples/information_extraction.py deleted file mode 100644 index 19e93b499..000000000 --- a/examples/information_extraction.py +++ /dev/null @@ -1,59 +0,0 @@ -import plac - -from spacy.en import English -from spacy.parts_of_speech import NOUN -from spacy.parts_of_speech import ADP as PREP - - -def _span_to_tuple(span): - start = span[0].idx - end = span[-1].idx + len(span[-1]) - tag = span.root.tag_ - text = span.text - label = span.label_ - return (start, end, tag, text, label) - -def merge_spans(spans, doc): - # This is a bit awkward atm. What we're doing here is merging the entities, - # so that each only takes up a single token. But an entity is a Span, and - # each Span is a view into the doc. When we merge a span, we invalidate - # the other spans. This will get fixed --- but for now the solution - # is to gather the information first, before merging. 
- tuples = [_span_to_tuple(span) for span in spans] - for span_tuple in tuples: - doc.merge(*span_tuple) - - -def extract_currency_relations(doc): - merge_spans(doc.ents, doc) - merge_spans(doc.noun_chunks, doc) - - relations = [] - for money in filter(lambda w: w.ent_type_ == 'MONEY', doc): - if money.dep_ in ('attr', 'dobj'): - subject = [w for w in money.head.lefts if w.dep_ == 'nsubj'] - if subject: - subject = subject[0] - relations.append((subject, money)) - elif money.dep_ == 'pobj' and money.head.dep_ == 'prep': - relations.append((money.head.head, money)) - - return relations - - -def main(): - nlp = English() - texts = [ - u'Net income was $9.4 million compared to the prior year of $2.7 million.', - u'Revenue exceeded twelve billion dollars, with a loss of $1b.', - ] - - for text in texts: - doc = nlp(text) - relations = extract_currency_relations(doc) - for r1, r2 in relations: - print(r1.text, r2.ent_type_, r2.text) - - -if __name__ == '__main__': - plac.call(main) diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py new file mode 100644 index 000000000..b73dcbf3b --- /dev/null +++ b/examples/information_extraction/entity_relations.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# coding: utf8 +""" +A simple example of extracting relations between phrases and entities using +spaCy's named entity recognizer and the dependency parse. Here, we extract +money and currency values (entities labelled as MONEY) and then check the +dependency tree to find the noun phrase they are referring to – for example: +$9.4 million --> Net income. + +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function + +import plac +import spacy + + +TEXTS = [ + 'Net income was $9.4 million compared to the prior year of $2.7 million.', + 'Revenue exceeded twelve billion dollars, with a loss of $1b.', +] + + +@plac.annotations( + model=("Model to load (needs parser and NER)", "positional", None, str)) +def main(model='en_core_web_sm'): + nlp = spacy.load(model) + print("Loaded model '%s'" % model) + print("Processing %d texts" % len(TEXTS)) + + for text in TEXTS: + doc = nlp(text) + relations = extract_currency_relations(doc) + for r1, r2 in relations: + print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text)) + + +def extract_currency_relations(doc): + # merge entities and noun chunks into one token + for span in [*list(doc.ents), *list(doc.noun_chunks)]: + span.merge() + + relations = [] + for money in filter(lambda w: w.ent_type_ == 'MONEY', doc): + if money.dep_ in ('attr', 'dobj'): + subject = [w for w in money.head.lefts if w.dep_ == 'nsubj'] + if subject: + subject = subject[0] + relations.append((subject, money)) + elif money.dep_ == 'pobj' and money.head.dep_ == 'prep': + relations.append((money.head.head, money)) + return relations + + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # Net income MONEY $9.4 million + # the prior year MONEY $2.7 million + # Revenue MONEY twelve billion dollars + # a loss MONEY 1b diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py new file mode 100644 index 000000000..5963d014c --- /dev/null +++ b/examples/information_extraction/parse_subtrees.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# coding: utf8 +""" +This example shows how to navigate the parse tree including subtrees attached +to a word. 
+ +Based on issue #252: +"In the documents and tutorials the main thing I haven't found is +examples on how to break sentences down into small sub thoughts/chunks. The +noun_chunks is handy, but having examples on using the token.head to find small +(near-complete) sentence chunks would be neat. Lets take the example sentence: +"displaCy uses CSS and JavaScript to show you how computers understand language" + +This sentence has two main parts (XCOMP & CCOMP) according to the breakdown: +[displaCy] uses CSS and Javascript [to + show] +show you how computers understand [language] + +I'm assuming that we can use the token.head to build these groups." + +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function + +import plac +import spacy + + +@plac.annotations( + model=("Model to load", "positional", None, str)) +def main(model='en_core_web_sm'): + nlp = spacy.load(model) + print("Loaded model '%s'" % model) + + doc = nlp("displaCy uses CSS and JavaScript to show you how computers " + "understand language") + + # The easiest way is to find the head of the subtree you want, and then use + # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` + # is the one that does what you're asking for most directly: + for word in doc: + if word.dep_ in ('xcomp', 'ccomp'): + print(''.join(w.text_with_ws for w in word.subtree)) + + # It'd probably be better for `word.subtree` to return a `Span` object + # instead of a generator over the tokens. If you want the `Span` you can + # get it via the `.right_edge` and `.left_edge` properties. The `Span` + # object is nice because you can easily get a vector, merge it, etc. + for word in doc: + if word.dep_ in ('xcomp', 'ccomp'): + subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] + print(subtree_span.text, '|', subtree_span.root.text) + + # You might also want to select a head, and then select a start and end + # position by walking along its children. You could then take the + # `.left_edge` and `.right_edge` of those tokens, and use it to calculate + # a span. + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # to show you how computers understand language + # how computers understand language + # to show you how computers understand language | show + # how computers understand language | understand diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py new file mode 100644 index 000000000..2dd2691b9 --- /dev/null +++ b/examples/information_extraction/phrase_matcher.py @@ -0,0 +1,104 @@ +"""Match a large set of multi-word expressions in O(1) time. + +The idea is to associate each word in the vocabulary with a tag, noting whether +they begin, end, or are inside at least one pattern. An additional tag is used +for single-word patterns. Complete patterns are also stored in a hash set. + +When we process a document, we look up the words in the vocabulary, to +associate the words with the tags. We then search for tag-sequences that +correspond to valid candidates. Finally, we look up the candidates in the hash +set. + +For instance, to search for the phrases "Barack Hussein Obama" and "Hilary +Clinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with +the I tag, and Obama and Clinton with the L tag. + +The document "Barack Clinton and Hilary Clinton" would have the tag sequence +[{B}, {L}, {}, {B}, {L}], so we'd get two matches. 
However, only the second +candidate is in the phrase dictionary, so only one is returned as a match. + +The algorithm is O(n) at run-time for document of length n because we're only +ever matching over the tag patterns. So no matter how many phrases we're +looking for, our pattern set stays very small (exact size depends on the +maximum length we're looking for, as the query language currently has no +quantifiers). + +The example expects a .bz2 file from the Reddit corpus, and a patterns file, +formatted in jsonl as a sequence of entries like this: + +{"text":"Anchorage"} +{"text":"Angola"} +{"text":"Ann Arbor"} +{"text":"Annapolis"} +{"text":"Appalachia"} +{"text":"Argentina"} +""" +from __future__ import print_function, unicode_literals, division + +from bz2 import BZ2File +import time +import plac +import ujson + +from spacy.matcher import PhraseMatcher +import spacy + + +@plac.annotations( + patterns_loc=("Path to gazetteer", "positional", None, str), + text_loc=("Path to Reddit corpus file", "positional", None, str), + n=("Number of texts to read", "option", "n", int), + lang=("Language class to initialise", "option", "l", str)) +def main(patterns_loc, text_loc, n=10000, lang='en'): + nlp = spacy.blank('en') + nlp.vocab.lex_attr_getters = {} + phrases = read_gazetteer(nlp.tokenizer, patterns_loc) + count = 0 + t1 = time.time() + for ent_id, text in get_matches(nlp.tokenizer, phrases, + read_text(text_loc, n=n)): + count += 1 + t2 = time.time() + print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count)) + + +def read_gazetteer(tokenizer, loc, n=-1): + for i, line in enumerate(open(loc)): + data = ujson.loads(line.strip()) + phrase = tokenizer(data['text']) + for w in phrase: + _ = tokenizer.vocab[w.text] + if len(phrase) >= 2: + yield phrase + + +def read_text(bz2_loc, n=10000): + with BZ2File(bz2_loc) as file_: + for i, line in enumerate(file_): + data = ujson.loads(line) + yield data['body'] + if i >= n: + break + + +def get_matches(tokenizer, phrases, texts, max_length=6): + matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length) + matcher.add('Phrase', None, *phrases) + for text in texts: + doc = tokenizer(text) + for w in doc: + _ = doc.vocab[w.text] + matches = matcher(doc) + for ent_id, start, end in matches: + yield (ent_id, doc[start:end].text) + + +if __name__ == '__main__': + if False: + import cProfile + import pstats + cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") + s = pstats.Stats("Profile.prof") + s.strip_dirs().sort_stats("time").print_stats() + else: + plac.call(main) diff --git a/examples/inventory_count/Instructions.md b/examples/inventory_count/Instructions.md deleted file mode 100644 index 456f5d4fe..000000000 --- a/examples/inventory_count/Instructions.md +++ /dev/null @@ -1,5 +0,0 @@ -An example of inventory counting using SpaCy.io NLP library. Meant to show how to instantiate Spacy's English class, and allow reusability by reloading the main module. - -In the future, a better implementation of this library would be to apply machine learning to each query and learn what to classify as the quantitative statement (55 kgs OF), vs the actual item of count (how likely is a preposition object to be the item of count if x,y,z qualifications appear in the statement). 
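The split that note describes, the quantitative statement ("65 kgs of") versus the actual item of count ("carrots"), can be read directly off the dependency parse. A rough sketch, assuming a spaCy v2 install and the `en_core_web_sm` model:

```python
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'i got 65 kgs of carrots')

for token in doc:
    if token.dep_ == 'nummod':                 # the number, e.g. "65"
        amount = token.text
        unit = token.head.text                 # the noun it modifies, e.g. "kgs"
        # an attached "of"-phrase usually carries the item of count, e.g. "carrots"
        items = [child.text
                 for prep in token.head.rights if prep.dep_ == 'prep'
                 for child in prep.children if child.dep_ == 'pobj']
        print(amount, unit, items)             # e.g. 65 kgs ['carrots'], depending on the parse
```

The deleted inventoryCount.py below takes the same approach, with extra bookkeeping around its Inventory class.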
- - diff --git a/examples/inventory_count/inventory.py b/examples/inventory_count/inventory.py deleted file mode 100644 index abc031513..000000000 --- a/examples/inventory_count/inventory.py +++ /dev/null @@ -1,35 +0,0 @@ -class Inventory: - """ - Inventory class - a struct{} like feature to house inventory counts - across modules. - """ - originalQuery = None - item = "" - unit = "" - amount = "" - - def __init__(self, statement): - """ - Constructor - only takes in the original query/statement - :return: new Inventory object - """ - - self.originalQuery = statement - pass - - def __str__(self): - return str(self.amount) + ' ' + str(self.unit) + ' ' + str(self.item) - - def printInfo(self): - print '-------------Inventory Count------------' - print "Original Query: " + str(self.originalQuery) - print 'Amount: ' + str(self.amount) - print 'Unit: ' + str(self.unit) - print 'Item: ' + str(self.item) - print '----------------------------------------' - - def isValid(self): - if not self.item or not self.unit or not self.amount or not self.originalQuery: - return False - else: - return True diff --git a/examples/inventory_count/inventoryCount.py b/examples/inventory_count/inventoryCount.py deleted file mode 100644 index b1b7b43c8..000000000 --- a/examples/inventory_count/inventoryCount.py +++ /dev/null @@ -1,92 +0,0 @@ -from inventory import Inventory - - -def runTest(nlp): - testset = [] - testset += [nlp(u'6 lobster cakes')] - testset += [nlp(u'6 avacados')] - testset += [nlp(u'fifty five carrots')] - testset += [nlp(u'i have 55 carrots')] - testset += [nlp(u'i got me some 9 cabbages')] - testset += [nlp(u'i got 65 kgs of carrots')] - - result = [] - for doc in testset: - c = decodeInventoryEntry_level1(doc) - if not c.isValid(): - c = decodeInventoryEntry_level2(doc) - result.append(c) - - for i in result: - i.printInfo() - - -def decodeInventoryEntry_level1(document): - """ - Decodes a basic entry such as: '6 lobster cake' or '6' cakes - @param document : NLP Doc object - :return: Status if decoded correctly (true, false), and Inventory object - """ - count = Inventory(str(document)) - for token in document: - if token.pos_ == (u'NOUN' or u'NNS' or u'NN'): - item = str(token) - - for child in token.children: - if child.dep_ == u'compound' or child.dep_ == u'ad': - item = str(child) + str(item) - elif child.dep_ == u'nummod': - count.amount = str(child).strip() - for numerical_child in child.children: - # this isn't arithmetic rather than treating it such as a string - count.amount = str(numerical_child) + str(count.amount).strip() - else: - print "WARNING: unknown child: " + str(child) + ':'+str(child.dep_) - - count.item = item - count.unit = item - - return count - - -def decodeInventoryEntry_level2(document): - """ - Entry level 2, a more complicated parsing scheme that covers examples such as - 'i have 80 boxes of freshly baked pies' - - @document @param document : NLP Doc object - :return: Status if decoded correctly (true, false), and Inventory object- - """ - - count = Inventory(str(document)) - - for token in document: - # Look for a preposition object that is a noun (this is the item we are counting). 
- # If found, look at its' dependency (if a preposition that is not indicative of - # inventory location, the dependency of the preposition must be a noun - - if token.dep_ == (u'pobj' or u'meta') and token.pos_ == (u'NOUN' or u'NNS' or u'NN'): - item = '' - - # Go through all the token's children, these are possible adjectives and other add-ons - # this deals with cases such as 'hollow rounded waffle pancakes" - for i in token.children: - item += ' ' + str(i) - - item += ' ' + str(token) - count.item = item - - # Get the head of the item: - if token.head.dep_ != u'prep': - # Break out of the loop, this is a confusing entry - break - else: - amountUnit = token.head.head - count.unit = str(amountUnit) - - for inner in amountUnit.children: - if inner.pos_ == u'NUM': - count.amount += str(inner) - return count - - diff --git a/examples/inventory_count/main.py b/examples/inventory_count/main.py deleted file mode 100644 index cbc9e25c3..000000000 --- a/examples/inventory_count/main.py +++ /dev/null @@ -1,30 +0,0 @@ -import inventoryCount as mainModule -import os -from spacy.en import English - -if __name__ == '__main__': - """ - Main module for this example - loads the English main NLP class, - and keeps it in RAM while waiting for the user to re-run it. Allows the - developer to re-edit their module under testing without having - to wait as long to load the English class - """ - - # Set the NLP object here for the parameters you want to see, - # or just leave it blank and get all the opts - print "Loading English module... this will take a while." - nlp = English() - print "Done loading English module." - while True: - try: - reload(mainModule) - mainModule.runTest(nlp) - raw_input('================ To reload main module, press Enter ================') - - - except Exception, e: - print "Unexpected error: " + str(e) - continue - - - diff --git a/examples/matcher_example.py b/examples/matcher_example.py deleted file mode 100644 index 041b98a9a..000000000 --- a/examples/matcher_example.py +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import unicode_literals, print_function - -import spacy.en -import spacy.matcher -from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63 - -import plac - - -def main(): - nlp = spacy.en.English() - example = u"I prefer Siri to Google Now. I'll google now to find out how the google now service works." - before = nlp(example) - print("Before") - for ent in before.ents: - print(ent.text, ent.label_, [w.tag_ for w in ent]) - # Output: - # Google ORG [u'NNP'] - # google ORG [u'VB'] - # google ORG [u'NNP'] - nlp.matcher.add( - "GoogleNow", # Entity ID: Not really used at the moment. - "PRODUCT", # Entity type: should be one of the types in the NER data - {"wiki_en": "Google_Now"}, # Arbitrary attributes. Currently unused. - [ # List of patterns that can be Surface Forms of the entity - - # This Surface Form matches "Google Now", verbatim - [ # Each Surface Form is a list of Token Specifiers. - { # This Token Specifier matches tokens whose orth field is "Google" - ORTH: "Google" - }, - { # This Token Specifier matches tokens whose orth field is "Now" - ORTH: "Now" - } - ], - [ # This Surface Form matches "google now", verbatim, and requires - # "google" to have the NNP tag. 
This helps prevent the pattern from - # matching cases like "I will google now to look up the time" - { - ORTH: "google", - TAG: "NNP" - }, - { - ORTH: "now" - } - ] - ] - ) - after = nlp(example) - print("After") - for ent in after.ents: - print(ent.text, ent.label_, [w.tag_ for w in ent]) - # Output - # Google Now PRODUCT [u'NNP', u'RB'] - # google ORG [u'VB'] - # google now PRODUCT [u'NNP', u'RB'] - # - # You can customize attribute values in the lexicon, and then refer to the - # new attributes in your Token Specifiers. - # This is particularly good for word-set membership. - # - australian_capitals = ['Brisbane', 'Sydney', 'Canberra', 'Melbourne', 'Hobart', - 'Darwin', 'Adelaide', 'Perth'] - # Internally, the tokenizer immediately maps each token to a pointer to a - # LexemeC struct. These structs hold various features, e.g. the integer IDs - # of the normalized string forms. - # For our purposes, the key attribute is a 64-bit integer, used as a bit field. - # spaCy currently only uses 12 of the bits for its built-in features, so - # the others are available for use. It's best to use the higher bits, as - # future versions of spaCy may add more flags. For instance, we might add - # a built-in IS_MONTH flag, taking up FLAG13. So, we bind our user-field to - # FLAG63 here. - is_australian_capital = FLAG63 - # Now we need to set the flag value. It's False on all tokens by default, - # so we just need to set it to True for the tokens we want. - # Here we iterate over the strings, and set it on only the literal matches. - for string in australian_capitals: - lexeme = nlp.vocab[string] - lexeme.set_flag(is_australian_capital, True) - print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital)) - print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital)) - # If we want case-insensitive matching, we have to be a little bit more - # round-about, as there's no case-insensitive index to the vocabulary. So - # we have to iterate over the vocabulary. - # We'll be looking up attribute IDs in this set a lot, so it's good to pre-build it - target_ids = {nlp.vocab.strings[s.lower()] for s in australian_capitals} - for lexeme in nlp.vocab: - if lexeme.lower in target_ids: - lexeme.set_flag(is_australian_capital, True) - print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital)) - print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital)) - print('SYDNEY', nlp.vocab[u'SYDNEY'].check_flag(is_australian_capital)) - # Output - # Sydney True - # sydney False - # Sydney True - # sydney True - # SYDNEY True - # - # The key thing to note here is that we're setting these attributes once, - # over the vocabulary --- and then reusing them at run-time. This means the - # amortized complexity of anything we do this way is going to be O(1). You - # can match over expressions that need to have sets with tens of thousands - # of values, e.g. "all the street names in Germany", and you'll still have - # O(1) complexity. Most regular expression algorithms don't scale well to - # this sort of problem. - # - # Now, let's use this in a pattern - nlp.matcher.add("AuCitySportsTeam", "ORG", {}, - [ - [ - {LOWER: "the"}, - {is_australian_capital: True}, - {TAG: "NNS"} - ], - [ - {LOWER: "the"}, - {is_australian_capital: True}, - {TAG: "NNPS"} - ], - [ - {LOWER: "the"}, - {IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney - {is_australian_capital: True}, - {TAG: "NNS"} - ], - [ - {LOWER: "the"}, - {IS_ALPHA: True}, # Allow a word in between, e.g. 
The Western Sydney - {is_australian_capital: True}, - {TAG: "NNPS"} - ] - ]) - doc = nlp(u'The pattern should match the Brisbane Broncos and the South Darwin Spiders, but not the Colorado Boulders') - for ent in doc.ents: - print(ent.text, ent.label_) - # Output - # the Brisbane Broncos ORG - # the South Darwin Spiders ORG - - -# Output -# Before -# Google ORG [u'NNP'] -# google ORG [u'VB'] -# google ORG [u'NNP'] -# After -# Google Now PRODUCT [u'NNP', u'RB'] -# google ORG [u'VB'] -# google now PRODUCT [u'NNP', u'RB'] -# Sydney True -# sydney False -# Sydney True -# sydney True -# SYDNEY True -# the Brisbane Broncos ORG -# the South Darwin Spiders ORG - -if __name__ == '__main__': - main() - diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py deleted file mode 100644 index 73f48bf42..000000000 --- a/examples/multi_word_matches.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Match a large set of multi-word expressions in O(1) time. - -The idea is to associate each word in the vocabulary with a tag, noting whether -they begin, end, or are inside at least one pattern. An additional tag is used -for single-word patterns. Complete patterns are also stored in a hash set. - -When we process a document, we look up the words in the vocabulary, to associate -the words with the tags. We then search for tag-sequences that correspond to -valid candidates. Finally, we look up the candidates in the hash set. - -For instance, to search for the phrases "Barack Hussein Obama" and "Hilary Clinton", we -would associate "Barack" and "Hilary" with the B tag, Hussein with the I tag, -and Obama and Clinton with the L tag. - -The document "Barack Clinton and Hilary Clinton" would have the tag sequence -[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second candidate -is in the phrase dictionary, so only one is returned as a match. - -The algorithm is O(n) at run-time for document of length n because we're only ever -matching over the tag patterns. 
So no matter how many phrases we're looking for, -our pattern set stays very small (exact size depends on the maximum length we're -looking for, as the query language currently has no quantifiers) -""" -from __future__ import print_function, unicode_literals, division -from ast import literal_eval -from bz2 import BZ2File -import time -import math -import codecs - -import plac - -from preshed.maps import PreshMap -from preshed.counter import PreshCounter -from spacy.strings import hash_string -from spacy.en import English -from spacy.matcher import PhraseMatcher - - -def read_gazetteer(tokenizer, loc, n=-1): - for i, line in enumerate(open(loc)): - phrase = literal_eval('u' + line.strip()) - if ' (' in phrase and phrase.endswith(')'): - phrase = phrase.split(' (', 1)[0] - if i >= n: - break - phrase = tokenizer(phrase) - if all((t.is_lower and t.prob >= -10) for t in phrase): - continue - if len(phrase) >= 2: - yield phrase - - -def read_text(bz2_loc): - with BZ2File(bz2_loc) as file_: - for line in file_: - yield line.decode('utf8') - - -def get_matches(tokenizer, phrases, texts, max_length=6): - matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length) - print("Match") - for text in texts: - doc = tokenizer(text) - matches = matcher(doc) - for mwe in doc.ents: - yield mwe - - -def main(patterns_loc, text_loc, counts_loc, n=10000000): - nlp = English(parser=False, tagger=False, entity=False) - print("Make matcher") - phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n) - counts = PreshCounter() - t1 = time.time() - for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)): - counts.inc(hash_string(mwe.text), 1) - t2 = time.time() - print("10m tokens in %d s" % (t2 - t1)) - - with codecs.open(counts_loc, 'w', 'utf8') as file_: - for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n): - text = phrase.string - key = hash_string(text) - count = counts[key] - if count != 0: - file_.write('%d\t%s\n' % (count, text)) - - -if __name__ == '__main__': - if False: - import cProfile - import pstats - cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - s = pstats.Stats("Profile.prof") - s.strip_dirs().sort_stats("time").print_stats() - else: - plac.call(main) diff --git a/examples/nn_text_class.py b/examples/nn_text_class.py deleted file mode 100644 index 7b4a2fd57..000000000 --- a/examples/nn_text_class.py +++ /dev/null @@ -1,281 +0,0 @@ -"""This script expects something like a binary sentiment data set, such as - that available here: `http://www.cs.cornell.edu/people/pabo/movie-review-data/` - -It expects a directory structure like: `data_dir/train/{pos|neg}` - and `data_dir/test/{pos|neg}`. Put (say) 90% of the files in the former - and the remainder in the latter. 
-""" - -from __future__ import unicode_literals -from __future__ import print_function -from __future__ import division - -from collections import defaultdict -from pathlib import Path -import numpy -import plac - -import spacy.en - - -def read_data(nlp, data_dir): - for subdir, label in (('pos', 1), ('neg', 0)): - for filename in (data_dir / subdir).iterdir(): - text = filename.open().read() - doc = nlp(text) - if len(doc) >= 1: - yield doc, label - - -def partition(examples, split_size): - examples = list(examples) - numpy.random.shuffle(examples) - n_docs = len(examples) - split = int(n_docs * split_size) - return examples[:split], examples[split:] - - -def minibatch(data, bs=24): - for i in range(0, len(data), bs): - yield data[i:i+bs] - - -class Extractor(object): - def __init__(self, nlp, vector_length, dropout=0.3): - self.nlp = nlp - self.dropout = dropout - self.vector = numpy.zeros((vector_length, )) - - def doc2bow(self, doc, dropout=None): - if dropout is None: - dropout = self.dropout - bow = defaultdict(int) - all_words = defaultdict(int) - for word in doc: - if numpy.random.random() >= dropout and not word.is_punct: - bow[word.lower] += 1 - all_words[word.lower] += 1 - if sum(bow.values()) >= 1: - return bow - else: - return all_words - - def bow2vec(self, bow, E): - self.vector.fill(0) - n = 0 - for orth_id, freq in bow.items(): - self.vector += self.nlp.vocab[self.nlp.vocab.strings[orth_id]].vector * freq - # Apply the fine-tuning we've learned - if orth_id < E.shape[0]: - self.vector += E[orth_id] * freq - n += freq - return self.vector / n - - -class NeuralNetwork(object): - def __init__(self, depth, width, n_classes, n_vocab, extracter, optimizer): - self.depth = depth - self.width = width - self.n_classes = n_classes - self.weights = Params.random(depth, width, width, n_classes, n_vocab) - self.doc2bow = extracter.doc2bow - self.bow2vec = extracter.bow2vec - self.optimizer = optimizer - self._gradient = Params.zero(depth, width, width, n_classes, n_vocab) - self._activity = numpy.zeros((depth, width)) - - def train(self, batch): - activity = self._activity - gradient = self._gradient - activity.fill(0) - gradient.data.fill(0) - loss = 0 - word_freqs = defaultdict(int) - for doc, label in batch: - word_ids = self.doc2bow(doc) - vector = self.bow2vec(word_ids, self.weights.E) - self.forward(activity, vector) - loss += self.backprop(vector, gradient, activity, word_ids, label) - for w, freq in word_ids.items(): - word_freqs[w] += freq - self.optimizer(self.weights, gradient, len(batch), word_freqs) - return loss - - def predict(self, doc): - actv = self._activity - actv.fill(0) - W = self.weights.W - b = self.weights.b - E = self.weights.E - - vector = self.bow2vec(self.doc2bow(doc, dropout=0.0), E) - self.forward(actv, vector) - return numpy.argmax(softmax(actv[-1], W[-1], b[-1])) - - def forward(self, actv, in_): - actv.fill(0) - W = self.weights.W; b = self.weights.b - actv[0] = relu(in_, W[0], b[0]) - for i in range(1, self.depth): - actv[i] = relu(actv[i-1], W[i], b[i]) - - def backprop(self, input_vector, gradient, activity, ids, label): - W = self.weights.W - b = self.weights.b - - target = numpy.zeros(self.n_classes) - target[label] = 1.0 - pred = softmax(activity[-1], W[-1], b[-1]) - delta = pred - target - - for i in range(self.depth, 0, -1): - gradient.b[i] += delta - gradient.W[i] += numpy.outer(delta, activity[i-1]) - delta = d_relu(activity[i-1]) * W[i].T.dot(delta) - - gradient.b[0] += delta - gradient.W[0] += numpy.outer(delta, input_vector) - tuning = 
W[0].T.dot(delta).reshape((self.width,)) / len(ids) - for w, freq in ids.items(): - if w < gradient.E.shape[0]: - gradient.E[w] += tuning * freq - return -sum(target * numpy.log(pred)) - - -def softmax(actvn, W, b): - w = W.dot(actvn) + b - ew = numpy.exp(w - max(w)) - return (ew / sum(ew)).ravel() - - -def relu(actvn, W, b): - x = W.dot(actvn) + b - return x * (x > 0) - - -def d_relu(x): - return x > 0 - - -class Adagrad(object): - def __init__(self, lr, rho): - self.eps = 1e-3 - # initial learning rate - self.learning_rate = lr - self.rho = rho - # stores sum of squared gradients - #self.h = numpy.zeros(self.dim) - #self._curr_rate = numpy.zeros(self.h.shape) - self.h = None - self._curr_rate = None - - def __call__(self, weights, gradient, batch_size, word_freqs): - if self.h is None: - self.h = numpy.zeros(gradient.data.shape) - self._curr_rate = numpy.zeros(gradient.data.shape) - self.L2_penalty(gradient, weights, word_freqs) - update = self.rescale(gradient.data / batch_size) - weights.data -= update - - def rescale(self, gradient): - if self.h is None: - self.h = numpy.zeros(gradient.data.shape) - self._curr_rate = numpy.zeros(gradient.data.shape) - self._curr_rate.fill(0) - self.h += gradient ** 2 - self._curr_rate = self.learning_rate / (numpy.sqrt(self.h) + self.eps) - return self._curr_rate * gradient - - def L2_penalty(self, gradient, weights, word_freqs): - # L2 Regularization - for i in range(len(weights.W)): - gradient.W[i] += weights.W[i] * self.rho - gradient.b[i] += weights.b[i] * self.rho - for w, freq in word_freqs.items(): - if w < gradient.E.shape[0]: - gradient.E[w] += weights.E[w] * self.rho - - -class Params(object): - @classmethod - def zero(cls, depth, n_embed, n_hidden, n_labels, n_vocab): - return cls(depth, n_embed, n_hidden, n_labels, n_vocab, lambda x: numpy.zeros((x,))) - - @classmethod - def random(cls, depth, nE, nH, nL, nV): - return cls(depth, nE, nH, nL, nV, lambda x: (numpy.random.rand(x) * 2 - 1) * 0.08) - - def __init__(self, depth, n_embed, n_hidden, n_labels, n_vocab, initializer): - nE = n_embed; nH = n_hidden; nL = n_labels; nV = n_vocab - n_weights = sum([ - (nE * nH) + nH, - (nH * nH + nH) * depth, - (nH * nL) + nL, - (nV * nE) - ]) - self.data = initializer(n_weights) - self.W = [] - self.b = [] - i = self._add_layer(0, nE, nH) - for _ in range(1, depth): - i = self._add_layer(i, nH, nH) - i = self._add_layer(i, nL, nH) - self.E = self.data[i : i + (nV * nE)].reshape((nV, nE)) - self.E.fill(0) - - def _add_layer(self, start, x, y): - end = start + (x * y) - self.W.append(self.data[start : end].reshape((x, y))) - self.b.append(self.data[end : end + x].reshape((x, ))) - return end + x - - -@plac.annotations( - data_dir=("Data directory", "positional", None, Path), - n_iter=("Number of iterations (epochs)", "option", "i", int), - width=("Size of hidden layers", "option", "H", int), - depth=("Depth", "option", "d", int), - dropout=("Drop-out rate", "option", "r", float), - rho=("Regularization penalty", "option", "p", float), - eta=("Learning rate", "option", "e", float), - batch_size=("Batch size", "option", "b", int), - vocab_size=("Number of words to fine-tune", "option", "w", int), -) -def main(data_dir, depth=3, width=300, n_iter=5, vocab_size=40000, - batch_size=24, dropout=0.3, rho=1e-5, eta=0.005): - n_classes = 2 - print("Loading") - nlp = spacy.en.English(parser=False) - train_data, dev_data = partition(read_data(nlp, data_dir / 'train'), 0.8) - print("Begin training") - extracter = Extractor(nlp, width, dropout=0.3) - optimizer = 
Adagrad(eta, rho) - model = NeuralNetwork(depth, width, n_classes, vocab_size, extracter, optimizer) - prev_best = 0 - best_weights = None - for epoch in range(n_iter): - numpy.random.shuffle(train_data) - train_loss = 0.0 - for batch in minibatch(train_data, bs=batch_size): - train_loss += model.train(batch) - n_correct = sum(model.predict(x) == y for x, y in dev_data) - print(epoch, train_loss, n_correct / len(dev_data)) - if n_correct >= prev_best: - best_weights = model.weights.data.copy() - prev_best = n_correct - - model.weights.data = best_weights - print("Evaluating") - eval_data = list(read_data(nlp, data_dir / 'test')) - n_correct = sum(model.predict(x) == y for x, y in eval_data) - print(n_correct / len(eval_data)) - - - -if __name__ == '__main__': - #import cProfile - #import pstats - #cProfile.runctx("main(Path('data/aclImdb'))", globals(), locals(), "Profile.prof") - #s = pstats.Stats("Profile.prof") - #s.strip_dirs().sort_stats("time").print_stats(100) - plac.call(main) diff --git a/examples/parallel_parse.py b/examples/parallel_parse.py deleted file mode 100644 index 5cdd0778b..000000000 --- a/examples/parallel_parse.py +++ /dev/null @@ -1,74 +0,0 @@ -from __future__ import print_function, unicode_literals, division -import io -import bz2 -import logging -from toolz import partition -from os import path -import re - -import spacy.en -from spacy.tokens import Doc - -from joblib import Parallel, delayed -import plac -import ujson - - -def parallelize(func, iterator, n_jobs, extra, backend='multiprocessing'): - extra = tuple(extra) - return Parallel(n_jobs=n_jobs, backend=backend)(delayed(func)(*(item + extra)) - for item in iterator) - - -def iter_comments(loc): - with bz2.BZ2File(loc) as file_: - for i, line in enumerate(file_): - yield ujson.loads(line)['body'] - - -pre_format_re = re.compile(r'^[\`\*\~]') -post_format_re = re.compile(r'[\`\*\~]$') -url_re = re.compile(r'\[([^]]+)\]\(%%URL\)') -link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)') -def strip_meta(text): - text = link_re.sub(r'\1', text) - text = text.replace('>', '>').replace('<', '<') - text = pre_format_re.sub('', text) - text = post_format_re.sub('', text) - return text.strip() - - -def save_parses(batch_id, input_, out_dir, n_threads, batch_size): - out_loc = path.join(out_dir, '%d.bin' % batch_id) - if path.exists(out_loc): - return None - print('Batch', batch_id) - nlp = spacy.en.English() - nlp.matcher = None - with open(out_loc, 'wb') as file_: - texts = (strip_meta(text) for text in input_) - texts = (text for text in texts if text.strip()) - for doc in nlp.pipe(texts, batch_size=batch_size, n_threads=n_threads): - file_.write(doc.to_bytes()) - -@plac.annotations( - in_loc=("Location of input file"), - out_dir=("Location of input file"), - n_process=("Number of processes", "option", "p", int), - n_thread=("Number of threads per process", "option", "t", int), - batch_size=("Number of texts to accumulate in a buffer", "option", "b", int) -) -def main(in_loc, out_dir, n_process=1, n_thread=4, batch_size=100): - if not path.exists(out_dir): - path.join(out_dir) - if n_process >= 2: - texts = partition(200000, iter_comments(in_loc)) - parallelize(save_parses, enumerate(texts), n_process, [out_dir, n_thread, batch_size], - backend='multiprocessing') - else: - save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size) - - - -if __name__ == '__main__': - plac.call(main) diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py new file mode 100644 index 
000000000..18d6b482a --- /dev/null +++ b/examples/pipeline/custom_attr_methods.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# coding: utf-8 +"""This example contains several snippets of methods that can be set via custom +Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like +they're "bound" to the object and are partially applied – i.e. the object +they're called on is passed in as the first argument. + +* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components + +Developed for: spaCy 2.0.0a17 +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function + +import plac +from spacy.lang.en import English +from spacy.tokens import Doc, Span +from spacy import displacy +from pathlib import Path + + +@plac.annotations( + output_dir=("Output directory for saved HTML", "positional", None, Path)) +def main(output_dir=None): + nlp = English() # start off with blank English class + + Doc.set_extension('overlap', method=overlap_tokens) + doc1 = nlp(u"Peach emoji is where it has always been.") + doc2 = nlp(u"Peach is the superior emoji.") + print("Text 1:", doc1.text) + print("Text 2:", doc2.text) + print("Overlapping tokens:", doc1._.overlap(doc2)) + + Doc.set_extension('to_html', method=to_html) + doc = nlp(u"This is a sentence about Apple.") + # add entity manually for demo purposes, to make it work without a model + doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])] + print("Text:", doc.text) + doc._.to_html(output=output_dir, style='ent') + + +def to_html(doc, output='/tmp', style='dep'): + """Doc method extension for saving the current state as a displaCy + visualization. + """ + # generate filename from first six non-punct tokens + file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html' + html = displacy.render(doc, style=style, page=True) # render markup + if output is not None: + output_path = Path(output) + if not output_path.exists(): + output_path.mkdir() + output_file = Path(output) / file_name + output_file.open('w', encoding='utf-8').write(html) # save to file + print('Saved HTML to {}'.format(output_file)) + else: + print(html) + + +def overlap_tokens(doc, other_doc): + """Get the tokens from the original Doc that are also in the comparison Doc. + """ + overlap = [] + other_tokens = [token.text for token in other_doc] + for token in doc: + if token.text in other_tokens: + overlap.append(token) + return overlap + + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # Text 1: Peach emoji is where it has always been. + # Text 2: Peach is the superior emoji. + # Overlapping tokens: [Peach, emoji, is, .] diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py new file mode 100644 index 000000000..e7371e205 --- /dev/null +++ b/examples/pipeline/custom_component_countries_api.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# coding: utf8 +"""Example of a spaCy v2.0 pipeline component that requests all countries via +the REST Countries API, merges country names into one token, assigns entity +labels and sets attributes on country tokens, e.g. the capital and lat/lng +coordinates. Can be extended with more details from the API. 
+ +* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0) +* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components + +Developed for: spaCy 2.0.0a17 +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function + +import requests +import plac +from spacy.lang.en import English +from spacy.matcher import PhraseMatcher +from spacy.tokens import Doc, Span, Token + + +def main(): + # For simplicity, we start off with only the blank English Language class + # and no model or pre-defined pipeline loaded. + nlp = English() + rest_countries = RESTCountriesComponent(nlp) # initialise component + nlp.add_pipe(rest_countries) # add it to the pipeline + doc = nlp(u"Some text about Colombia and the Czech Republic") + print('Pipeline', nlp.pipe_names) # pipeline contains component name + print('Doc has countries', doc._.has_country) # Doc contains countries + for token in doc: + if token._.is_country: + print(token.text, token._.country_capital, token._.country_latlng, + token._.country_flag) # country data + print('Entities', [(e.text, e.label_) for e in doc.ents]) # entities + + +class RESTCountriesComponent(object): + """spaCy v2.0 pipeline component that requests all countries via + the REST Countries API, merges country names into one token, assigns entity + labels and sets attributes on country tokens. + """ + name = 'rest_countries' # component name, will show up in the pipeline + + def __init__(self, nlp, label='GPE'): + """Initialise the pipeline component. The shared nlp instance is used + to initialise the matcher with the shared vocab, get the label ID and + generate Doc objects as phrase match patterns. + """ + # Make request once on initialisation and store the data + r = requests.get('https://restcountries.eu/rest/v2/all') + r.raise_for_status() # make sure requests raises an error if it fails + countries = r.json() + + # Convert API response to dict keyed by country name for easy lookup + # This could also be extended using the alternative and foreign language + # names provided by the API + self.countries = {c['name']: c for c in countries} + self.label = nlp.vocab.strings[label] # get entity label ID + + # Set up the PhraseMatcher with Doc patterns for each country name + patterns = [nlp(c) for c in self.countries.keys()] + self.matcher = PhraseMatcher(nlp.vocab) + self.matcher.add('COUNTRIES', None, *patterns) + + # Register attribute on the Token. We'll be overwriting this based on + # the matches, so we're only setting a default value, not a getter. + # If no default value is set, it defaults to None. + Token.set_extension('is_country', default=False) + Token.set_extension('country_capital') + Token.set_extension('country_latlng') + Token.set_extension('country_flag') + + # Register attributes on Doc and Span via a getter that checks if one of + # the contained tokens is set to is_country == True. + Doc.set_extension('has_country', getter=self.has_country) + Span.set_extension('has_country', getter=self.has_country) + + + def __call__(self, doc): + """Apply the pipeline component on a Doc object and modify it if matches + are found. Return the Doc, so it can be processed by the next component + in the pipeline, if available. 
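+        Matched country names are labelled as entities, each of their tokens
+        receives the custom country attributes, and the spans are merged into
+        single tokens once all entities have been set.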
+ """ + matches = self.matcher(doc) + spans = [] # keep the spans for later so we can merge them afterwards + for _, start, end in matches: + # Generate Span representing the entity & set label + entity = Span(doc, start, end, label=self.label) + spans.append(entity) + # Set custom attribute on each token of the entity + # Can be extended with other data returned by the API, like + # currencies, country code, flag, calling code etc. + for token in entity: + token._.set('is_country', True) + token._.set('country_capital', self.countries[entity.text]['capital']) + token._.set('country_latlng', self.countries[entity.text]['latlng']) + token._.set('country_flag', self.countries[entity.text]['flag']) + # Overwrite doc.ents and add entity – be careful not to replace! + doc.ents = list(doc.ents) + [entity] + for span in spans: + # Iterate over all spans and merge them into one token. This is done + # after setting the entities – otherwise, it would cause mismatched + # indices! + span.merge() + return doc # don't forget to return the Doc! + + def has_country(self, tokens): + """Getter for Doc and Span attributes. Returns True if one of the tokens + is a country. Since the getter is only called when we access the + attribute, we can refer to the Token's 'is_country' attribute here, + which is already set in the processing step.""" + return any([t._.get('is_country') for t in tokens]) + + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # Pipeline ['rest_countries'] + # Doc has countries True + # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg + # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg + # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')] diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py new file mode 100644 index 000000000..6b78744b7 --- /dev/null +++ b/examples/pipeline/custom_component_entities.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python +# coding: utf8 +"""Example of a spaCy v2.0 pipeline component that sets entity annotations +based on list of single or multiple-word company names. Companies are +labelled as ORG and their spans are merged into one token. Additionally, +._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token +respectively. + +* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components + +Developed for: spaCy 2.0.0a17 +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function + +import plac +from spacy.lang.en import English +from spacy.matcher import PhraseMatcher +from spacy.tokens import Doc, Span, Token + + +@plac.annotations( + text=("Text to process", "positional", None, str), + companies=("Names of technology companies", "positional", None, str)) +def main(text="Alphabet Inc. is the company behind Google.", *companies): + # For simplicity, we start off with only the blank English Language class + # and no model or pre-defined pipeline loaded. + nlp = English() + if not companies: # set default companies if none are set via args + companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc. 
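+    # plac collects any extra positional arguments into `companies`, so the
+    # list can also be supplied on the command line, e.g.:
+    # python custom_component_entities.py "Some text" "Alphabet Inc." "Google"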
+ component = TechCompanyRecognizer(nlp, companies) # initialise component + nlp.add_pipe(component, last=True) # add last to the pipeline + + doc = nlp(text) + print('Pipeline', nlp.pipe_names) # pipeline contains component name + print('Tokens', [t.text for t in doc]) # company names from the list are merged + print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs + print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org + print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not + print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities + + +class TechCompanyRecognizer(object): + """Example of a spaCy v2.0 pipeline component that sets entity annotations + based on list of single or multiple-word company names. Companies are + labelled as ORG and their spans are merged into one token. Additionally, + ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token + respectively.""" + name = 'tech_companies' # component name, will show up in the pipeline + + def __init__(self, nlp, companies=tuple(), label='ORG'): + """Initialise the pipeline component. The shared nlp instance is used + to initialise the matcher with the shared vocab, get the label ID and + generate Doc objects as phrase match patterns. + """ + self.label = nlp.vocab.strings[label] # get entity label ID + + # Set up the PhraseMatcher – it can now take Doc objects as patterns, + # so even if the list of companies is long, it's very efficient + patterns = [nlp(org) for org in companies] + self.matcher = PhraseMatcher(nlp.vocab) + self.matcher.add('TECH_ORGS', None, *patterns) + + # Register attribute on the Token. We'll be overwriting this based on + # the matches, so we're only setting a default value, not a getter. + Token.set_extension('is_tech_org', default=False) + + # Register attributes on Doc and Span via a getter that checks if one of + # the contained tokens is set to is_tech_org == True. + Doc.set_extension('has_tech_org', getter=self.has_tech_org) + Span.set_extension('has_tech_org', getter=self.has_tech_org) + + def __call__(self, doc): + """Apply the pipeline component on a Doc object and modify it if matches + are found. Return the Doc, so it can be processed by the next component + in the pipeline, if available. + """ + matches = self.matcher(doc) + spans = [] # keep the spans for later so we can merge them afterwards + for _, start, end in matches: + # Generate Span representing the entity & set label + entity = Span(doc, start, end, label=self.label) + spans.append(entity) + # Set custom attribute on each token of the entity + for token in entity: + token._.set('is_tech_org', True) + # Overwrite doc.ents and add entity – be careful not to replace! + doc.ents = list(doc.ents) + [entity] + for span in spans: + # Iterate over all spans and merge them into one token. This is done + # after setting the entities – otherwise, it would cause mismatched + # indices! + span.merge() + return doc # don't forget to return the Doc! + + def has_tech_org(self, tokens): + """Getter for Doc and Span attributes. Returns True if one of the tokens + is a tech org. 
Since the getter is only called when we access the + attribute, we can refer to the Token's 'is_tech_org' attribute here, + which is already set in the processing step.""" + return any([t._.get('is_tech_org') for t in tokens]) + + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # Pipeline ['tech_companies'] + # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.'] + # Doc has_tech_org True + # Token 0 is_tech_org True + # Token 1 is_tech_org False + # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')] diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py new file mode 100644 index 000000000..19b1c462a --- /dev/null +++ b/examples/pipeline/multi_processing.py @@ -0,0 +1,73 @@ +""" +Example of multi-processing with Joblib. Here, we're exporting +part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with +each "sentence" on a newline, and spaces between tokens. Data is loaded from +the IMDB movie reviews dataset and will be loaded automatically via Thinc's +built-in dataset loader. + +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import print_function, unicode_literals +from toolz import partition_all +from pathlib import Path +from joblib import Parallel, delayed +import thinc.extra.datasets +import plac +import spacy + + +@plac.annotations( + output_dir=("Output directory", "positional", None, Path), + model=("Model name (needs tagger)", "positional", None, str), + n_jobs=("Number of workers", "option", "n", int), + batch_size=("Batch-size for each process", "option", "b", int), + limit=("Limit of entries from the dataset", "option", "l", int)) +def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000, + limit=10000): + nlp = spacy.load(model) # load spaCy model + print("Loaded model '%s'" % model) + if not output_dir.exists(): + output_dir.mkdir() + # load and pre-process the IMBD dataset + print("Loading IMDB data...") + data, _ = thinc.extra.datasets.imdb() + texts, _ = zip(*data[-limit:]) + partitions = partition_all(batch_size, texts) + items = ((i, [nlp(text) for text in texts], output_dir) for i, texts + in enumerate(partitions)) + Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items) + + +def transform_texts(batch_id, docs, output_dir): + out_path = Path(output_dir) / ('%d.txt' % batch_id) + if out_path.exists(): # return None in case same batch is called again + return None + print('Processing batch', batch_id) + with out_path.open('w', encoding='utf8') as f: + for doc in docs: + f.write(' '.join(represent_word(w) for w in doc if not w.is_space)) + f.write('\n') + print('Saved {} texts to {}.txt'.format(len(docs), batch_id)) + + +def represent_word(word): + text = word.text + # True-case, i.e. try to normalize sentence-initial capitals. + # Only do this if the lower-cased form is more probable. 
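+    # e.g. a sentence-initial "The" becomes "the" if the vocab assigns the
+    # lower-cased form a higher probability, while title-cased words elsewhere
+    # in the sentence (like "London") are left unchanged.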
+ if text.istitle() and is_sent_begin(word) \ + and word.prob < word.doc.vocab[text.lower()].prob: + text = text.lower() + return text + '|' + word.tag_ + + +def is_sent_begin(word): + if word.i == 0: + return True + elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'): + return True + else: + return False + + +if __name__ == '__main__': + plac.call(main) diff --git a/examples/pos_tag.py b/examples/pos_tag.py deleted file mode 100644 index 1dd6add0f..000000000 --- a/examples/pos_tag.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Print part-of-speech tagged, true-cased, (very roughly) sentence-separated -text, with each "sentence" on a newline, and spaces between tokens. Supports -multi-processing. -""" -from __future__ import print_function, unicode_literals, division -import io -import bz2 -import logging -from toolz import partition -from os import path - -import spacy.en - -from joblib import Parallel, delayed -import plac -import ujson - - -def parallelize(func, iterator, n_jobs, extra): - extra = tuple(extra) - return Parallel(n_jobs=n_jobs)(delayed(func)(*(item + extra)) for item in iterator) - - -def iter_texts_from_json_bz2(loc): - """ - Iterator of unicode strings, one per document (here, a comment). - - Expects a a path to a BZ2 file, which should be new-line delimited JSON. The - document text should be in a string field titled 'body'. - - This is the data format of the Reddit comments corpus. - """ - with bz2.BZ2File(loc) as file_: - for i, line in enumerate(file_): - yield ujson.loads(line)['body'] - - -def transform_texts(batch_id, input_, out_dir): - out_loc = path.join(out_dir, '%d.txt' % batch_id) - if path.exists(out_loc): - return None - print('Batch', batch_id) - nlp = spacy.en.English(parser=False, entity=False) - with io.open(out_loc, 'w', encoding='utf8') as file_: - for text in input_: - doc = nlp(text) - file_.write(' '.join(represent_word(w) for w in doc if not w.is_space)) - file_.write('\n') - - -def represent_word(word): - text = word.text - # True-case, i.e. try to normalize sentence-initial capitals. - # Only do this if the lower-cased form is more probable. 
- if text.istitle() \ - and is_sent_begin(word) \ - and word.prob < word.doc.vocab[text.lower()].prob: - text = text.lower() - return text + '|' + word.tag_ - - -def is_sent_begin(word): - # It'd be nice to have some heuristics like these in the library, for these - # times where we don't care so much about accuracy of SBD, and we don't want - # to parse - if word.i == 0: - return True - elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'): - return True - else: - return False - - -@plac.annotations( - in_loc=("Location of input file"), - out_dir=("Location of input file"), - n_workers=("Number of workers", "option", "n", int), - batch_size=("Batch-size for each process", "option", "b", int) -) -def main(in_loc, out_dir, n_workers=4, batch_size=100000): - if not path.exists(out_dir): - path.join(out_dir) - texts = partition(batch_size, iter_texts_from_json_bz2(in_loc)) - parallelize(transform_texts, enumerate(texts), n_workers, [out_dir]) - - -if __name__ == '__main__': - plac.call(main) - diff --git a/examples/training/load_ner.py b/examples/training/load_ner.py deleted file mode 100644 index bf81cee50..000000000 --- a/examples/training/load_ner.py +++ /dev/null @@ -1,22 +0,0 @@ -# Load NER -from __future__ import unicode_literals -import spacy -import pathlib -from spacy.pipeline import EntityRecognizer -from spacy.vocab import Vocab - -def load_model(model_dir): - model_dir = pathlib.Path(model_dir) - nlp = spacy.load('en', parser=False, entity=False, add_vectors=False) - with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: - nlp.vocab.strings.load(file_) - nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin') - ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True) - return (nlp, ner) - -(nlp, ner) = load_model('ner') -doc = nlp.make_doc('Who is Shaka Khan?') -nlp.tagger(doc) -ner(doc) -for word in doc: - print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob) diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py new file mode 100644 index 000000000..def0ed370 --- /dev/null +++ b/examples/training/train_intent_parser.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +# coding: utf-8 +"""Using the parser to recognise your own semantics + +spaCy's parser component can be used to trained to predict any type of tree +structure over your input text. You can also predict trees over whole documents +or chat logs, with connections between the sentence-roots used to annotate +discourse structure. In this example, we'll build a message parser for a common +"chat intent": finding local businesses. Our message semantics will have the +following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION. + +"show me the best hotel in berlin" +('show', 'ROOT', 'show') +('best', 'QUALITY', 'hotel') --> hotel with QUALITY best +('hotel', 'PLACE', 'show') --> show PLACE hotel +('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin +""" +from __future__ import unicode_literals, print_function + +import plac +import random +import spacy +from spacy.gold import GoldParse +from spacy.tokens import Doc +from pathlib import Path + + +# training data: words, head and dependency labels +# for no relation, we simply chose an arbitrary dependency label, e.g. 
'-' +TRAIN_DATA = [ + ( + ['find', 'a', 'cafe', 'with', 'great', 'wifi'], + [0, 2, 0, 5, 5, 2], # index of token head + ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE'] + ), + ( + ['find', 'a', 'hotel', 'near', 'the', 'beach'], + [0, 2, 0, 5, 5, 2], + ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE'] + ), + ( + ['find', 'me', 'the', 'closest', 'gym', 'that', "'s", 'open', 'late'], + [0, 0, 4, 4, 0, 6, 4, 6, 6], + ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME'] + ), + ( + ['show', 'me', 'the', 'cheapest', 'store', 'that', 'sells', 'flowers'], + [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store! + ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT'] + ), + ( + ['find', 'a', 'nice', 'restaurant', 'in', 'london'], + [0, 3, 3, 0, 3, 3], + ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] + ), + ( + ['show', 'me', 'the', 'coolest', 'hostel', 'in', 'berlin'], + [0, 0, 4, 4, 0, 4, 4], + ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] + ), + ( + ['find', 'a', 'good', 'italian', 'restaurant', 'near', 'work'], + [0, 4, 4, 4, 0, 4, 5], + ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION'] + ) +] + + +@plac.annotations( + model=("Model name. Defaults to blank 'en' model.", "option", "m", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(model=None, output_dir=None, n_iter=100): + """Load the model, set up the pipeline and train the parser.""" + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") + + # add the parser to the pipeline if it doesn't exist + # nlp.create_pipe works for built-ins that are registered with spaCy + if 'parser' not in nlp.pipe_names: + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser, first=True) + # otherwise, get it, so we can add labels to it + else: + parser = nlp.get_pipe('parser') + + for _, _, deps in TRAIN_DATA: + for dep in deps: + parser.add_label(dep) + + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] + with nlp.disable_pipes(*other_pipes): # only train parser + optimizer = nlp.begin_training(lambda: []) + for itn in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + for words, heads, deps in TRAIN_DATA: + doc = Doc(nlp.vocab, words=words) + gold = GoldParse(doc, heads=heads, deps=deps) + nlp.update([doc], [gold], sgd=optimizer, losses=losses) + print(losses) + + # test the trained model + test_model(nlp) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + test_model(nlp2) + + +def test_model(nlp): + texts = ["find a hotel with good wifi", + "find me the cheapest gym near work", + "show me the best hotel in berlin"] + docs = nlp.pipe(texts) + for doc in docs: + print(doc.text) + print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) + + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # find a hotel with good wifi + # [ + # ('find', 'ROOT', 'find'), + # ('hotel', 'PLACE', 'find'), + # ('good', 'QUALITY', 'wifi'), + # ('wifi', 'ATTRIBUTE', 'hotel') + # ] + # find me the cheapest gym near work + # [ + # ('find', 'ROOT', 
'find'), + # ('cheapest', 'QUALITY', 'gym'), + # ('gym', 'PLACE', 'find') + # ] + # show me the best hotel in berlin + # [ + # ('show', 'ROOT', 'show'), + # ('best', 'QUALITY', 'hotel'), + # ('hotel', 'PLACE', 'show'), + # ('berlin', 'LOCATION', 'hotel') + # ] diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index e9ae013d3..499807d23 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -1,13 +1,103 @@ +#!/usr/bin/env python +# coding: utf8 +""" +Example of training spaCy's named entity recognizer, starting off with an +existing model or a blank model. + +For more details, see the documentation: +* Training: https://alpha.spacy.io/usage/training +* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities + +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals, print_function +import plac import random +from pathlib import Path -from spacy.lang.en import English +import spacy from spacy.gold import GoldParse, biluo_tags_from_offsets +# training data +TRAIN_DATA = [ + ('Who is Shaka Khan?', [(7, 17, 'PERSON')]), + ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')]) +] + + +@plac.annotations( + model=("Model name. Defaults to blank 'en' model.", "option", "m", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(model=None, output_dir=None, n_iter=100): + """Load the model, set up the pipeline and train the entity recognizer.""" + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") + + # create the built-in pipeline components and add them to the pipeline + # nlp.create_pipe works for built-ins that are registered with spaCy + if 'ner' not in nlp.pipe_names: + ner = nlp.create_pipe('ner') + nlp.add_pipe(ner, last=True) + + # function that allows begin_training to get the training data + get_data = lambda: reformat_train_data(nlp.tokenizer, TRAIN_DATA) + + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] + with nlp.disable_pipes(*other_pipes): # only train NER + optimizer = nlp.begin_training(get_data) + for itn in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + for raw_text, entity_offsets in TRAIN_DATA: + doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + nlp.update( + [doc], # Batch of Doc objects + [gold], # Batch of GoldParse objects + drop=0.5, # Dropout -- make it harder to memorise data + sgd=optimizer, # Callable to update weights + losses=losses) + print(losses) + + # test the trained model + for text, _ in TRAIN_DATA: + doc = nlp(text) + print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) + print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + for text, _ in TRAIN_DATA: + doc = nlp2(text) + print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) + print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) + + def 
reformat_train_data(tokenizer, examples): - """Reformat data to match JSON format""" + """Reformat data to match JSON format. + https://alpha.spacy.io/api/annotation#json-input + + tokenizer (Tokenizer): Tokenizer to process the raw text. + examples (list): The trainig data. + RETURNS (list): The reformatted training data.""" output = [] for i, (text, entity_offsets) in enumerate(examples): doc = tokenizer(text) @@ -21,49 +111,5 @@ def reformat_train_data(tokenizer, examples): return output -def main(model_dir=None): - train_data = [ - ( - 'Who is Shaka Khan?', - [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')] - ), - ( - 'I like London and Berlin.', - [(len('I like '), len('I like London'), 'LOC'), - (len('I like London and '), len('I like London and Berlin'), 'LOC')] - ) - ] - nlp = English(pipeline=['tensorizer', 'ner']) - get_data = lambda: reformat_train_data(nlp.tokenizer, train_data) - optimizer = nlp.begin_training(get_data) - for itn in range(100): - random.shuffle(train_data) - losses = {} - for raw_text, entity_offsets in train_data: - doc = nlp.make_doc(raw_text) - gold = GoldParse(doc, entities=entity_offsets) - nlp.update( - [doc], # Batch of Doc objects - [gold], # Batch of GoldParse objects - drop=0.5, # Dropout -- make it harder to memorise data - sgd=optimizer, # Callable to update weights - losses=losses) - print(losses) - print("Save to", model_dir) - nlp.to_disk(model_dir) - print("Load from", model_dir) - nlp = spacy.lang.en.English(pipeline=['tensorizer', 'ner']) - nlp.from_disk(model_dir) - for raw_text, _ in train_data: - doc = nlp(raw_text) - for word in doc: - print(word.text, word.ent_type_, word.ent_iob_) - if __name__ == '__main__': - import plac plac.call(main) - # Who "" 2 - # is "" 2 - # Shaka "" PERSON 3 - # Khan "" PERSON 1 - # ? "" 2 diff --git a/examples/training/train_ner_standalone.py b/examples/training/train_ner_standalone.py deleted file mode 100644 index 9591d1b71..000000000 --- a/examples/training/train_ner_standalone.py +++ /dev/null @@ -1,244 +0,0 @@ -#!/usr/bin/env python -'''Example of training a named entity recognition system from scratch using spaCy - -This example is written to be self-contained and reasonably transparent. -To achieve that, it duplicates some of spaCy's internal functionality. - -Specifically, in this example, we don't use spaCy's built-in Language class to -wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write -our own simle Pipeline class, so that it's easier to see how the pieces -interact. 
- -Input data: -https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip - -Developed for: spaCy 1.7.1 -Last tested for: spaCy 1.7.1 -''' -from __future__ import unicode_literals, print_function -import plac -from pathlib import Path -import random -import json - -import spacy.orth as orth_funcs -from spacy.vocab import Vocab -from spacy.pipeline import BeamEntityRecognizer -from spacy.pipeline import EntityRecognizer -from spacy.tokenizer import Tokenizer -from spacy.tokens import Doc -from spacy.attrs import * -from spacy.gold import GoldParse -from spacy.gold import _iob_to_biluo as iob_to_biluo -from spacy.scorer import Scorer - -try: - unicode -except NameError: - unicode = str - - -def init_vocab(): - return Vocab( - lex_attr_getters={ - LOWER: lambda string: string.lower(), - SHAPE: orth_funcs.word_shape, - PREFIX: lambda string: string[0], - SUFFIX: lambda string: string[-3:], - CLUSTER: lambda string: 0, - IS_ALPHA: orth_funcs.is_alpha, - IS_ASCII: orth_funcs.is_ascii, - IS_DIGIT: lambda string: string.isdigit(), - IS_LOWER: orth_funcs.is_lower, - IS_PUNCT: orth_funcs.is_punct, - IS_SPACE: lambda string: string.isspace(), - IS_TITLE: orth_funcs.is_title, - IS_UPPER: orth_funcs.is_upper, - IS_STOP: lambda string: False, - IS_OOV: lambda string: True - }) - - -def save_vocab(vocab, path): - path = Path(path) - if not path.exists(): - path.mkdir() - elif not path.is_dir(): - raise IOError("Can't save vocab to %s\nNot a directory" % path) - with (path / 'strings.json').open('w') as file_: - vocab.strings.dump(file_) - vocab.dump((path / 'lexemes.bin').as_posix()) - - -def load_vocab(path): - path = Path(path) - if not path.exists(): - raise IOError("Cannot load vocab from %s\nDoes not exist" % path) - if not path.is_dir(): - raise IOError("Cannot load vocab from %s\nNot a directory" % path) - return Vocab.load(path) - - -def init_ner_model(vocab, features=None): - if features is None: - features = tuple(EntityRecognizer.feature_templates) - return EntityRecognizer(vocab, features=features) - - -def save_ner_model(model, path): - path = Path(path) - if not path.exists(): - path.mkdir() - if not path.is_dir(): - raise IOError("Can't save model to %s\nNot a directory" % path) - model.model.dump((path / 'model').as_posix()) - with (path / 'config.json').open('w') as file_: - data = json.dumps(model.cfg) - if not isinstance(data, unicode): - data = data.decode('utf8') - file_.write(data) - - -def load_ner_model(vocab, path): - return EntityRecognizer.load(path, vocab) - - -class Pipeline(object): - @classmethod - def load(cls, path): - path = Path(path) - if not path.exists(): - raise IOError("Cannot load pipeline from %s\nDoes not exist" % path) - if not path.is_dir(): - raise IOError("Cannot load pipeline from %s\nNot a directory" % path) - vocab = load_vocab(path) - tokenizer = Tokenizer(vocab, {}, None, None, None) - ner_model = load_ner_model(vocab, path / 'ner') - return cls(vocab, tokenizer, ner_model) - - def __init__(self, vocab=None, tokenizer=None, entity=None): - if vocab is None: - vocab = init_vocab() - if tokenizer is None: - tokenizer = Tokenizer(vocab, {}, None, None, None) - if entity is None: - entity = init_ner_model(self.vocab) - self.vocab = vocab - self.tokenizer = tokenizer - self.entity = entity - self.pipeline = [self.entity] - - def __call__(self, input_): - doc = self.make_doc(input_) - for process in self.pipeline: - process(doc) - return doc - - def make_doc(self, input_): - if isinstance(input_, 
bytes): - input_ = input_.decode('utf8') - if isinstance(input_, unicode): - return self.tokenizer(input_) - else: - return Doc(self.vocab, words=input_) - - def make_gold(self, input_, annotations): - doc = self.make_doc(input_) - gold = GoldParse(doc, entities=annotations) - return gold - - def update(self, input_, annot): - doc = self.make_doc(input_) - gold = self.make_gold(input_, annot) - for ner in gold.ner: - if ner not in (None, '-', 'O'): - action, label = ner.split('-', 1) - self.entity.add_label(label) - return self.entity.update(doc, gold) - - def evaluate(self, examples): - scorer = Scorer() - for input_, annot in examples: - gold = self.make_gold(input_, annot) - doc = self(input_) - scorer.score(doc, gold) - return scorer.scores - - def average_weights(self): - self.entity.model.end_training() - - def save(self, path): - path = Path(path) - if not path.exists(): - path.mkdir() - elif not path.is_dir(): - raise IOError("Can't save pipeline to %s\nNot a directory" % path) - save_vocab(self.vocab, path / 'vocab') - save_ner_model(self.entity, path / 'ner') - - -def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5): - next_epoch = train_examples - print("Iter", "Loss", "P", "R", "F") - for i in range(nr_epoch): - this_epoch = next_epoch - next_epoch = [] - loss = 0 - for input_, annot in this_epoch: - loss += nlp.update(input_, annot) - if (i+1) < nr_epoch: - next_epoch.append((input_, annot)) - random.shuffle(next_epoch) - scores = nlp.evaluate(dev_examples) - report_scores(i, loss, scores) - nlp.average_weights() - scores = nlp.evaluate(dev_examples) - report_scores(channels, i+1, loss, scores) - - -def report_scores(i, loss, scores): - precision = '%.2f' % scores['ents_p'] - recall = '%.2f' % scores['ents_r'] - f_measure = '%.2f' % scores['ents_f'] - print('%d %s %s %s' % (int(loss), precision, recall, f_measure)) - - -def read_examples(path): - path = Path(path) - with path.open() as file_: - sents = file_.read().strip().split('\n\n') - for sent in sents: - if not sent.strip(): - continue - tokens = sent.split('\n') - while tokens and tokens[0].startswith('#'): - tokens.pop(0) - words = [] - iob = [] - for token in tokens: - if token.strip(): - pieces = token.split() - words.append(pieces[1]) - iob.append(pieces[2]) - yield words, iob_to_biluo(iob) - - -@plac.annotations( - model_dir=("Path to save the model", "positional", None, Path), - train_loc=("Path to your training data", "positional", None, Path), - dev_loc=("Path to your development data", "positional", None, Path), -) -def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'), - train_loc=None, dev_loc=None, nr_epoch=30): - - train_examples = read_examples(train_loc) - dev_examples = read_examples(dev_loc) - nlp = Pipeline.load(model_dir) - - train(nlp, train_examples, list(dev_examples), ctx, nr_epoch) - - nlp.save(model_dir) - - -if __name__ == '__main__': - main() diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4eae11c75..ec1e562c6 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -21,111 +21,120 @@ After training your model, you can save it to a directory. We recommend wrapping models as Python packages, for ease of deployment. 
For more details, see the documentation: -* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner -* Saving and loading models: https://spacy.io/docs/usage/saving-loading +* Training: https://alpha.spacy.io/usage/training +* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities -Developed for: spaCy 1.7.6 -Last tested for: spaCy 1.7.6 +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 """ from __future__ import unicode_literals, print_function +import plac import random from pathlib import Path -import random import spacy -from spacy.gold import GoldParse -from spacy.tagger import Tagger +from spacy.gold import GoldParse, minibatch -def train_ner(nlp, train_data, output_dir): - # Add new words to vocab - for raw_text, _ in train_data: - doc = nlp.make_doc(raw_text) - for word in doc: - _ = nlp.vocab[word.orth] - random.seed(0) - # You may need to change the learning rate. It's generally difficult to - # guess what rate you should set, especially when you have limited data. - nlp.entity.model.learn_rate = 0.001 - for itn in range(1000): - random.shuffle(train_data) - loss = 0. - for raw_text, entity_offsets in train_data: - gold = GoldParse(doc, entities=entity_offsets) - # By default, the GoldParse class assumes that the entities - # described by offset are complete, and all other words should - # have the tag 'O'. You can tell it to make no assumptions - # about the tag of a word by giving it the tag '-'. - # However, this allows a trivial solution to the current - # learning problem: if words are either 'any tag' or 'ANIMAL', - # the model can learn that all words can be tagged 'ANIMAL'. - #for i in range(len(gold.ner)): - #if not gold.ner[i].endswith('ANIMAL'): - # gold.ner[i] = '-' - doc = nlp.make_doc(raw_text) - nlp.tagger(doc) - # As of 1.9, spaCy's parser now lets you supply a dropout probability - # This might help the model generalize better from only a few - # examples. - loss += nlp.entity.update(doc, gold, drop=0.9) - if loss == 0: - break - # This step averages the model's weights. This may or may not be good for - # your situation --- it's empirical. - nlp.end_training() - if output_dir: - if not output_dir.exists(): - output_dir.mkdir() - nlp.save_to_directory(output_dir) +# new entity label +LABEL = 'ANIMAL' + +# training data +TRAIN_DATA = [ + ("Horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + + ("Do they bite?", []), + + ("horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + + ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]), + + ("they pretend to care about your feelings, those horses", + [(48, 54, 'ANIMAL')]), + + ("horses?", [(0, 6, 'ANIMAL')]) +] -def main(model_name, output_directory=None): - print("Loading initial model", model_name) - nlp = spacy.load(model_name) - if output_directory is not None: - output_directory = Path(output_directory) +@plac.annotations( + model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), + new_model_name=("New model name for model meta.", "option", "nm", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(model=None, new_model_name='animal', output_dir=None, n_iter=50): + """Set up the pipeline and entity recognizer, and train the new entity.""" + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") - train_data = [ - ( - "Horses are too tall and they pretend to care about your feelings", - [(0, 6, 'ANIMAL')], - ), - ( - "horses are too tall and they pretend to care about your feelings", - [(0, 6, 'ANIMAL')] - ), - ( - "horses pretend to care about your feelings", - [(0, 6, 'ANIMAL')] - ), - ( - "they pretend to care about your feelings, those horses", - [(48, 54, 'ANIMAL')] - ), - ( - "horses?", - [(0, 6, 'ANIMAL')] - ) + # Add entity recognizer to model if it's not in the pipeline + # nlp.create_pipe works for built-ins that are registered with spaCy + if 'ner' not in nlp.pipe_names: + ner = nlp.create_pipe('ner') + nlp.add_pipe(ner) + # otherwise, get it, so we can add labels to it + else: + ner = nlp.get_pipe('ner') - ] - nlp.entity.add_label('ANIMAL') - train_ner(nlp, train_data, output_directory) + ner.add_label(LABEL) # add new entity label to entity recognizer - # Test that the entity is recognized - doc = nlp('Do you like horses?') - print("Ents in 'Do you like horses?':") + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] + with nlp.disable_pipes(*other_pipes): # only train NER + random.seed(0) + optimizer = nlp.begin_training(lambda: []) + for itn in range(n_iter): + losses = {} + gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA) + for batch in minibatch(gold_parses, size=3): + docs, golds = zip(*batch) + nlp.update(docs, golds, losses=losses, sgd=optimizer, + drop=0.35) + print(losses) + + # test the trained model + test_text = 'Do you like horses?' + doc = nlp(test_text) + print("Entities in '%s'" % test_text) for ent in doc.ents: print(ent.label_, ent.text) - if output_directory: - print("Loading from", output_directory) - nlp2 = spacy.load('en', path=output_directory) - nlp2.entity.add_label('ANIMAL') - doc2 = nlp2('Do you like horses?') + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.meta['name'] = new_model_name # rename model + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + doc2 = nlp2(test_text) for ent in doc2.ents: print(ent.label_, ent.text) +def get_gold_parses(tokenizer, train_data): + """Shuffle and create GoldParse objects. + + tokenizer (Tokenizer): Tokenizer to processs the raw text. + train_data (list): The training data. + YIELDS (tuple): (doc, gold) tuples. 
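+    The examples are reshuffled on every call, so each training iteration
+    sees them in a different order.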
+ """ + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + doc = tokenizer(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + yield doc, gold + + if __name__ == '__main__': - import plac plac.call(main) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index 8c3119704..a23d73ec7 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -1,75 +1,109 @@ +#!/usr/bin/env python +# coding: utf8 +""" +Example of training spaCy dependency parser, starting off with an existing model +or a blank model. + +For more details, see the documentation: +* Training: https://alpha.spacy.io/usage/training +* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse + +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals, print_function -import json -import pathlib + +import plac import random +from pathlib import Path import spacy -from spacy.pipeline import DependencyParser from spacy.gold import GoldParse from spacy.tokens import Doc -def train_parser(nlp, train_data, left_labels, right_labels): - parser = DependencyParser( - nlp.vocab, - left_labels=left_labels, - right_labels=right_labels) - for itn in range(1000): - random.shuffle(train_data) - loss = 0 - for words, heads, deps in train_data: - doc = Doc(nlp.vocab, words=words) - gold = GoldParse(doc, heads=heads, deps=deps) - loss += parser.update(doc, gold) - parser.model.end_training() - return parser +# training data +TRAIN_DATA = [ + ( + ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'], + [1, 1, 4, 4, 5, 1, 1], + ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct'] + ), + ( + ['I', 'like', 'London', 'and', 'Berlin', '.'], + [1, 1, 1, 2, 2, 1], + ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'] + ) +] -def main(model_dir=None): - if model_dir is not None: - model_dir = pathlib.Path(model_dir) - if not model_dir.exists(): - model_dir.mkdir() - assert model_dir.is_dir() +@plac.annotations( + model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(model=None, output_dir=None, n_iter=1000): + """Load the model, set up the pipeline and train the parser.""" + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") - nlp = spacy.load('en', tagger=False, parser=False, entity=False, add_vectors=False) + # add the parser to the pipeline if it doesn't exist + # nlp.create_pipe works for built-ins that are registered with spaCy + if 'parser' not in nlp.pipe_names: + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser, first=True) + # otherwise, get it, so we can add labels to it + else: + parser = nlp.get_pipe('parser') - train_data = [ - ( - ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'], - [1, 1, 4, 4, 5, 1, 1], - ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct'] - ), - ( - ['I', 'like', 'London', 'and', 'Berlin', '.'], - [1, 1, 1, 2, 2, 1], - ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'] - ) - ] - left_labels = set() - right_labels = set() - for _, heads, deps in train_data: - for i, (head, dep) in enumerate(zip(heads, deps)): - if i < head: - left_labels.add(dep) - elif i > head: - right_labels.add(dep) - parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels)) + # add labels to the parser + for _, _, deps in TRAIN_DATA: + for dep in deps: + parser.add_label(dep) - doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.']) - parser(doc) - for word in doc: - print(word.text, word.dep_, word.head.text) + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] + with nlp.disable_pipes(*other_pipes): # only train parser + optimizer = nlp.begin_training(lambda: []) + for itn in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + for words, heads, deps in TRAIN_DATA: + doc = Doc(nlp.vocab, words=words) + gold = GoldParse(doc, heads=heads, deps=deps) + nlp.update([doc], [gold], sgd=optimizer, losses=losses) + print(losses) - if model_dir is not None: - with (model_dir / 'config.json').open('w') as file_: - json.dump(parser.cfg, file_) - parser.model.dump(str(model_dir / 'model')) + # test the trained model + test_text = "I like securities." + doc = nlp(test_text) + print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + doc = nlp2(test_text) + print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) if __name__ == '__main__': - main() - # I nsubj like - # like ROOT like - # securities dobj like - # . 
cc securities + plac.call(main) + + # expected result: + # [ + # ('I', 'nsubj', 'like'), + # ('like', 'ROOT', 'like'), + # ('securities', 'dobj', 'like'), + # ('.', 'punct', 'like') + # ] diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index d5a519942..c6fc1de88 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -1,18 +1,28 @@ -"""A quick example for training a part-of-speech tagger, without worrying -about the tokenization, or other language-specific customizations.""" +#!/usr/bin/env python +# coding: utf8 +""" +A simple example for training a part-of-speech tagger with a custom tag map. +To allow us to update the tag map with our custom one, this example starts off +with a blank Language class and modifies its defaults. -from __future__ import unicode_literals -from __future__ import print_function +For more details, see the documentation: +* Training: https://alpha.spacy.io/usage/training +* POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging + +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function import plac +import random from pathlib import Path -from spacy.vocab import Vocab -from spacy.tagger import Tagger +import spacy +from spacy.util import get_lang_class from spacy.tokens import Doc from spacy.gold import GoldParse -import random # You need to define a mapping from your data's part-of-speech tag names to the # Universal Part-of-Speech tag set, as spaCy includes an enum of these tags. @@ -28,54 +38,67 @@ TAG_MAP = { # Usually you'll read this in, of course. Data formats vary. # Ensure your strings are unicode. -DATA = [ - ( - ["I", "like", "green", "eggs"], - ["N", "V", "J", "N"] - ), - ( - ["Eat", "blue", "ham"], - ["V", "J", "N"] - ) +TRAIN_DATA = [ + (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]), + (["Eat", "blue", "ham"], ["V", "J", "N"]) ] -def ensure_dir(path): - if not path.exists(): - path.mkdir() +@plac.annotations( + lang=("ISO Code of language to use", "option", "l", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(lang='en', output_dir=None, n_iter=25): + """Create a new model, set up the pipeline and train the tagger. In order to + train the tagger with a custom tag map, we're creating a new Language + instance with a custom vocab. 
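+    The custom TAG_MAP above maps the data's tag names ('N', 'V', 'J') to
+    Universal POS tags, so token.pos_ is populated alongside token.tag_.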
+ """ + lang_cls = get_lang_class(lang) # get Language class + lang_cls.Defaults.tag_map.update(TAG_MAP) # add tag map to defaults + nlp = lang_cls() # initialise Language class + # add the tagger to the pipeline + # nlp.create_pipe works for built-ins that are registered with spaCy + tagger = nlp.create_pipe('tagger') + nlp.add_pipe(tagger) -def main(output_dir=None): + optimizer = nlp.begin_training(lambda: []) + for i in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + for words, tags in TRAIN_DATA: + doc = Doc(nlp.vocab, words=words) + gold = GoldParse(doc, tags=tags) + nlp.update([doc], [gold], sgd=optimizer, losses=losses) + print(losses) + + # test the trained model + test_text = "I like blue eggs" + doc = nlp(test_text) + print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) + + # save model to output directory if output_dir is not None: output_dir = Path(output_dir) - ensure_dir(output_dir) - ensure_dir(output_dir / "pos") - ensure_dir(output_dir / "vocab") + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) - vocab = Vocab(tag_map=TAG_MAP) - # The default_templates argument is where features are specified. See - # spacy/tagger.pyx for the defaults. - tagger = Tagger(vocab) - for i in range(25): - for words, tags in DATA: - doc = Doc(vocab, words=words) - gold = GoldParse(doc, tags=tags) - tagger.update(doc, gold) - random.shuffle(DATA) - tagger.model.end_training() - doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True] * 4)) - tagger(doc) - for word in doc: - print(word.text, word.tag_, word.pos_) - if output_dir is not None: - tagger.model.dump(str(output_dir / 'pos' / 'model')) - with (output_dir / 'vocab' / 'strings.json').open('w') as file_: - tagger.vocab.strings.dump(file_) + # test the save model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + doc = nlp2(test_text) + print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) if __name__ == '__main__': plac.call(main) - # I V VERB - # like V VERB - # blue N NOUN - # eggs N NOUN + + # Expected output: + # [ + # ('I', 'N', 'NOUN'), + # ('like', 'V', 'VERB'), + # ('blue', 'J', 'ADJ'), + # ('eggs', 'N', 'NOUN') + # ] diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index eefae111f..1f9cd29aa 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -1,108 +1,136 @@ -from __future__ import unicode_literals +#!/usr/bin/env python +# coding: utf8 +"""Train a multi-label convolutional neural network text classifier on the +IMDB dataset, using the TextCategorizer component. The dataset will be loaded +automatically via Thinc's built-in dataset loader. The model is added to +spacy.pipeline, and predictions are available via `doc.cats`. 
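+The example trains a single 'POSITIVE' label derived from the binary IMDB
+sentiment annotations and reports precision, recall and F-score on a held-out
+split of the training data.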
+ +For more details, see the documentation: +* Training: https://alpha.spacy.io/usage/training +* Text classification: https://alpha.spacy.io/usage/text-classification + +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function import plac import random -import tqdm - -from thinc.neural.optimizers import Adam -from thinc.neural.ops import NumpyOps +from pathlib import Path import thinc.extra.datasets -import spacy.lang.en +import spacy from spacy.gold import GoldParse, minibatch from spacy.util import compounding from spacy.pipeline import TextCategorizer -def train_textcat(tokenizer, textcat, - train_texts, train_cats, dev_texts, dev_cats, - n_iter=20): - ''' - Train the TextCategorizer without associated pipeline. - ''' - textcat.begin_training() - optimizer = Adam(NumpyOps(), 0.001) - train_docs = [tokenizer(text) for text in train_texts] +@plac.annotations( + model=("Model name. Defaults to blank 'en' model.", "option", "m", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int)) +def main(model=None, output_dir=None, n_iter=20): + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + nlp = spacy.blank('en') # create blank Language class + print("Created blank 'en' model") + + # add the text classifier to the pipeline if it doesn't exist + # nlp.create_pipe works for built-ins that are registered with spaCy + if 'textcat' not in nlp.pipe_names: + # textcat = nlp.create_pipe('textcat') + textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE']) + nlp.add_pipe(textcat, last=True) + # otherwise, get it, so we can add labels to it + else: + textcat = nlp.get_pipe('textcat') + + # add label to text classifier + # textcat.add_label('POSITIVE') + + # load the IMBD dataset + print("Loading IMDB data...") + (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000) + train_docs = [nlp.tokenizer(text) for text in train_texts] train_gold = [GoldParse(doc, cats=cats) for doc, cats in zip(train_docs, train_cats)] - train_data = zip(train_docs, train_gold) - batch_sizes = compounding(4., 128., 1.001) - for i in range(n_iter): - losses = {} - train_data = tqdm.tqdm(train_data, leave=False) # Progress bar - for batch in minibatch(train_data, size=batch_sizes): - docs, golds = zip(*batch) - textcat.update((docs, None), golds, sgd=optimizer, drop=0.2, - losses=losses) - with textcat.model.use_params(optimizer.averages): - scores = evaluate(tokenizer, textcat, dev_texts, dev_cats) - yield losses['textcat'], scores + train_data = list(zip(train_docs, train_gold)) + + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] + with nlp.disable_pipes(*other_pipes): # only train textcat + optimizer = nlp.begin_training(lambda: []) + print("Training the model...") + print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) + for i in range(n_iter): + losses = {} + # batch up the examples using spaCy's minibatch + batches = minibatch(train_data, size=compounding(4., 128., 1.001)) + for batch in batches: + docs, golds = zip(*batch) + nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses) + with textcat.model.use_params(optimizer.averages): + # evaluate on the dev data split off in load_data() + scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) + 
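+                # scores contains 'textcat_p', 'textcat_r' and 'textcat_f',
+                # matching the P/R/F columns of the table printed below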
print('{0:.3f}\t{0:.3f}\t{0:.3f}\t{0:.3f}' # print a simple table + .format(losses['textcat'], scores['textcat_p'], + scores['textcat_r'], scores['textcat_f'])) + + # test the trained model + test_text = "This movie sucked" + doc = nlp(test_text) + print(test_text, doc.cats) + + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + doc2 = nlp2(test_text) + print(test_text, doc2.cats) + + +def load_data(limit=0, split=0.8): + """Load data from the IMDB dataset.""" + # Partition off part of the train data for evaluation + train_data, _ = thinc.extra.datasets.imdb() + random.shuffle(train_data) + train_data = train_data[-limit:] + texts, labels = zip(*train_data) + cats = [{'POSITIVE': bool(y)} for y in labels] + split = int(len(train_data) * split) + return (texts[:split], cats[:split]), (texts[split:], cats[split:]) def evaluate(tokenizer, textcat, texts, cats): docs = (tokenizer(text) for text in texts) - tp = 1e-8 # True positives - fp = 1e-8 # False positives - fn = 1e-8 # False negatives - tn = 1e-8 # True negatives + tp = 1e-8 # True positives + fp = 1e-8 # False positives + fn = 1e-8 # False negatives + tn = 1e-8 # True negatives for i, doc in enumerate(textcat.pipe(docs)): gold = cats[i] for label, score in doc.cats.items(): - if score >= 0.5 and label in gold: + if label not in gold: + continue + if score >= 0.5 and gold[label] >= 0.5: tp += 1. - elif score >= 0.5 and label not in gold: + elif score >= 0.5 and gold[label] < 0.5: fp += 1. - elif score < 0.5 and label not in gold: + elif score < 0.5 and gold[label] < 0.5: tn += 1 - if score < 0.5 and label in gold: + elif score < 0.5 and gold[label] >= 0.5: fn += 1 - precis = tp / (tp + fp) + precision = tp / (tp + fp) recall = tp / (tp + fn) - fscore = 2 * (precis * recall) / (precis + recall) - return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore} - - -def load_data(): - # Partition off part of the train data --- avoid running experiments - # against test. 
- train_data, _ = thinc.extra.datasets.imdb() - - random.shuffle(train_data) - - texts, labels = zip(*train_data) - cats = [(['POSITIVE'] if y else []) for y in labels] - - split = int(len(train_data) * 0.8) - - train_texts = texts[:split] - train_cats = cats[:split] - dev_texts = texts[split:] - dev_cats = cats[split:] - return (train_texts, train_cats), (dev_texts, dev_cats) - - -def main(model_loc=None): - nlp = spacy.lang.en.English() - tokenizer = nlp.tokenizer - textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE']) - - print("Load IMDB data") - (train_texts, train_cats), (dev_texts, dev_cats) = load_data() - - print("Itn.\tLoss\tP\tR\tF") - progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}' - - for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat, - train_texts, train_cats, - dev_texts, dev_cats, n_iter=20)): - print(progress.format(i=i, loss=loss, **scores)) - # How to save, load and use - nlp.pipeline.append(textcat) - if model_loc is not None: - nlp.to_disk(model_loc) - - nlp = spacy.load(model_loc) - doc = nlp(u'This movie sucked!') - print(doc.cats) + f_score = 2 * (precision * recall) / (precision + recall) + return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score} if __name__ == '__main__': diff --git a/examples/training/training-data.json b/examples/training/training-data.json new file mode 100644 index 000000000..7737b9a14 --- /dev/null +++ b/examples/training/training-data.json @@ -0,0 +1,641 @@ +[ + { + "id": "wsj_0200", + "paragraphs": [ + { + "raw": "In an Oct. 19 review of \"The Misanthrope\" at Chicago's Goodman Theatre (\"Revitalized Classics Take the Stage in Windy City,\" Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag. Ms. 
Haag plays Elianti.", + "sentences": [ + { + "tokens": [ + { + "head": 44, + "dep": "prep", + "tag": "IN", + "orth": "In", + "ner": "O", + "id": 0 + }, + { + "head": 3, + "dep": "det", + "tag": "DT", + "orth": "an", + "ner": "O", + "id": 1 + }, + { + "head": 2, + "dep": "nmod", + "tag": "NNP", + "orth": "Oct.", + "ner": "B-DATE", + "id": 2 + }, + { + "head": -1, + "dep": "nummod", + "tag": "CD", + "orth": "19", + "ner": "L-DATE", + "id": 3 + }, + { + "head": -4, + "dep": "pobj", + "tag": "NN", + "orth": "review", + "ner": "O", + "id": 4 + }, + { + "head": -1, + "dep": "prep", + "tag": "IN", + "orth": "of", + "ner": "O", + "id": 5 + }, + { + "head": 2, + "dep": "punct", + "tag": "``", + "orth": "``", + "ner": "O", + "id": 6 + }, + { + "head": 1, + "dep": "det", + "tag": "DT", + "orth": "The", + "ner": "B-WORK_OF_ART", + "id": 7 + }, + { + "head": -3, + "dep": "pobj", + "tag": "NN", + "orth": "Misanthrope", + "ner": "L-WORK_OF_ART", + "id": 8 + }, + { + "head": -1, + "dep": "punct", + "tag": "''", + "orth": "''", + "ner": "O", + "id": 9 + }, + { + "head": -2, + "dep": "prep", + "tag": "IN", + "orth": "at", + "ner": "O", + "id": 10 + }, + { + "head": 3, + "dep": "poss", + "tag": "NNP", + "orth": "Chicago", + "ner": "U-GPE", + "id": 11 + }, + { + "head": -1, + "dep": "case", + "tag": "POS", + "orth": "'s", + "ner": "O", + "id": 12 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Goodman", + "ner": "B-FAC", + "id": 13 + }, + { + "head": -4, + "dep": "pobj", + "tag": "NNP", + "orth": "Theatre", + "ner": "L-FAC", + "id": 14 + }, + { + "head": 4, + "dep": "punct", + "tag": "-LRB-", + "orth": "(", + "ner": "O", + "id": 15 + }, + { + "head": 3, + "dep": "punct", + "tag": "``", + "orth": "``", + "ner": "O", + "id": 16 + }, + { + "head": 1, + "dep": "amod", + "tag": "VBN", + "orth": "Revitalized", + "ner": "B-WORK_OF_ART", + "id": 17 + }, + { + "head": 1, + "dep": "nsubj", + "tag": "NNS", + "orth": "Classics", + "ner": "I-WORK_OF_ART", + "id": 18 + }, + { + "head": -15, + "dep": "appos", + "tag": "VBP", + "orth": "Take", + "ner": "I-WORK_OF_ART", + "id": 19 + }, + { + "head": 1, + "dep": "det", + "tag": "DT", + "orth": "the", + "ner": "I-WORK_OF_ART", + "id": 20 + }, + { + "head": -2, + "dep": "dobj", + "tag": "NN", + "orth": "Stage", + "ner": "I-WORK_OF_ART", + "id": 21 + }, + { + "head": -3, + "dep": "prep", + "tag": "IN", + "orth": "in", + "ner": "I-WORK_OF_ART", + "id": 22 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Windy", + "ner": "I-WORK_OF_ART", + "id": 23 + }, + { + "head": -2, + "dep": "pobj", + "tag": "NNP", + "orth": "City", + "ner": "L-WORK_OF_ART", + "id": 24 + }, + { + "head": -6, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 25 + }, + { + "head": -7, + "dep": "punct", + "tag": "''", + "orth": "''", + "ner": "O", + "id": 26 + }, + { + "head": -8, + "dep": "npadvmod", + "tag": "NN", + "orth": "Leisure", + "ner": "B-ORG", + "id": 27 + }, + { + "head": -1, + "dep": "cc", + "tag": "CC", + "orth": "&", + "ner": "I-ORG", + "id": 28 + }, + { + "head": -2, + "dep": "conj", + "tag": "NNS", + "orth": "Arts", + "ner": "L-ORG", + "id": 29 + }, + { + "head": -11, + "dep": "punct", + "tag": "-RRB-", + "orth": ")", + "ner": "O", + "id": 30 + }, + { + "head": 13, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 31 + }, + { + "head": 1, + "dep": "det", + "tag": "DT", + "orth": "the", + "ner": "O", + "id": 32 + }, + { + "head": 11, + "dep": "nsubjpass", + "tag": "NN", + "orth": "role", + "ner": "O", + "id": 33 + }, 
+ { + "head": -1, + "dep": "prep", + "tag": "IN", + "orth": "of", + "ner": "O", + "id": 34 + }, + { + "head": -1, + "dep": "pobj", + "tag": "NNP", + "orth": "Celimene", + "ner": "U-PERSON", + "id": 35 + }, + { + "head": -3, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 36 + }, + { + "head": -4, + "dep": "acl", + "tag": "VBN", + "orth": "played", + "ner": "O", + "id": 37 + }, + { + "head": -1, + "dep": "agent", + "tag": "IN", + "orth": "by", + "ner": "O", + "id": 38 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Kim", + "ner": "B-PERSON", + "id": 39 + }, + { + "head": -2, + "dep": "pobj", + "tag": "NNP", + "orth": "Cattrall", + "ner": "L-PERSON", + "id": 40 + }, + { + "head": -8, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 41 + }, + { + "head": 2, + "dep": "auxpass", + "tag": "VBD", + "orth": "was", + "ner": "O", + "id": 42 + }, + { + "head": 1, + "dep": "advmod", + "tag": "RB", + "orth": "mistakenly", + "ner": "O", + "id": 43 + }, + { + "head": 0, + "dep": "root", + "tag": "VBN", + "orth": "attributed", + "ner": "O", + "id": 44 + }, + { + "head": -1, + "dep": "prep", + "tag": "IN", + "orth": "to", + "ner": "O", + "id": 45 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Christina", + "ner": "B-PERSON", + "id": 46 + }, + { + "head": -2, + "dep": "pobj", + "tag": "NNP", + "orth": "Haag", + "ner": "L-PERSON", + "id": 47 + }, + { + "head": -4, + "dep": "punct", + "tag": ".", + "orth": ".", + "ner": "O", + "id": 48 + } + ], + "brackets": [ + { + "first": 2, + "last": 3, + "label": "NML" + }, + { + "first": 1, + "last": 4, + "label": "NP" + }, + { + "first": 7, + "last": 8, + "label": "NP-TTL" + }, + { + "first": 11, + "last": 12, + "label": "NP" + }, + { + "first": 11, + "last": 14, + "label": "NP" + }, + { + "first": 10, + "last": 14, + "label": "PP-LOC" + }, + { + "first": 6, + "last": 14, + "label": "NP" + }, + { + "first": 5, + "last": 14, + "label": "PP" + }, + { + "first": 1, + "last": 14, + "label": "NP" + }, + { + "first": 17, + "last": 18, + "label": "NP-SBJ" + }, + { + "first": 20, + "last": 21, + "label": "NP" + }, + { + "first": 23, + "last": 24, + "label": "NP" + }, + { + "first": 22, + "last": 24, + "label": "PP-LOC" + }, + { + "first": 19, + "last": 24, + "label": "VP" + }, + { + "first": 17, + "last": 24, + "label": "S-HLN" + }, + { + "first": 27, + "last": 29, + "label": "NP-TMP" + }, + { + "first": 15, + "last": 30, + "label": "NP" + }, + { + "first": 1, + "last": 30, + "label": "NP" + }, + { + "first": 0, + "last": 30, + "label": "PP-LOC" + }, + { + "first": 32, + "last": 33, + "label": "NP" + }, + { + "first": 35, + "last": 35, + "label": "NP" + }, + { + "first": 34, + "last": 35, + "label": "PP" + }, + { + "first": 32, + "last": 35, + "label": "NP" + }, + { + "first": 39, + "last": 40, + "label": "NP-LGS" + }, + { + "first": 38, + "last": 40, + "label": "PP" + }, + { + "first": 37, + "last": 40, + "label": "VP" + }, + { + "first": 32, + "last": 41, + "label": "NP-SBJ-2" + }, + { + "first": 43, + "last": 43, + "label": "ADVP-MNR" + }, + { + "first": 46, + "last": 47, + "label": "NP" + }, + { + "first": 45, + "last": 47, + "label": "PP-CLR" + }, + { + "first": 44, + "last": 47, + "label": "VP" + }, + { + "first": 42, + "last": 47, + "label": "VP" + }, + { + "first": 0, + "last": 48, + "label": "S" + } + ] + }, + { + "tokens": [ + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Ms.", + "ner": "O", + "id": 0 + }, + { + "head": 1, + "dep": "nsubj", + "tag": "NNP", + "orth": 
"Haag", + "ner": "U-PERSON", + "id": 1 + }, + { + "head": 0, + "dep": "root", + "tag": "VBZ", + "orth": "plays", + "ner": "O", + "id": 2 + }, + { + "head": -1, + "dep": "dobj", + "tag": "NNP", + "orth": "Elianti", + "ner": "U-PERSON", + "id": 3 + }, + { + "head": -2, + "dep": "punct", + "tag": ".", + "orth": ".", + "ner": "O", + "id": 4 + } + ], + "brackets": [ + { + "first": 0, + "last": 1, + "label": "NP-SBJ" + }, + { + "first": 3, + "last": 3, + "label": "NP" + }, + { + "first": 2, + "last": 3, + "label": "VP" + }, + { + "first": 0, + "last": 4, + "label": "S" + } + ] + } + ] + } + ] + } + ] diff --git a/examples/training/vocab-data.jsonl b/examples/training/vocab-data.jsonl new file mode 100644 index 000000000..2f129dd30 --- /dev/null +++ b/examples/training/vocab-data.jsonl @@ -0,0 +1,21 @@ +{"lang": "en", "settings": {"oov_prob": -20.502029418945312}} +{"orth": ".", "id": 1, "lower": ".", "norm": ".", "shape": ".", "prefix": ".", "suffix": ".", "length": 1, "cluster": "8", "prob": -3.0678977966308594, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": ",", "id": 2, "lower": ",", "norm": ",", "shape": ",", "prefix": ",", "suffix": ",", "length": 1, "cluster": "4", "prob": -3.4549596309661865, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "the", "id": 3, "lower": "the", "norm": "the", "shape": "xxx", "prefix": "t", "suffix": "the", "length": 3, "cluster": "11", "prob": -3.528766632080078, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "I", "id": 4, "lower": "i", "norm": "I", "shape": "X", "prefix": "I", "suffix": "I", "length": 1, "cluster": "346", "prob": -3.791565179824829, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "to", "id": 5, "lower": "to", "norm": "to", "shape": "xx", "prefix": "t", "suffix": "to", "length": 2, "cluster": "12", "prob": -3.8560216426849365, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "a", "id": 6, "lower": "a", "norm": "a", "shape": "x", "prefix": "a", "suffix": "a", "length": 1, "cluster": "19", "prob": -3.92978835105896, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, 
"like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "and", "id": 7, "lower": "and", "norm": "and", "shape": "xxx", "prefix": "a", "suffix": "and", "length": 3, "cluster": "20", "prob": -4.113108158111572, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "of", "id": 8, "lower": "of", "norm": "of", "shape": "xx", "prefix": "o", "suffix": "of", "length": 2, "cluster": "28", "prob": -4.27587366104126, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "you", "id": 9, "lower": "you", "norm": "you", "shape": "xxx", "prefix": "y", "suffix": "you", "length": 3, "cluster": "602", "prob": -4.373791217803955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "it", "id": 10, "lower": "it", "norm": "it", "shape": "xx", "prefix": "i", "suffix": "it", "length": 2, "cluster": "474", "prob": -4.388050079345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "is", "id": 11, "lower": "is", "norm": "is", "shape": "xx", "prefix": "i", "suffix": "is", "length": 2, "cluster": "762", "prob": -4.457748889923096, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "that", "id": 12, "lower": "that", "norm": "that", "shape": "xxxx", "prefix": "t", "suffix": "hat", "length": 4, "cluster": "84", "prob": -4.464504718780518, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "\n\n", "id": 0, "lower": "\n\n", "norm": "\n\n", "shape": "\n\n", "prefix": "\n", "suffix": "\n\n", "length": 2, "cluster": "0", "prob": -4.606560707092285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "in", "id": 13, "lower": "in", "norm": "in", "shape": "xx", "prefix": "i", "suffix": "in", "length": 
2, "cluster": "60", "prob": -4.619071960449219, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "'s", "id": 14, "lower": "'s", "norm": "'s", "shape": "'x", "prefix": "'", "suffix": "'s", "length": 2, "cluster": "52", "prob": -4.830559253692627, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "n't", "id": 15, "lower": "n't", "norm": "n't", "shape": "x'x", "prefix": "n", "suffix": "n't", "length": 3, "cluster": "74", "prob": -4.859938621520996, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "for", "id": 16, "lower": "for", "norm": "for", "shape": "xxx", "prefix": "f", "suffix": "for", "length": 3, "cluster": "508", "prob": -4.8801093101501465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": "\"", "id": 17, "lower": "\"", "norm": "\"", "shape": "\"", "prefix": "\"", "suffix": "\"", "length": 1, "cluster": "0", "prob": -5.02677583694458, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true} +{"orth": "?", "id": 18, "lower": "?", "norm": "?", "shape": "?", "prefix": "?", "suffix": "?", "length": 1, "cluster": "0", "prob": -5.05924654006958, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} +{"orth": " ", "id": 0, "lower": " ", "norm": " ", "shape": " ", "prefix": " ", "suffix": " ", "length": 1, "cluster": "0", "prob": -5.129165172576904, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false} diff --git a/examples/twitter_filter.py b/examples/twitter_filter.py deleted file mode 100644 index b6e4e4e83..000000000 --- a/examples/twitter_filter.py +++ /dev/null @@ -1,36 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function -import plac -import codecs -import pathlib -import random - -import twython -import spacy.en - -import _handler - - 
-class Connection(twython.TwythonStreamer): - def __init__(self, keys_dir, nlp, query): - keys_dir = pathlib.Path(keys_dir) - read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip() - api_key = map(read, ['key', 'secret', 'token', 'token_secret']) - twython.TwythonStreamer.__init__(self, *api_key) - self.nlp = nlp - self.query = query - - def on_success(self, data): - _handler.handle_tweet(self.nlp, data, self.query) - if random.random() >= 0.1: - reload(_handler) - - -def main(keys_dir, term): - nlp = spacy.en.English() - twitter = Connection(keys_dir, nlp, term) - twitter.statuses.filter(track=term, language='en') - - -if __name__ == '__main__': - plac.call(main) diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py new file mode 100644 index 000000000..159250098 --- /dev/null +++ b/examples/vectors_fast_text.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# coding: utf8 +"""Load vectors for a language trained using fastText +https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md +""" +from __future__ import unicode_literals +import plac +import numpy + +from spacy.language import Language + + +@plac.annotations( + vectors_loc=("Path to vectors", "positional", None, str)) +def main(vectors_loc): + nlp = Language() + + with open(vectors_loc, 'rb') as file_: + header = file_.readline() + nr_row, nr_dim = header.split() + nlp.vocab.clear_vectors(int(nr_dim)) + for line in file_: + line = line.decode('utf8') + pieces = line.split() + word = pieces[0] + vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') + nlp.vocab.set_vector(word, vector) + doc = nlp(u'class colspan') + print(doc[0].similarity(doc[1])) + + +if __name__ == '__main__': + plac.call(main) diff --git a/fabfile.py b/fabfile.py index cfa80ead5..2894fe477 100644 --- a/fabfile.py +++ b/fabfile.py @@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV) def env(lang='python2.7'): if path.exists(VENV_DIR): local('rm -rf {env}'.format(env=VENV_DIR)) + local('pip install virtualenv') local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR)) @@ -32,6 +33,10 @@ def make(): local('pip install -r requirements.txt') local('python setup.py build_ext --inplace') +def sdist(): + with virtualenv(VENV_DIR): + with lcd(path.dirname(__file__)): + local('python setup.py sdist') def clean(): with lcd(path.dirname(__file__)): diff --git a/requirements.txt b/requirements.txt index aae0f9388..01e41c993 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -cython<0.24 +cython>=0.24,<0.27.0 pathlib numpy>=1.7 cymem>=1.30,<1.32 preshed>=1.0.0,<2.0.0 -thinc>=6.8.0,<6.9.0 +thinc>=6.10.0,<6.11.0 murmurhash>=0.28,<0.29 plac<1.0.0,>=0.9.6 six @@ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0 regex==2017.4.5 ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 -pip>=9.0.0,<10.0.0 mock>=2.0.0,<3.0.0 msgpack-python msgpack-numpy +html5lib==1.0b8 diff --git a/setup.py b/setup.py index 6a22f4076..727df5e4e 100755 --- a/setup.py +++ b/setup.py @@ -24,25 +24,19 @@ MOD_NAMES = [ 'spacy.vocab', 'spacy.attrs', 'spacy.morphology', - 'spacy.tagger', 'spacy.pipeline', 'spacy.syntax.stateclass', 'spacy.syntax._state', 'spacy.syntax._beam_utils', 'spacy.tokenizer', - 'spacy._cfile', - 'spacy.syntax.parser', 'spacy.syntax.nn_parser', - 'spacy.syntax.beam_parser', 'spacy.syntax.nonproj', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', - 'spacy.syntax._parse_features', 'spacy.gold', 'spacy.tokens.doc', 'spacy.tokens.span', 'spacy.tokens.token', - 'spacy.cfile', 'spacy.matcher', 
'spacy.syntax.ner', 'spacy.symbols', @@ -53,7 +47,8 @@ MOD_NAMES = [ COMPILE_OPTIONS = { 'msvc': ['/Ox', '/EHsc'], 'mingw32' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'], - 'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] + 'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function', + '-march=native'] } @@ -66,7 +61,7 @@ LINK_OPTIONS = { # I don't understand this very well yet. See Issue #267 # Fingers crossed! -USE_OPENMP_DEFAULT = '1' if sys.platform != 'darwin' else None +USE_OPENMP_DEFAULT = '0' if sys.platform != 'darwin' else None if os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1': if sys.platform == 'darwin': COMPILE_OPTIONS['other'].append('-fopenmp') @@ -195,9 +190,8 @@ def setup_package(): 'murmurhash>=0.28,<0.29', 'cymem>=1.30,<1.32', 'preshed>=1.0.0,<2.0.0', - 'thinc>=6.8.0,<6.9.0', + 'thinc>=6.10.0,<6.11.0', 'plac<1.0.0,>=0.9.6', - 'pip>=9.0.0,<10.0.0', 'six', 'pathlib', 'ujson>=1.35', diff --git a/spacy/__init__.py b/spacy/__init__.py index 1cb7c0cbd..9acc566ad 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals from .cli.info import info as cli_info from .glossary import explain -from .deprecated import resolve_load_name from .about import __version__ from . import util def load(name, **overrides): + from .deprecated import resolve_load_name name = resolve_load_name(name, **overrides) return util.load_model(name, **overrides) diff --git a/spacy/__main__.py b/spacy/__main__.py index d02242d68..f4b5e6715 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,13 +1,13 @@ # coding: utf8 from __future__ import print_function # NB! This breaks in plac on Python 2!! -#from __future__ import unicode_literals +# from __future__ import unicode_literals if __name__ == '__main__': import plac import sys from spacy.cli import download, link, info, package, train, convert, model - from spacy.cli import profile + from spacy.cli import vocab, profile, evaluate, validate from spacy.util import prints commands = { @@ -15,10 +15,13 @@ if __name__ == '__main__': 'link': link, 'info': info, 'train': train, + 'evaluate': evaluate, 'convert': convert, 'package': package, 'model': model, + 'vocab': vocab, 'profile': profile, + 'validate': validate } if len(sys.argv) == 1: prints(', '.join(commands), title="Available commands", exits=1) diff --git a/spacy/_cfile.pxd b/spacy/_cfile.pxd deleted file mode 100644 index cb0077587..000000000 --- a/spacy/_cfile.pxd +++ /dev/null @@ -1,26 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE -from cymem.cymem cimport Pool - -cdef class CFile: - cdef FILE* fp - cdef bint is_open - cdef Pool mem - cdef int size # For compatibility with subclass - cdef int _capacity # For compatibility with subclass - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * - - - -cdef class StringCFile(CFile): - cdef unsigned char* data - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/spacy/_cfile.pyx b/spacy/_cfile.pyx deleted file mode 100644 index ceebe2e59..000000000 --- a/spacy/_cfile.pyx +++ /dev/null @@ -1,88 +0,0 @@ -from libc.stdio 
cimport fopen, fclose, fread, fwrite, FILE -from libc.string cimport memcpy - - -cdef class CFile: - def __init__(self, loc, mode, on_open_error=None): - if isinstance(mode, unicode): - mode_str = mode.encode('ascii') - else: - mode_str = mode - if hasattr(loc, 'as_posix'): - loc = loc.as_posix() - self.mem = Pool() - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - self.fp = fopen(bytes_loc, mode_str) - if self.fp == NULL: - if on_open_error is not None: - on_open_error() - else: - raise IOError("Could not open binary file %s" % bytes_loc) - self.is_open = True - - def __dealloc__(self): - if self.is_open: - fclose(self.fp) - - def close(self): - fclose(self.fp) - self.is_open = False - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - st = fread(dest, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: - st = fwrite(src, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) - - -cdef class StringCFile: - def __init__(self, mode, bytes data=b'', on_open_error=None): - self.mem = Pool() - self.is_open = 'w' in mode - self._capacity = max(len(data), 8) - self.size = len(data) - self.data = self.mem.alloc(1, self._capacity) - for i in range(len(data)): - self.data[i] = data[i] - - def close(self): - self.is_open = False - - def string_data(self): - return (self.data-self.size)[:self.size] - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - memcpy(dest, self.data, elem_size * number) - self.data += elem_size * number - - cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1: - write_size = number * elem_size - if (self.size + write_size) >= self._capacity: - self._capacity = (self.size + write_size) * 2 - self.data = self.mem.realloc(self.data, self._capacity) - memcpy(&self.data[self.size], src, elem_size * number) - self.size += write_size - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) diff --git a/spacy/_ml.py b/spacy/_ml.py index 003541f4b..c99f840b7 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -1,43 +1,42 @@ -import ujson -from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.neural import Model, Maxout, Softmax, Affine -from thinc.neural._classes.hash_embed import HashEmbed -from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.util import get_array_module -import random -import cytoolz +# coding: utf8 +from __future__ import unicode_literals + +import numpy +from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu +from thinc.i2v import HashEmbed, StaticVectors +from thinc.t2t import ExtractWindow, ParametricAttention +from thinc.t2v import Pooling, sum_pool +from thinc.misc import Residual +from thinc.misc import LayerNorm as LN +from thinc.api import 
add, layerize, chain, clone, concatenate, with_flatten +from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths +from thinc.api import uniqued, wrap, noop +from thinc.linear.linear import LinearModel +from thinc.neural.ops import NumpyOps, CupyOps +from thinc.neural.util import get_array_module, copy_array +from thinc.neural._lsuv import svd_orthonormal -from thinc.neural._classes.convolution import ExtractWindow -from thinc.neural._classes.static_vectors import StaticVectors -from thinc.neural._classes.batchnorm import BatchNorm as BN -from thinc.neural._classes.layernorm import LayerNorm as LN -from thinc.neural._classes.resnet import Residual -from thinc.neural import ReLu -from thinc.neural._classes.selu import SELU from thinc import describe from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.neural._classes.affine import _set_dimensions_if_needed -from thinc.api import FeatureExtracter, with_getitem -from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool -from thinc.neural._classes.attention import ParametricAttention -from thinc.linear.linear import LinearModel -from thinc.api import uniqued, wrap, flatten_add_lengths +import thinc.extra.load_nlp +from thinc.neural._lsuv import svd_orthonormal - -from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER -from .tokens.doc import Doc +from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE from . import util -import numpy -import io + +VECTORS_KEY = 'spacy_pretrained_vectors' @layerize def _flatten_add_lengths(seqs, pad=0, drop=0.): ops = Model.ops lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') + def finish_update(d_X, sgd=None): return ops.unflatten(d_X, lengths, pad=pad) + X = ops.flatten(seqs, pad=pad) return (X, lengths), finish_update @@ -51,33 +50,14 @@ def _logistic(X, drop=0.): X = xp.minimum(X, 10., X) X = xp.maximum(X, -10., X) Y = 1. / (1. + xp.exp(-X)) + def logistic_bwd(dY, sgd=None): dX = dY * (Y * (1-Y)) return dX + return Y, logistic_bwd -@layerize -def add_tuples(X, drop=0.): - """Give inputs of sequence pairs, where each sequence is (vals, length), - sum the values, returning a single sequence. - - If input is: - ((vals1, length), (vals2, length) - Output is: - (vals1+vals2, length) - - vals are a single tensor for the whole batch. 
- """ - (vals1, length1), (vals2, length2) = X - assert length1 == length2 - - def add_tuples_bwd(dY, sgd=None): - return (dY, dY) - - return (vals1+vals2, length), add_tuples_bwd - - def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) @@ -90,7 +70,6 @@ def _zero_init(model): @layerize def _preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] - keys = [a[:, 0] for a in keys] ops = Model.ops lengths = ops.asarray([arr.shape[0] for arr in keys]) keys = ops.xp.concatenate(keys) @@ -98,78 +77,25 @@ def _preprocess_doc(docs, drop=0.): return (keys, vals, lengths), None -def _init_for_precomputed(W, ops): - if (W**2).sum() != 0.: - return - reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2])) - ops.xavier_uniform_init(reshaped) - W[:] = reshaped.reshape(W.shape) - - -@describe.on_data(_set_dimensions_if_needed) +@describe.on_data(_set_dimensions_if_needed, + lambda model, X, y: model.init_weights(model)) @describe.attributes( nI=Dimension("Input size"), nF=Dimension("Number of features"), nO=Dimension("Output size"), + nP=Dimension("Maxout pieces"), W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nI), - lambda W, ops: _init_for_precomputed(W, ops)), - b=Biases("Bias vector", - lambda obj: (obj.nO,)), - d_W=Gradient("W"), - d_b=Gradient("b") -) -class PrecomputableAffine(Model): - def __init__(self, nO=None, nI=None, nF=None, **kwargs): - Model.__init__(self, **kwargs) - self.nO = nO - self.nI = nI - self.nF = nF - - def begin_update(self, X, drop=0.): - # X: (b, i) - # Yf: (b, f, i) - # dY: (b, o) - # dYf: (b, f, o) - #Yf = numpy.einsum('bi,foi->bfo', X, self.W) - Yf = self.ops.xp.tensordot( - X, self.W, axes=[[1], [2]]) - Yf += self.b - def backward(dY_ids, sgd=None): - tensordot = self.ops.xp.tensordot - dY, ids = dY_ids - Xf = X[ids] - - #dXf = numpy.einsum('bo,foi->bfi', dY, self.W) - dXf = tensordot(dY, self.W, axes=[[1], [1]]) - #dW = numpy.einsum('bo,bfi->ofi', dY, Xf) - dW = tensordot(dY, Xf, axes=[[0], [0]]) - # ofi -> foi - self.d_W += dW.transpose((1, 0, 2)) - self.d_b += dY.sum(axis=0) - - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return dXf - return Yf, backward - - -@describe.on_data(_set_dimensions_if_needed) -@describe.attributes( - nI=Dimension("Input size"), - nF=Dimension("Number of features"), - nP=Dimension("Number of pieces"), - nO=Dimension("Output size"), - W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI), - lambda W, ops: ops.xavier_uniform_init(W)), + lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)), b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)), + pad=Synapses("Pad", + lambda obj: (1, obj.nF, obj.nO, obj.nP), + lambda M, ops: ops.normal_init(M, 1.)), d_W=Gradient("W"), - d_b=Gradient("b") -) -class PrecomputableMaxouts(Model): - def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs): + d_pad=Gradient("pad"), + d_b=Gradient("b")) +class PrecomputableAffine(Model): + def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs): Model.__init__(self, **kwargs) self.nO = nO self.nP = nP @@ -177,142 +103,195 @@ class PrecomputableMaxouts(Model): self.nF = nF def begin_update(self, X, drop=0.): - # X: (b, i) - # Yfp: (b, f, o, p) - # Xf: (f, b, i) - # dYp: (b, o, p) - # W: (f, o, p, i) - # b: (o, p) + Yf = self.ops.xp.dot(X, + self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T) + Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP)) + Yf = self._add_padding(Yf) - # bi,opfi->bfop - # bop,fopi->bfi - # bop,fbi->opfi : fopi - - 
tensordot = self.ops.xp.tensordot - ascontiguous = self.ops.xp.ascontiguousarray - - Yfp = tensordot(X, self.W, axes=[[1], [3]]) - Yfp += self.b - - def backward(dYp_ids, sgd=None): - dYp, ids = dYp_ids + def backward(dY_ids, sgd=None): + dY, ids = dY_ids + dY, ids = self._backprop_padding(dY, ids) Xf = X[ids] + Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI)) - dXf = tensordot(dYp, self.W, axes=[[1, 2], [1,2]]) - dW = tensordot(dYp, Xf, axes=[[0], [0]]) + self.d_b += dY.sum(axis=0) + dY = dY.reshape((dY.shape[0], self.nO*self.nP)) - self.d_W += dW.transpose((2, 0, 1, 3)) - self.d_b += dYp.sum(axis=0) + Wopfi = self.W.transpose((1, 2, 0, 3)) + Wopfi = self.ops.xp.ascontiguousarray(Wopfi) + Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI)) + dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi) + + # Reuse the buffer + dWopfi = Wopfi; dWopfi.fill(0.) + self.ops.xp.dot(dY.T, Xf, out=dWopfi) + dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI)) + # (o, p, f, i) --> (f, o, p, i) + self.d_W += dWopfi.transpose((2, 0, 1, 3)) if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) - return dXf - return Yfp, backward + return dXf.reshape((dXf.shape[0], self.nF, self.nI)) + return Yf, backward + + def _add_padding(self, Yf): + Yf_padded = self.ops.xp.vstack((self.pad, Yf)) + return Yf_padded[1:] + def _backprop_padding(self, dY, ids): + for i in range(ids.shape[0]): + for j in range(ids.shape[1]): + if ids[i, j] < 0: + self.d_pad[0, j] += dY[i, j] + return dY, ids -def drop_layer(layer, factor=2.): - def drop_layer_fwd(X, drop=0.): - if drop <= 0.: - return layer.begin_update(X, drop=drop) - else: - coinflip = layer.ops.xp.random.random() - if (coinflip / factor) >= drop: - return layer.begin_update(X, drop=drop) + @staticmethod + def init_weights(model): + '''This is like the 'layer sequential unit variance', but instead + of taking the actual inputs, we randomly generate whitened data. + + Why's this all so complicated? We have a huge number of inputs, + and the maxout unit makes guessing the dynamics tricky. Instead + we set the maxout weights to values that empirically result in + whitened outputs given whitened inputs. + ''' + if (model.W**2).sum() != 0.: + return + model.ops.normal_init(model.W, model.nF * model.nI, inplace=True) + + ids = numpy.zeros((5000, model.nF), dtype='i') + ids += numpy.asarray(numpy.random.uniform(0, 1000, ids.shape), dtype='i') + tokvecs = numpy.zeros((5000, model.nI), dtype='f') + tokvecs += numpy.random.normal(loc=0., scale=1., + size=tokvecs.size).reshape(tokvecs.shape) + + def predict(ids, tokvecs): + # nS ids. 
nW tokvecs + hiddens = model(tokvecs) # (nW, f, o, p) + # need nS vectors + vectors = model.ops.allocate((ids.shape[0], model.nO, model.nP)) + for i, feats in enumerate(ids): + for j, id_ in enumerate(feats): + vectors[i] += hiddens[id_, j] + vectors += model.b + if model.nP >= 2: + return model.ops.maxout(vectors)[0] else: - return X, lambda dX, sgd=None: dX + return vectors * (vectors >= 0) - model = wrap(drop_layer_fwd, layer) - model.predict = layer - return model + tol_var = 0.01 + tol_mean = 0.01 + t_max = 10 + t_i = 0 + for t_i in range(t_max): + acts1 = predict(ids, tokvecs) + var = numpy.var(acts1) + mean = numpy.mean(acts1) + if abs(var - 1.0) >= tol_var: + model.W /= numpy.sqrt(var) + elif abs(mean) >= tol_mean: + model.b -= mean + else: + break -def Tok2Vec(width, embed_size, preprocess=None): +def link_vectors_to_models(vocab): + vectors = vocab.vectors + ops = Model.ops + for word in vocab: + if word.orth in vectors.key2row: + word.rank = vectors.key2row[word.orth] + else: + word.rank = 0 + data = ops.asarray(vectors.data) + # Set an entry here, so that vectors are accessed by StaticVectors + # (unideal, I know) + thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data + + +def Tok2Vec(width, embed_size, **kwargs): + pretrained_dims = kwargs.get('pretrained_dims', 0) + cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2) cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): - norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') - prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') - suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') - shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') + with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, + '+': add, '*': reapply}): + norm = HashEmbed(width, embed_size, column=cols.index(NORM), + name='embed_norm') + prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), + name='embed_prefix') + suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), + name='embed_suffix') + shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), + name='embed_shape') + if pretrained_dims is not None and pretrained_dims >= 1: + glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID)) + + embed = uniqued( + (glove | norm | prefix | suffix | shape) + >> LN(Maxout(width, width*5, pieces=3)), column=5) + else: + embed = uniqued( + (norm | prefix | suffix | shape) + >> LN(Maxout(width, width*4, pieces=3)), column=5) + + convolution = Residual( + ExtractWindow(nW=1) + >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces)) + ) - embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3)) tok2vec = ( - with_flatten( - asarray(Model.ops, dtype='uint64') - >> uniqued(embed, column=5) - >> Residual( - (ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) - ) ** 4, pad=4 + FeatureExtracter(cols) + >> with_flatten( + embed + >> convolution ** 4, pad=4 ) ) - if preprocess not in (False, None): - tok2vec = preprocess >> tok2vec # Work around thinc API limitations :(. 
TODO: Revise in Thinc 7 tok2vec.nO = width tok2vec.embed = embed return tok2vec +def reapply(layer, n_times): + def reapply_fwd(X, drop=0.): + backprops = [] + for i in range(n_times): + Y, backprop = layer.begin_update(X, drop=drop) + X = Y + backprops.append(backprop) + + def reapply_bwd(dY, sgd=None): + dX = None + for backprop in reversed(backprops): + dY = backprop(dY, sgd=sgd) + if dX is None: + dX = dY + else: + dX += dY + return dX + + return Y, reapply_bwd + return wrap(reapply_fwd, layer) + + def asarray(ops, dtype): def forward(X, drop=0.): return ops.asarray(X, dtype=dtype), None return layerize(forward) -def foreach(layer): - def forward(Xs, drop=0.): - results = [] - backprops = [] - for X in Xs: - result, bp = layer.begin_update(X, drop=drop) - results.append(result) - backprops.append(bp) - def backward(d_results, sgd=None): - dXs = [] - for d_result, backprop in zip(d_results, backprops): - dXs.append(backprop(d_result, sgd)) - return dXs - return results, backward - model = layerize(forward) - model._layers.append(layer) - return model - - -def rebatch(size, layer): - ops = layer.ops - def forward(X, drop=0.): - if X.shape[0] < size: - return layer.begin_update(X) - parts = _divide_array(X, size) - results, bp_results = zip(*[layer.begin_update(p, drop=drop) - for p in parts]) - y = ops.flatten(results) - def backward(dy, sgd=None): - d_parts = [bp(y, sgd=sgd) for bp, y in - zip(bp_results, _divide_array(dy, size))] - try: - dX = ops.flatten(d_parts) - except TypeError: - dX = None - except ValueError: - dX = None - return dX - return y, backward - model = layerize(forward) - model._layers.append(layer) - return model - - def _divide_array(X, size): parts = [] index = 0 while index < len(X): - parts.append(X[index : index + size]) + parts.append(X[index:index + size]) index += size return parts def get_col(idx): assert idx >= 0, idx + def forward(X, drop=0.): assert idx >= 0, idx if isinstance(X, numpy.ndarray): @@ -320,30 +299,28 @@ def get_col(idx): else: ops = CupyOps() output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) + def backward(y, sgd=None): assert idx >= 0, idx dX = ops.allocate(X.shape) dX[:, idx] += y return dX + return output, backward + return layerize(forward) -def zero_init(model): - def _hook(self, X, y=None): - self.W.fill(0) - model.on_data_hooks.append(_hook) - return model - - def doc2feats(cols=None): if cols is None: cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + def forward(docs, drop=0.): feats = [] for doc in docs: feats.append(doc.to_array(cols)) return feats, None + model = layerize(forward) model.cols = cols return model @@ -357,68 +334,14 @@ def print_shape(prefix): @layerize def get_token_vectors(tokens_attrs_vectors, drop=0.): - ops = Model.ops tokens, attrs, vectors = tokens_attrs_vectors + def backward(d_output, sgd=None): return (tokens, d_output) + return vectors, backward -def fine_tune(embedding, combine=None): - if combine is not None: - raise NotImplementedError( - "fine_tune currently only supports addition. 
Set combine=None") - def fine_tune_fwd(docs_tokvecs, drop=0.): - docs, tokvecs = docs_tokvecs - - lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') - - vecs, bp_vecs = embedding.begin_update(docs, drop=drop) - flat_tokvecs = embedding.ops.flatten(tokvecs) - flat_vecs = embedding.ops.flatten(vecs) - output = embedding.ops.unflatten( - (model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths) - - def fine_tune_bwd(d_output, sgd=None): - flat_grad = model.ops.flatten(d_output) - model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum() - model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum() - - bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd) - if sgd is not None: - sgd(model._mem.weights, model._mem.gradient, key=model.id) - return [d_o * model.mix[0] for d_o in d_output] - return output, fine_tune_bwd - - def fine_tune_predict(docs_tokvecs): - docs, tokvecs = docs_tokvecs - vecs = embedding(docs) - return [model.mix[0]*tv+model.mix[1]*v - for tv, v in zip(tokvecs, vecs)] - - model = wrap(fine_tune_fwd, embedding) - model.mix = model._mem.add((model.id, 'mix'), (2,)) - model.mix.fill(0.5) - model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) - model.predict = fine_tune_predict - return model - - -@layerize -def flatten(seqs, drop=0.): - if isinstance(seqs[0], numpy.ndarray): - ops = NumpyOps() - elif hasattr(CupyOps.xp, 'ndarray') and isinstance(seqs[0], CupyOps.xp.ndarray): - ops = CupyOps() - else: - raise ValueError("Unable to flatten sequence of type %s" % type(seqs[0])) - lengths = [len(seq) for seq in seqs] - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths) - X = ops.xp.vstack(seqs) - return X, finish_update - - @layerize def logistic(X, drop=0.): xp = get_array_module(X) @@ -428,9 +351,11 @@ def logistic(X, drop=0.): X = xp.minimum(X, 10., X) X = xp.maximum(X, -10., X) Y = 1. / (1. 
+ xp.exp(-X)) + def logistic_bwd(dY, sgd=None): dX = dY * (Y * (1-Y)) return dX + return Y, logistic_bwd @@ -440,42 +365,47 @@ def zero_init(model): model.on_data_hooks.append(_zero_init_impl) return model + @layerize def preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] - keys = [a[:, 0] for a in keys] ops = Model.ops lengths = ops.asarray([arr.shape[0] for arr in keys]) keys = ops.xp.concatenate(keys) vals = ops.allocate(keys.shape[0]) + 1 return (keys, vals, lengths), None + def getitem(i): def getitem_fwd(X, drop=0.): return X[i], None return layerize(getitem_fwd) -def build_tagger_model(nr_class, token_vector_width, **cfg): - embed_size = util.env_opt('embed_size', 7500) - with Model.define_operators({'>>': chain, '+': add}): - # Input: (doc, tensor) tuples - private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) +def build_tagger_model(nr_class, **cfg): + embed_size = util.env_opt('embed_size', 7000) + if 'token_vector_width' in cfg: + token_vector_width = cfg['token_vector_width'] + else: + token_vector_width = util.env_opt('token_vector_width', 128) + pretrained_dims = cfg.get('pretrained_dims', 0) + with Model.define_operators({'>>': chain, '+': add}): + if 'tok2vec' in cfg: + tok2vec = cfg['tok2vec'] + else: + tok2vec = Tok2Vec(token_vector_width, embed_size, + pretrained_dims=pretrained_dims) model = ( - fine_tune(private_tok2vec) - >> with_flatten( - Maxout(token_vector_width, token_vector_width) - >> Softmax(nr_class, token_vector_width) - ) + tok2vec + >> with_flatten(Softmax(nr_class, token_vector_width)) ) model.nI = None + model.tok2vec = tok2vec return model @layerize def SpacyVectors(docs, drop=0.): - xp = get_array_module(docs[0].vocab.vectors.data) - width = docs[0].vocab.vectors.data.shape[1] batch = [] for doc in docs: indices = numpy.zeros((len(doc),), dtype='i') @@ -489,40 +419,16 @@ def SpacyVectors(docs, drop=0.): return batch, None -def foreach(layer, drop_factor=1.0): - '''Map a layer across elements in a list''' - def foreach_fwd(Xs, drop=0.): - drop *= drop_factor - ys = [] - backprops = [] - for X in Xs: - y, bp_y = layer.begin_update(X, drop=drop) - ys.append(y) - backprops.append(bp_y) - def foreach_bwd(d_ys, sgd=None): - d_Xs = [] - for d_y, bp_y in zip(d_ys, backprops): - if bp_y is not None and bp_y is not None: - d_Xs.append(d_y, sgd=sgd) - else: - d_Xs.append(None) - return d_Xs - return ys, foreach_bwd - model = wrap(foreach_fwd, layer) - return model - - def build_text_classifier(nr_class, width=64, **cfg): nr_vector = cfg.get('nr_vector', 5000) + pretrained_dims = cfg.get('pretrained_dims', 0) with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}): if cfg.get('low_data'): model = ( SpacyVectors >> flatten_add_lengths - >> with_getitem(0, - Affine(width, 300) - ) + >> with_getitem(0, Affine(width, pretrained_dims)) >> ParametricAttention(width) >> Pooling(sum_pool) >> Residual(ReLu(width, width)) ** 2 @@ -531,7 +437,6 @@ def build_text_classifier(nr_class, width=64, **cfg): ) return model - lower = HashEmbed(width, nr_vector, column=1) prefix = HashEmbed(width//2, nr_vector, column=2) suffix = HashEmbed(width//2, nr_vector, column=3) @@ -548,18 +453,24 @@ def build_text_classifier(nr_class, width=64, **cfg): ) ) - static_vectors = ( - SpacyVectors - >> with_flatten(Affine(width, 300)) - ) - - cnn_model = ( + if pretrained_dims: + static_vectors = ( + SpacyVectors + >> with_flatten(Affine(width, pretrained_dims)) + ) # TODO Make concatenate support lists - 
concatenate_lists(trained_vectors, static_vectors) + vectors = concatenate_lists(trained_vectors, static_vectors) + vectors_width = width*2 + else: + vectors = trained_vectors + vectors_width = width + static_vectors = None + cnn_model = ( + vectors >> with_flatten( - LN(Maxout(width, width*2)) + LN(Maxout(width, vectors_width)) >> Residual( - (ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3))) + (ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) ) ** 2, pad=2 ) >> flatten_add_lengths @@ -579,39 +490,44 @@ def build_text_classifier(nr_class, width=64, **cfg): >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) >> logistic ) - + model.nO = nr_class model.lsuv = False return model + @layerize def flatten(seqs, drop=0.): ops = Model.ops lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') + def finish_update(d_X, sgd=None): return ops.unflatten(d_X, lengths, pad=0) + X = ops.flatten(seqs, pad=0) return X, finish_update -def concatenate_lists(*layers, **kwargs): # pragma: no cover - '''Compose two or more models `f`, `g`, etc, such that their outputs are +def concatenate_lists(*layers, **kwargs): # pragma: no cover + """Compose two or more models `f`, `g`, etc, such that their outputs are concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - ''' + """ if not layers: return noop() drop_factor = kwargs.get('drop_factor', 1.0) ops = layers[0].ops layers = [chain(layer, flatten) for layer in layers] concat = concatenate(*layers) + def concatenate_lists_fwd(Xs, drop=0.): drop *= drop_factor lengths = ops.asarray([len(X) for X in Xs], dtype='i') flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) ys = ops.unflatten(flat_y, lengths) + def concatenate_lists_bwd(d_ys, sgd=None): return bp_flat_y(ops.flatten(d_ys), sgd=sgd) + return ys, concatenate_lists_bwd + model = wrap(concatenate_lists_fwd, concat) return model - - diff --git a/spacy/about.py b/spacy/about.py index d566fbb1f..6f029bd9d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,15 +3,16 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy-nightly' -__version__ = '2.0.0a13' +__version__ = '2.0.0a18' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Explosion AI' __email__ = 'contact@explosion.ai' __license__ = 'MIT' +__release__ = False -__docs_models__ = 'https://spacy.io/docs/usage/models' +__docs_models__ = 'https://alpha.spacy.io/usage/models' __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' -__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' +__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-nightly.json' __model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/develop/templates/model/' diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index a8ee9cac0..74397fa64 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -1,5 +1,5 @@ # Reserve 64 values for flag features -cpdef enum attr_id_t: +cdef enum attr_id_t: NULL_ATTR IS_ALPHA IS_ASCII diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index ba95e1e72..8113ffebe 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -94,23 +94,19 @@ IDS = { # ATTR IDs, in order of the symbol NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] +locals().update(IDS) 
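Illustrative sketch, not part of the patch: the `locals().update(IDS)` line above is what exposes each attribute name in `IDS` as a module-level integer constant, so downstream code can refer to attributes by ID rather than by string. The snippet assumes an installed English model; the constant values themselves are internal implementation details.

from spacy.attrs import ORTH, LOWER, IS_PUNCT  # constants created by locals().update(IDS)
import spacy

nlp = spacy.load('en')  # assumes an English model is installed
doc = nlp(u'Hello, world!')
arr = doc.to_array([ORTH, LOWER, IS_PUNCT])  # one column per requested attribute ID
print(arr.shape)  # (number of tokens, 3)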
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): """ Normalize a dictionary of attributes, converting them to ints. - Arguments: - stringy_attrs (dict): - Dictionary keyed by attribute string names. Values can be ints or strings. - - strings_map (StringStore): - Defaults to None. If provided, encodes string values into ints. - - Returns: - inty_attrs (dict): - Attributes dictionary with keys and optionally values converted to - ints. + stringy_attrs (dict): Dictionary keyed by attribute string names. Values + can be ints or strings. + strings_map (StringStore): Defaults to None. If provided, encodes string + values into ints. + RETURNS (dict): Attributes dictionary with keys and optionally values + converted to ints. """ inty_attrs = {} if _do_deprecated: diff --git a/spacy/cfile.pxd b/spacy/cfile.pxd deleted file mode 100644 index b95fbb2be..000000000 --- a/spacy/cfile.pxd +++ /dev/null @@ -1,33 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE -from cymem.cymem cimport Pool - -cdef class CFile: - cdef FILE* fp - cdef unsigned char* data - cdef int is_open - cdef Pool mem - cdef int size # For compatibility with subclass - cdef int i # For compatibility with subclass - cdef int _capacity # For compatibility with subclass - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * - - - -cdef class StringCFile: - cdef unsigned char* data - cdef int is_open - cdef Pool mem - cdef int size # For compatibility with subclass - cdef int i # For compatibility with subclass - cdef int _capacity # For compatibility with subclass - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx deleted file mode 100644 index 006ff78ac..000000000 --- a/spacy/cfile.pyx +++ /dev/null @@ -1,103 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from libc.stdio cimport fopen, fclose, fread, fwrite -from libc.string cimport memcpy - - -cdef class CFile: - def __init__(self, loc, mode, on_open_error=None): - if isinstance(mode, unicode): - mode_str = mode.encode('ascii') - else: - mode_str = mode - if hasattr(loc, 'as_posix'): - loc = loc.as_posix() - self.mem = Pool() - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - self.fp = fopen(bytes_loc, mode_str) - if self.fp == NULL: - if on_open_error is not None: - on_open_error() - else: - raise IOError("Could not open binary file %s" % bytes_loc) - self.is_open = True - - def __dealloc__(self): - if self.is_open: - fclose(self.fp) - - def close(self): - fclose(self.fp) - self.is_open = False - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - st = fread(dest, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: - st = fwrite(src, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef 
bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) - - -cdef class StringCFile: - def __init__(self, bytes data, mode, on_open_error=None): - self.mem = Pool() - self.is_open = 1 if 'w' in mode else 0 - self._capacity = max(len(data), 8) - self.size = len(data) - self.i = 0 - self.data = self.mem.alloc(1, self._capacity) - for i in range(len(data)): - self.data[i] = data[i] - - def __dealloc__(self): - # Important to override this -- or - # we try to close a non-existant file pointer! - pass - - def close(self): - self.is_open = False - - def string_data(self): - cdef bytes byte_string = b'\0' * (self.size) - bytes_ptr = byte_string - for i in range(self.size): - bytes_ptr[i] = self.data[i] - print(byte_string) - return byte_string - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - if self.i+(number * elem_size) < self.size: - memcpy(dest, &self.data[self.i], elem_size * number) - self.i += elem_size * number - - cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1: - write_size = number * elem_size - if (self.size + write_size) >= self._capacity: - self._capacity = (self.size + write_size) * 2 - self.data = self.mem.realloc(self.data, self._capacity) - memcpy(&self.data[self.size], src, write_size) - self.size += write_size - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index e58c94642..b807480ca 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,5 +4,8 @@ from .link import link from .package import package from .profile import profile from .train import train +from .evaluate import evaluate from .convert import convert from .model import model +from .vocab import make_vocab as vocab +from .validate import validate diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index fef6753e6..ad17844a1 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -4,17 +4,17 @@ from __future__ import unicode_literals import plac from pathlib import Path -from .converters import conllu2json, iob2json +from .converters import conllu2json, iob2json, conll_ner2json from ..util import prints -# Converters are matched by file extension. To add a converter, add a new entry -# to this dict with the file extension mapped to the converter function imported -# from /converters. - +# Converters are matched by file extension. To add a converter, add a new +# entry to this dict with the file extension mapped to the converter function +# imported from /converters. 
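Illustrative aside, not part of the patch: the comment above describes the extension point, so a hypothetical converter registration could look like the sketch below. Both 'myfmt' and my_format2json are invented placeholders with the same signature as the existing converters, not real spaCy names.

def my_format2json(input_path, output_path, n_sents=1, use_morphology=False):
    # Parse input_path and write spaCy's JSON training format to output_path.
    pass

# Registered by adding 'myfmt': my_format2json to the CONVERTERS dict below;
# it can then be selected with: python -m spacy convert data.myfmt /output -c myfmt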
CONVERTERS = { - '.conllu': conllu2json, - '.conll': conllu2json, - '.iob': iob2json + 'conllu': conllu2json, + 'conll': conllu2json, + 'ner': conll_ner2json, + 'iob': iob2json, } @@ -22,9 +22,10 @@ CONVERTERS = { input_file=("input file", "positional", None, str), output_dir=("output directory for converted file", "positional", None, str), n_sents=("Number of sentences per doc", "option", "n", int), - morphology=("Enable appending morphology to tags", "flag", "m", bool) -) -def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): + converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), + morphology=("Enable appending morphology to tags", "flag", "m", bool)) +def convert(cmd, input_file, output_dir, n_sents=1, morphology=False, + converter='auto'): """ Convert files into JSON format for use with train command and other experiment management functions. @@ -35,9 +36,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): prints(input_path, title="Input file not found", exits=1) if not output_path.exists(): prints(output_path, title="Output directory not found", exits=1) - file_ext = input_path.suffix - if not file_ext in CONVERTERS: - prints("Can't find converter for %s" % input_path.parts[-1], - title="Unknown format", exits=1) - CONVERTERS[file_ext](input_path, output_path, - n_sents=n_sents, use_morphology=morphology) + if converter == 'auto': + converter = input_path.suffix[1:] + if converter not in CONVERTERS: + prints("Can't find converter for %s" % converter, + title="Unknown format", exits=1) + func = CONVERTERS[converter] + func(input_path, output_path, + n_sents=n_sents, use_morphology=morphology) diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py index 9026d16c6..02b596d4d 100644 --- a/spacy/cli/converters/__init__.py +++ b/spacy/cli/converters/__init__.py @@ -1,2 +1,3 @@ from .conllu2json import conllu2json from .iob2json import iob2json +from .conll_ner2json import conll_ner2json diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py new file mode 100644 index 000000000..fb2979652 --- /dev/null +++ b/spacy/cli/converters/conll_ner2json.py @@ -0,0 +1,51 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...compat import json_dumps, path2str +from ...util import prints +from ...gold import iob_to_biluo + + +def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False): + """ + Convert files in the CoNLL-2003 NER format into JSON format for use with + train cli. 
+ """ + docs = read_conll_ner(input_path) + + output_filename = input_path.parts[-1].replace(".conll", "") + ".json" + output_filename = input_path.parts[-1].replace(".conll", "") + ".json" + output_file = output_path / output_filename + with output_file.open('w', encoding='utf-8') as f: + f.write(json_dumps(docs)) + prints("Created %d documents" % len(docs), + title="Generated output file %s" % path2str(output_file)) + + +def read_conll_ner(input_path): + text = input_path.open('r', encoding='utf-8').read() + i = 0 + delimit_docs = '-DOCSTART- -X- O O' + output_docs = [] + for doc in text.strip().split(delimit_docs): + doc = doc.strip() + if not doc: + continue + output_doc = [] + for sent in doc.split('\n\n'): + sent = sent.strip() + if not sent: + continue + lines = [line.strip() for line in sent.split('\n') if line.strip()] + words, tags, chunks, iob_ents = zip(*[line.split() for line in lines]) + biluo_ents = iob_to_biluo(iob_ents) + output_doc.append({'tokens': [ + {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in + zip(words, tags, biluo_ents) + ]}) + output_docs.append({ + 'id': len(output_docs), + 'paragraphs': [{'sentences': output_doc}] + }) + output_doc = [] + return output_docs diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 4849345e9..74bc22ada 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,5 +1,6 @@ # coding: utf8 from __future__ import unicode_literals +from cytoolz import partition_all, concat from ...compat import json_dumps, path2str from ...util import prints @@ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): """ Convert IOB files into JSON format for use with train cli. """ - # TODO: This isn't complete yet -- need to map from IOB to - # BILUO with input_path.open('r', encoding='utf8') as file_: - docs = read_iob(file_) - + sentences = read_iob(file_) + docs = merge_sentences(sentences, n_sents) output_filename = input_path.parts[-1].replace(".iob", ".json") output_file = output_path / output_filename with output_file.open('w', encoding='utf-8') as f: @@ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): title="Generated output file %s" % path2str(output_file)) -def read_iob(file_): +def read_iob(raw_sents): sentences = [] - for line in file_: + for line in raw_sents: if not line.strip(): continue tokens = [t.split('|') for t in line.split()] @@ -43,3 +42,15 @@ def read_iob(file_): paragraphs = [{'sentences': [sent]} for sent in sentences] docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] return docs + +def merge_sentences(docs, n_sents): + counter = 0 + merged = [] + for group in partition_all(n_sents, docs): + group = list(group) + first = group.pop(0) + to_extend = first['paragraphs'][0]['sentences'] + for sent in group[1:]: + to_extend.extend(sent['paragraphs'][0]['sentences']) + merged.append(first) + return merged diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 28ae07865..0d3f11153 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -13,10 +13,9 @@ from .. import about @plac.annotations( - model=("model to download (shortcut or model name)", "positional", None, str), + model=("model to download, shortcut or name)", "positional", None, str), direct=("force direct download. 
Needs model name with version and won't " - "perform compatibility check", "flag", "d", bool) -) + "perform compatibility check", "flag", "d", bool)) def download(cmd, model, direct=False): """ Download compatible model from default download path using pip. Model @@ -30,21 +29,25 @@ def download(cmd, model, direct=False): model_name = shortcuts.get(model, model) compatibility = get_compatibility() version = get_version(model_name, compatibility) - dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) + dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, + v=version)) if dl == 0: try: # Get package path here because link uses - # pip.get_installed_distributions() to check if model is a package, - # which fails if model was just installed via subprocess + # pip.get_installed_distributions() to check if model is a + # package, which fails if model was just installed via + # subprocess package_path = get_package_path(model_name) - link(None, model_name, model, force=True, model_path=package_path) + link(None, model_name, model, force=True, + model_path=package_path) except: - # Dirty, but since spacy.download and the auto-linking is mostly - # a convenience wrapper, it's best to show a success message and - # loading instructions, even if linking fails. - prints("Creating a shortcut link for 'en' didn't work (maybe you " - "don't have admin permissions?), but you can still load " - "the model via its full package name:", + # Dirty, but since spacy.download and the auto-linking is + # mostly a convenience wrapper, it's best to show a success + # message and loading instructions, even if linking fails. + prints( + "Creating a shortcut link for 'en' didn't work (maybe " + "you don't have admin permissions?), but you can still " + "load the model via its full package name:", "nlp = spacy.load('%s')" % model_name, title="Download successful") @@ -52,9 +55,10 @@ def download(cmd, model, direct=False): def get_json(url, desc): r = requests.get(url) if r.status_code != 200: - prints("Couldn't fetch %s. Please find a model for your spaCy installation " - "(v%s), and download it manually." % (desc, about.__version__), - about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1) + msg = ("Couldn't fetch %s. Please find a model for your spaCy " + "installation (v%s), and download it manually.") + prints(msg % (desc, about.__version__), about.__docs_models__, + title="Server error (%d)" % r.status_code, exits=1) return r.json() @@ -71,13 +75,13 @@ def get_compatibility(): def get_version(model, comp): if model not in comp: version = about.__version__ - prints("No compatible model found for '%s' (spaCy v%s)." % (model, version), - title="Compatibility error", exits=1) + msg = "No compatible model found for '%s' (spaCy v%s)." 
+ prints(msg % (model, version), title="Compatibility error", exits=1) return comp[model][0] def download_model(filename): download_url = about.__download_url__ + '/' + filename - return subprocess.call([sys.executable, '-m', - 'pip', 'install', '--no-cache-dir', download_url], - env=os.environ.copy()) + return subprocess.call( + [sys.executable, '-m', 'pip', 'install', '--no-cache-dir', + download_url], env=os.environ.copy()) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py new file mode 100644 index 000000000..d7695fd73 --- /dev/null +++ b/spacy/cli/evaluate.py @@ -0,0 +1,113 @@ +# coding: utf8 +from __future__ import unicode_literals, division, print_function + +import plac +from timeit import default_timer as timer +import random +import numpy.random + +from ..gold import GoldCorpus +from ..util import prints +from .. import util +from .. import displacy + + +random.seed(0) +numpy.random.seed(0) + + +@plac.annotations( + model=("model name or path", "positional", None, str), + data_path=("location of JSON-formatted evaluation data", "positional", + None, str), + gold_preproc=("use gold preprocessing", "flag", "G", bool), + gpu_id=("use GPU", "option", "g", int), + displacy_path=("directory to output rendered parses as HTML", "option", + "dp", str), + displacy_limit=("limit of parses to render as HTML", "option", "dl", int)) +def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, + displacy_path=None, displacy_limit=25): + """ + Evaluate a model. To render a sample of parses in a HTML file, set an + output directory as the displacy_path argument. + """ + if gpu_id >= 0: + util.use_gpu(gpu_id) + util.set_env_log(False) + data_path = util.ensure_path(data_path) + displacy_path = util.ensure_path(displacy_path) + if not data_path.exists(): + prints(data_path, title="Evaluation data not found", exits=1) + if displacy_path and not displacy_path.exists(): + prints(displacy_path, title="Visualization output directory not found", + exits=1) + corpus = GoldCorpus(data_path, data_path) + nlp = util.load_model(model) + dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) + begin = timer() + scorer = nlp.evaluate(dev_docs, verbose=False) + end = timer() + nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) + print_results(scorer, time=end - begin, words=nwords, + wps=nwords / (end - begin)) + if displacy_path: + docs, golds = zip(*dev_docs) + render_deps = 'parser' in nlp.meta.get('pipeline', []) + render_ents = 'ner' in nlp.meta.get('pipeline', []) + render_parses(docs, displacy_path, model_name=model, + limit=displacy_limit, deps=render_deps, ents=render_ents) + msg = "Generated %s parses as HTML" % displacy_limit + prints(displacy_path, title=msg) + + +def render_parses(docs, output_path, model_name='', limit=250, deps=True, + ents=True): + docs[0].user_data['title'] = model_name + if ents: + with (output_path / 'entities.html').open('w') as file_: + html = displacy.render(docs[:limit], style='ent', page=True) + file_.write(html) + if deps: + with (output_path / 'parses.html').open('w') as file_: + html = displacy.render(docs[:limit], style='dep', page=True, + options={'compact': True}) + file_.write(html) + + +def print_progress(itn, losses, dev_scores, wps=0.0): + scores = {} + for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', + 'ents_p', 'ents_r', 'ents_f', 'wps']: + scores[col] = 0.0 + scores['dep_loss'] = losses.get('parser', 0.0) + scores['ner_loss'] = losses.get('ner', 0.0) + scores['tag_loss'] = losses.get('tagger', 0.0) + 
scores.update(dev_scores) + scores['wps'] = wps + tpl = '\t'.join(( + '{:d}', + '{dep_loss:.3f}', + '{ner_loss:.3f}', + '{uas:.3f}', + '{ents_p:.3f}', + '{ents_r:.3f}', + '{ents_f:.3f}', + '{tags_acc:.3f}', + '{token_acc:.3f}', + '{wps:.1f}')) + print(tpl.format(itn, **scores)) + + +def print_results(scorer, time, words, wps): + results = { + 'Time': '%.2f s' % time, + 'Words': words, + 'Words/s': '%.0f' % wps, + 'TOK': '%.2f' % scorer.token_acc, + 'POS': '%.2f' % scorer.tags_acc, + 'UAS': '%.2f' % scorer.uas, + 'LAS': '%.2f' % scorer.las, + 'NER P': '%.2f' % scorer.ents_p, + 'NER R': '%.2f' % scorer.ents_r, + 'NER F': '%.2f' % scorer.ents_f} + util.print_table(results, title="Results") diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 5d45b271c..3636494fb 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -12,8 +12,7 @@ from .. import util @plac.annotations( model=("optional: shortcut link of model", "positional", None, str), - markdown=("generate Markdown for GitHub issues", "flag", "md", str) -) + markdown=("generate Markdown for GitHub issues", "flag", "md", str)) def info(cmd, model=None, markdown=False): """Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 712a05aee..cfbc97e3e 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -12,8 +12,7 @@ from .. import util @plac.annotations( origin=("package name or local path to model", "positional", None, str), link_name=("name of shortuct link to create", "positional", None, str), - force=("force overwriting of existing link", "flag", "f", bool) -) + force=("force overwriting of existing link", "flag", "f", bool)) def link(cmd, origin, link_name, force=False, model_path=None): """ Create a symlink for models within the spacy/data directory. Accepts @@ -27,6 +26,13 @@ def link(cmd, origin, link_name, force=False, model_path=None): if not model_path.exists(): prints("The data should be located in %s" % path2str(model_path), title="Can't locate model data", exits=1) + data_path = util.get_data_path() + if not data_path or not data_path.exists(): + spacy_loc = Path(__file__).parent.parent + prints("Make sure a directory `/data` exists within your spaCy " + "installation and try again. The data directory should be " + "located here:", path2str(spacy_loc), exits=1, + title="Can't find the spaCy data path to create model symlink") link_path = util.get_data_path() / link_name if link_path.exists() and not force: prints("To overwrite an existing link, use the --force flag.", @@ -39,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None): # This is quite dirty, but just making sure other errors are caught. prints("Creating a symlink in spacy/data failed. Make sure you have " "the required permissions and try re-running the command as " - "admin, or use a virtualenv. You can still import the model as a " - "module and call its load() method, or create the symlink manually.", + "admin, or use a virtualenv. 
You can still import the model as " + "a module and call its load() method, or create the symlink " + "manually.", "%s --> %s" % (path2str(model_path), path2str(link_path)), title="Error: Couldn't link model to '%s'" % link_name) raise diff --git a/spacy/cli/model.py b/spacy/cli/model.py index 14e75647e..bcc1626bc 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -1,8 +1,11 @@ # coding: utf8 from __future__ import unicode_literals -import bz2 -import gzip +try: + import bz2 + import gzip +except ImportError: + pass import math from ast import literal_eval from pathlib import Path diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 7019819a7..3157ba99d 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -16,10 +16,13 @@ from .. import about input_dir=("directory with model data", "positional", None, str), output_dir=("output parent directory", "positional", None, str), meta_path=("path to meta.json", "option", "m", str), - create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool), - force=("force overwriting of existing folder in output directory", "flag", "f", bool) -) -def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False): + create_meta=("create meta.json, even if one exists in directory – if " + "existing meta is found, entries are shown as defaults in " + "the command line prompt", "flag", "c", bool), + force=("force overwriting of existing model directory in output directory", + "flag", "f", bool)) +def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, + force=False): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified @@ -39,26 +42,28 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force template_manifest = get_template('MANIFEST.in') template_init = get_template('xx_model_name/__init__.py') meta_path = meta_path or input_path / 'meta.json' - if not create_meta and meta_path.is_file(): - prints(meta_path, title="Reading meta.json from file") + if meta_path.is_file(): meta = util.read_json(meta_path) - else: - meta = generate_meta() + if not create_meta: # only print this if user doesn't want to overwrite + prints(meta_path, title="Loaded meta.json from file") + else: + meta = generate_meta(input_dir, meta) meta = validate_meta(meta, ['lang', 'name', 'version']) - model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] main_path = output_path / model_name_v package_path = main_path / model_name create_dirs(package_path, force) - shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) + shutil.copytree(path2str(input_path), + path2str(package_path / model_name_v)) create_file(main_path / 'meta.json', json_dumps(meta)) create_file(main_path / 'setup.py', template_setup) create_file(main_path / 'MANIFEST.in', template_manifest) create_file(package_path / '__init__.py', template_init) - prints(main_path, "To build the package, run `python setup.py sdist` in this " - "directory.", title="Successfully created package '%s'" % model_name_v) + prints(main_path, "To build the package, run `python setup.py sdist` in " + "this directory.", + title="Successfully created package '%s'" % model_name_v) def create_dirs(package_path, force): @@ -66,9 +71,10 @@ def create_dirs(package_path, force): if force: shutil.rmtree(path2str(package_path)) else: - prints(package_path, "Please delete the directory and try 
again, or " - "use the --force flag to overwrite existing directories.", - title="Package directory already exists", exits=1) + prints(package_path, "Please delete the directory and try again, " + "or use the --force flag to overwrite existing " + "directories.", title="Package directory already exists", + exits=1) Path.mkdir(package_path, parents=True) @@ -77,38 +83,34 @@ def create_file(file_path, contents): file_path.open('w', encoding='utf-8').write(contents) -def generate_meta(): - settings = [('lang', 'Model language', 'en'), - ('name', 'Model name', 'model'), - ('version', 'Model version', '0.0.0'), - ('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__), - ('description', 'Model description', False), - ('author', 'Author', False), - ('email', 'Author email', False), - ('url', 'Author website', False), - ('license', 'License', 'CC BY-NC 3.0')] - prints("Enter the package settings for your model.", title="Generating meta.json") - meta = {} +def generate_meta(model_path, existing_meta): + meta = existing_meta or {} + settings = [('lang', 'Model language', meta.get('lang', 'en')), + ('name', 'Model name', meta.get('name', 'model')), + ('version', 'Model version', meta.get('version', '0.0.0')), + ('spacy_version', 'Required spaCy version', + '>=%s,<3.0.0' % about.__version__), + ('description', 'Model description', + meta.get('description', False)), + ('author', 'Author', meta.get('author', False)), + ('email', 'Author email', meta.get('email', False)), + ('url', 'Author website', meta.get('url', False)), + ('license', 'License', meta.get('license', 'CC BY-SA 3.0'))] + nlp = util.load_model_from_path(Path(model_path)) + meta['pipeline'] = nlp.pipe_names + meta['vectors'] = {'width': nlp.vocab.vectors_length, + 'entries': len(nlp.vocab.vectors)} + prints("Enter the package settings for your model. The following " + "information will be read from your model data: pipeline, vectors.", + title="Generating meta.json") for setting, desc, default in settings: response = util.get_raw_input(desc, default) meta[setting] = default if response == '' and default else response - meta['pipeline'] = generate_pipeline() if about.__title__ != 'spacy': meta['parent_package'] = about.__title__ return meta -def generate_pipeline(): - prints("If set to 'True', the default pipeline is used. If set to 'False', " - "the pipeline will be disabled. Components should be specified as a " - "comma-separated list of component names, e.g. tensorizer, tagger, " - "parser, ner. For more information, see the docs on processing pipelines.", - title="Enter your model's pipeline components") - pipeline = util.get_raw_input("Pipeline components", True) - replace = {'True': True, 'False': False} - return replace[pipeline] if pipeline in replace else pipeline.split(', ') - - def validate_meta(meta, keys): for key in keys: if key not in meta or meta[key] == '': diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index db6fc5b41..a394989d0 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -27,15 +27,15 @@ def read_inputs(loc): @plac.annotations( lang=("model/language", "positional", None, str), - inputs=("Location of input file", "positional", None, read_inputs) -) + inputs=("Location of input file", "positional", None, read_inputs)) def profile(cmd, lang, inputs=None): """ Profile a spaCy pipeline, to find out which functions take the most time. 
""" - nlp = spacy.load(lang) + nlp = spacy.load(lang) texts = list(cytoolz.take(10000, inputs)) - cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") + cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), + "Profile.prof") s = pstats.Stats("Profile.prof") s.strip_dirs().sort_stats("time").print_stats() diff --git a/spacy/cli/train.py b/spacy/cli/train.py index a22db6abc..34117db22 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -2,42 +2,48 @@ from __future__ import unicode_literals, division, print_function import plac -import json -from collections import defaultdict -import cytoolz from pathlib import Path import dill import tqdm -from thinc.neural.optimizers import linear_decay +from thinc.neural._classes.model import Model from timeit import default_timer as timer +import random +import numpy.random -from ..tokens.doc import Doc -from ..scorer import Scorer -from ..gold import GoldParse, merge_sents from ..gold import GoldCorpus, minibatch from ..util import prints from .. import util +from .. import about from .. import displacy from ..compat import json_dumps +random.seed(0) +numpy.random.seed(0) + @plac.annotations( lang=("model language", "positional", None, str), output_dir=("output directory to store model in", "positional", None, str), - train_data=("location of JSON-formatted training data", "positional", None, str), - dev_data=("location of JSON-formatted development data (optional)", "positional", None, str), + train_data=("location of JSON-formatted training data", "positional", + None, str), + dev_data=("location of JSON-formatted development data (optional)", + "positional", None, str), n_iter=("number of iterations", "option", "n", int), n_sents=("number of sentences", "option", "ns", int), use_gpu=("Use GPU", "option", "g", int), - resume=("Whether to resume training", "flag", "R", bool), + vectors=("Model to load vectors from", "option", "v"), + vectors_limit=("Truncate to N vectors (requires -v)", "option", None, int), no_tagger=("Don't train tagger", "flag", "T", bool), no_parser=("Don't train parser", "flag", "P", bool), no_entities=("Don't train NER", "flag", "N", bool), gold_preproc=("Use gold preprocessing", "flag", "G", bool), -) -def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, - use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False, - gold_preproc=False): + version=("Model version", "option", "V", str), + meta_path=("Optional path to meta.json. All relevant properties will be " + "overwritten.", "option", "m", Path)) +def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, + use_gpu=-1, vectors=None, vectors_limit=None, no_tagger=False, + no_parser=False, no_entities=False, gold_preproc=False, + version="0.0.0", meta_path=None): """ Train a model. Expects data in spaCy's JSON format. 
""" @@ -46,19 +52,29 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) dev_path = util.ensure_path(dev_data) + meta_path = util.ensure_path(meta_path) if not output_path.exists(): output_path.mkdir() if not train_path.exists(): prints(train_path, title="Training data not found", exits=1) if dev_path and not dev_path.exists(): prints(dev_path, title="Development data not found", exits=1) + if meta_path is not None and not meta_path.exists(): + prints(meta_path, title="meta.json not found", exits=1) + meta = util.read_json(meta_path) if meta_path else {} + if not isinstance(meta, dict): + prints("Expected dict but got: {}".format(type(meta)), + title="Not a valid meta.json format", exits=1) + meta.setdefault('lang', lang) + meta.setdefault('name', 'unnamed') - lang_class = util.get_lang_class(lang) - - pipeline = ['token_vectors', 'tags', 'dependencies', 'entities'] - if no_tagger and 'tags' in pipeline: pipeline.remove('tags') - if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies') - if no_entities and 'entities' in pipeline: pipeline.remove('entities') + pipeline = ['tagger', 'parser', 'ner'] + if no_tagger and 'tagger' in pipeline: + pipeline.remove('tagger') + if no_parser and 'parser' in pipeline: + pipeline.remove('parser') + if no_entities and 'ner' in pipeline: + pipeline.remove('ner') # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. @@ -68,55 +84,91 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, util.env_opt('dropout_to', 0.2), util.env_opt('dropout_decay', 0.0)) batch_sizes = util.compounding(util.env_opt('batch_from', 1), - util.env_opt('batch_to', 64), + util.env_opt('batch_to', 16), util.env_opt('batch_compound', 1.001)) - - if resume: - prints(output_path / 'model9.pickle', title="Resuming training") - nlp = dill.load((output_path / 'model9.pickle').open('rb')) - else: - nlp = lang_class(pipeline=pipeline) corpus = GoldCorpus(train_path, dev_path, limit=n_sents) n_train_words = corpus.count_train() + lang_class = util.get_lang_class(lang) + nlp = lang_class() + meta['pipeline'] = pipeline + nlp.meta.update(meta) + if vectors: + util.load_model(vectors, vocab=nlp.vocab) + if vectors_limit is not None: + nlp.vocab.prune_vectors(vectors_limit) + for name in pipeline: + nlp.add_pipe(nlp.create_pipe(name), name=name) optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) + nlp._optimizer = None - print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") + print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") try: + train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0, + gold_preproc=gold_preproc, max_length=0) + train_docs = list(train_docs) for i in range(n_iter): - if resume: - i += 20 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: - train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0, - gold_preproc=gold_preproc, max_length=0) losses = {} for batch in minibatch(train_docs, size=batch_sizes): docs, golds = zip(*batch) nlp.update(docs, golds, sgd=optimizer, - drop=next(dropout_rates), losses=losses, - update_shared=True) + drop=next(dropout_rates), losses=losses) pbar.update(sum(len(doc) for doc in docs)) with nlp.use_params(optimizer.averages): util.set_env_log(False) epoch_model_path = output_path / ('model%d' % i) 
nlp.to_disk(epoch_model_path) - nlp_loaded = lang_class(pipeline=pipeline) - nlp_loaded = nlp_loaded.from_disk(epoch_model_path) - scorer = nlp_loaded.evaluate( - corpus.dev_docs( + nlp_loaded = util.load_model_from_path(epoch_model_path) + dev_docs = list(corpus.dev_docs( nlp_loaded, gold_preproc=gold_preproc)) - acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') + nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) + start_time = timer() + scorer = nlp_loaded.evaluate(dev_docs) + end_time = timer() + if use_gpu < 0: + gpu_wps = None + cpu_wps = nwords/(end_time-start_time) + else: + gpu_wps = nwords/(end_time-start_time) + with Model.use_device('cpu'): + nlp_loaded = util.load_model_from_path(epoch_model_path) + dev_docs = list(corpus.dev_docs( + nlp_loaded, gold_preproc=gold_preproc)) + start_time = timer() + scorer = nlp_loaded.evaluate(dev_docs) + end_time = timer() + cpu_wps = nwords/(end_time-start_time) + acc_loc = (output_path / ('model%d' % i) / 'accuracy.json') with acc_loc.open('w') as file_: file_.write(json_dumps(scorer.scores)) + meta_loc = output_path / ('model%d' % i) / 'meta.json' + meta['accuracy'] = scorer.scores + meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps, + 'gpu': gpu_wps} + meta['vectors'] = {'width': nlp.vocab.vectors_length, + 'entries': len(nlp.vocab.vectors)} + meta['lang'] = nlp.lang + meta['pipeline'] = pipeline + meta['spacy_version'] = '>=%s' % about.__version__ + meta.setdefault('name', 'model%d' % i) + meta.setdefault('version', version) + + with meta_loc.open('w') as file_: + file_.write(json_dumps(meta)) util.set_env_log(True) - print_progress(i, losses, scorer.scores) + print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, + gpu_wps=gpu_wps) finally: print("Saving model...") - with (output_path / 'model-final.pickle').open('wb') as file_: - with nlp.use_params(optimizer.averages): - dill.dump(nlp, file_, -1) + try: + with (output_path / 'model-final.pickle').open('wb') as file_: + with nlp.use_params(optimizer.averages): + dill.dump(nlp, file_, -1) + except: + print("Error saving model") def _render_parses(i, to_render): @@ -129,25 +181,30 @@ def _render_parses(i, to_render): file_.write(html) -def print_progress(itn, losses, dev_scores, wps=0.0): +def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0): scores = {} for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', - 'ents_p', 'ents_r', 'ents_f', 'wps']: + 'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']: scores[col] = 0.0 scores['dep_loss'] = losses.get('parser', 0.0) + scores['ner_loss'] = losses.get('ner', 0.0) scores['tag_loss'] = losses.get('tagger', 0.0) scores.update(dev_scores) - scores['wps'] = wps + scores['cpu_wps'] = cpu_wps + scores['gpu_wps'] = gpu_wps or 0.0 tpl = '\t'.join(( '{:d}', '{dep_loss:.3f}', + '{ner_loss:.3f}', '{uas:.3f}', '{ents_p:.3f}', '{ents_r:.3f}', '{ents_f:.3f}', '{tags_acc:.3f}', '{token_acc:.3f}', - '{wps:.1f}')) + '{cpu_wps:.1f}', + '{gpu_wps:.1f}', + )) print(tpl.format(itn, **scores)) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py new file mode 100644 index 000000000..1c645a554 --- /dev/null +++ b/spacy/cli/validate.py @@ -0,0 +1,126 @@ +# coding: utf8 +from __future__ import unicode_literals, print_function + +import requests +import pkg_resources +from pathlib import Path + +from ..compat import path2str, locale_escape +from ..util import prints, get_data_path, read_json +from .. 
import about + + +def validate(cmd): + """Validate that the currently installed version of spaCy is compatible + with the installed models. Should be run after `pip install -U spacy`. + """ + r = requests.get(about.__compatibility__) + if r.status_code != 200: + prints("Couldn't fetch compatibility table.", + title="Server error (%d)" % r.status_code, exits=1) + compat = r.json()['spacy'] + all_models = set() + for spacy_v, models in dict(compat).items(): + all_models.update(models.keys()) + for model, model_vs in models.items(): + compat[spacy_v][model] = [reformat_version(v) for v in model_vs] + + current_compat = compat[about.__version__] + model_links = get_model_links(current_compat) + model_pkgs = get_model_pkgs(current_compat, all_models) + incompat_links = {l for l, d in model_links.items() if not d['compat']} + incompat_models = {d['name'] for _, d in model_pkgs.items() + if not d['compat']} + incompat_models.update([d['name'] for _, d in model_links.items() + if not d['compat']]) + na_models = [m for m in incompat_models if m not in current_compat] + update_models = [m for m in incompat_models if m in current_compat] + + prints(path2str(Path(__file__).parent.parent), + title="Installed models (spaCy v{})".format(about.__version__)) + if model_links or model_pkgs: + print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', '')) + for name, data in model_pkgs.items(): + print(get_model_row(current_compat, name, data, 'package')) + for name, data in model_links.items(): + print(get_model_row(current_compat, name, data, 'link')) + else: + prints("No models found in your current environment.", exits=0) + + if update_models: + cmd = ' python -m spacy download {}' + print("\n Use the following commands to update the model packages:") + print('\n'.join([cmd.format(pkg) for pkg in update_models])) + + if na_models: + prints("The following models are not available for spaCy v{}: {}" + .format(about.__version__, ', '.join(na_models))) + + if incompat_links: + prints("You may also want to overwrite the incompatible links using " + "the `spacy link` command with `--force`, or remove them from " + "the data directory. 
Data path: {}" + .format(path2str(get_data_path()))) + + +def get_model_links(compat): + links = {} + data_path = get_data_path() + if data_path: + models = [p for p in data_path.iterdir() if is_model_path(p)] + for model in models: + meta_path = Path(model) / 'meta.json' + if not meta_path.exists(): + continue + meta = read_json(meta_path) + link = model.parts[-1] + name = meta['lang'] + '_' + meta['name'] + links[link] = {'name': name, 'version': meta['version'], + 'compat': is_compat(compat, name, meta['version'])} + return links + + +def get_model_pkgs(compat, all_models): + pkgs = {} + for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): + package = pkg_name.replace('-', '_') + if package in all_models: + version = pkg_data.version + pkgs[pkg_name] = {'name': package, 'version': version, + 'compat': is_compat(compat, package, version)} + return pkgs + + +def get_model_row(compat, name, data, type='package'): + tpl_red = '\x1b[38;5;1m{}\x1b[0m' + tpl_green = '\x1b[38;5;2m{}\x1b[0m' + if data['compat']: + comp = tpl_green.format(locale_escape('✔', errors='ignore')) + version = tpl_green.format(data['version']) + else: + comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0]) + version = tpl_red.format(data['version']) + return get_row(type, name, data['name'], version, comp) + + +def get_row(*args): + tpl_row = ' {:<10}' + (' {:<20}' * 4) + return tpl_row.format(*args) + + +def is_model_path(model_path): + exclude = ['cache', 'pycache', '__pycache__'] + name = model_path.parts[-1] + return (model_path.is_dir() and name not in exclude + and not name.startswith('.')) + + +def is_compat(compat, name, version): + return name in compat and version in compat[name] + + +def reformat_version(version): + """Hack to reformat old versions ending on '-alpha' to match pip format.""" + if version.endswith('-alpha'): + return version.replace('-alpha', 'a0') + return version.replace('-alpha', 'a') diff --git a/spacy/cli/vocab.py b/spacy/cli/vocab.py new file mode 100644 index 000000000..d05eff3f0 --- /dev/null +++ b/spacy/cli/vocab.py @@ -0,0 +1,54 @@ +# coding: utf8 +from __future__ import unicode_literals + +import plac +import json +import spacy +import numpy +from pathlib import Path + +from ..util import prints, ensure_path + + +@plac.annotations( + lang=("model language", "positional", None, str), + output_dir=("model output directory", "positional", None, Path), + lexemes_loc=("location of JSONL-formatted lexical data", "positional", + None, Path), + vectors_loc=("optional: location of vectors data, as numpy .npz", + "positional", None, str)) +def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None): + """Compile a vocabulary from a lexicon jsonl file and word vectors.""" + if not lexemes_loc.exists(): + prints(lexemes_loc, title="Can't find lexical data", exits=1) + vectors_loc = ensure_path(vectors_loc) + nlp = spacy.blank(lang) + for word in nlp.vocab: + word.rank = 0 + lex_added = 0 + vec_added = 0 + with lexemes_loc.open() as file_: + for line in file_: + if line.strip(): + attrs = json.loads(line) + if 'settings' in attrs: + nlp.vocab.cfg.update(attrs['settings']) + else: + lex = nlp.vocab[attrs['orth']] + lex.set_attrs(**attrs) + assert lex.rank == attrs['id'] + lex_added += 1 + if vectors_loc is not None: + vector_data = numpy.load(open(vectors_loc, 'rb')) + nlp.vocab.clear_vectors(width=vector_data.shape[1]) + for word in nlp.vocab: + if word.rank: + nlp.vocab.vectors.add(word.orth_, row=word.rank, + vector=vector_data[word.rank]) + vec_added += 1 + if 
not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir, + title="Sucessfully compiled vocab and vectors, and saved model") + return nlp diff --git a/spacy/compat.py b/spacy/compat.py index e6b7c066b..7cd06e545 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -6,6 +6,7 @@ import ftfy import sys import ujson import itertools +import locale from thinc.neural.util import copy_array @@ -29,6 +30,10 @@ try: except ImportError: cupy = None +try: + from thinc.neural.optimizers import Optimizer +except ImportError: + from thinc.neural.optimizers import Adam as Optimizer pickle = pickle copy_reg = copy_reg @@ -86,15 +91,15 @@ def symlink_to(orig, dest): def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): - return ((python2 == None or python2 == is_python2) and - (python3 == None or python3 == is_python3) and - (windows == None or windows == is_windows) and - (linux == None or linux == is_linux) and - (osx == None or osx == is_osx)) + return ((python2 is None or python2 == is_python2) and + (python3 is None or python3 == is_python3) and + (windows is None or windows == is_windows) and + (linux is None or linux == is_linux) and + (osx is None or osx == is_osx)) def normalize_string_keys(old): - '''Given a dictionary, make sure keys are unicode strings, not bytes.''' + """Given a dictionary, make sure keys are unicode strings, not bytes.""" new = {} for key, value in old.items(): if isinstance(key, bytes_): @@ -113,3 +118,12 @@ def import_file(name, loc): module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) return module + + +def locale_escape(string, errors='replace'): + ''' + Mangle non-supported characters, for savages with ascii terminals. + ''' + encoding = locale.getpreferredencoding() + string = string.encode(encoding, errors).decode('utf8') + return string diff --git a/spacy/deprecated.py b/spacy/deprecated.py index ad52bfe24..a1143474a 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -24,7 +24,7 @@ def depr_model_download(lang): def resolve_load_name(name, **overrides): - """Resolve model loading if deprecated path kwarg is specified in overrides. + """Resolve model loading if deprecated path kwarg in overrides. name (unicode): Name of model to load. **overrides: Overrides specified in spacy.load(). @@ -32,8 +32,9 @@ def resolve_load_name(name, **overrides): """ if overrides.get('path') not in (None, False, True): name = overrides.get('path') - prints("To load a model from a path, you can now use the first argument. " - "The model meta is used to load the required Language class.", - "OLD: spacy.load('en', path='/some/path')", "NEW: spacy.load('/some/path')", + prints("To load a model from a path, you can now use the first " + "argument. The model meta is used to load the Language class.", + "OLD: spacy.load('en', path='/some/path')", + "NEW: spacy.load('/some/path')", title="Warning: deprecated argument 'path'") return name diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 7c479f94c..e160c31b6 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -12,7 +12,7 @@ IS_JUPYTER = is_in_jupyter() def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, - options={}, manual=False): + options={}, manual=False): """Render displaCy visualisation. docs (list or Doc): Document(s) to visualise. 
@@ -21,7 +21,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, minify (bool): Minify HTML markup. jupyter (bool): Experimental, use Jupyter's `display()` to output markup. options (dict): Visualiser-specific options, e.g. colors. - manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. + manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. RETURNS (unicode): Rendered HTML markup. """ factories = {'dep': (DependencyRenderer, parse_deps), @@ -35,7 +35,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, parsed = [converter(doc, options) for doc in docs] if not manual else docs _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip() html = _html['parsed'] - if jupyter: # return HTML rendered by IPython display() + if jupyter: # return HTML rendered by IPython display() from IPython.core.display import display, HTML return display(HTML(html)) return html @@ -50,13 +50,15 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False, page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. options (dict): Visualiser-specific options, e.g. colors. - manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. + manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. port (int): Port to serve visualisation. """ from wsgiref import simple_server - render(docs, style=style, page=page, minify=minify, options=options, manual=manual) + render(docs, style=style, page=page, minify=minify, options=options, + manual=manual) httpd = simple_server.make_server('0.0.0.0', port, app) - prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port) + prints("Using the '%s' visualizer" % style, + title="Serving on port %d..." % port) try: httpd.serve_forever() except KeyboardInterrupt: @@ -67,7 +69,8 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False, def app(environ, start_response): # headers and status need to be bytes in Python 2, see #1227 - headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))] + headers = [(b_to_str(b'Content-type'), + b_to_str(b'text/html; charset=utf-8'))] start_response(b_to_str(b'200 OK'), headers) res = _html['parsed'].encode(encoding='utf-8') return [res] @@ -89,9 +92,9 @@ def parse_deps(orig_doc, options={}): end = word.i + 1 while end < len(doc) and doc[end].is_punct: end += 1 - span = doc[start : end] + span = doc[start:end] spans.append((span.start_char, span.end_char, word.tag_, - word.lemma_, word.ent_type_)) + word.lemma_, word.ent_type_)) for span_props in spans: doc.merge(*span_props) words = [{'text': w.text, 'tag': w.tag_} for w in doc] @@ -113,6 +116,7 @@ def parse_ents(doc, options={}): RETURNS (dict): Generated entities keyed by text (original text) and ents. """ ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_} - for ent in doc.ents] - title = doc.user_data.get('title', None) if hasattr(doc, 'user_data') else None + for ent in doc.ents] + title = (doc.user_data.get('title', None) + if hasattr(doc, 'user_data') else None) return {'text': doc.text, 'ents': ents, 'title': title} diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 1050ffa87..4a494591c 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -14,13 +14,15 @@ class DependencyRenderer(object): """Initialise dependency renderer. 
options (dict): Visualiser-specific options (compact, word_spacing, - arrow_spacing, arrow_width, arrow_stroke, distance, - offset_x, color, bg, font) + arrow_spacing, arrow_width, arrow_stroke, distance, offset_x, + color, bg, font) """ self.compact = options.get('compact', False) self.word_spacing = options.get('word_spacing', 45) - self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20) - self.arrow_width = options.get('arrow_width', 6 if self.compact else 10) + self.arrow_spacing = options.get('arrow_spacing', + 12 if self.compact else 20) + self.arrow_width = options.get('arrow_width', + 6 if self.compact else 10) self.arrow_stroke = options.get('arrow_stroke', 2) self.distance = options.get('distance', 150 if self.compact else 175) self.offset_x = options.get('offset_x', 50) @@ -39,7 +41,8 @@ class DependencyRenderer(object): rendered = [self.render_svg(i, p['words'], p['arcs']) for i, p in enumerate(parsed)] if page: - content = ''.join([TPL_FIGURE.format(content=svg) for svg in rendered]) + content = ''.join([TPL_FIGURE.format(content=svg) + for svg in rendered]) markup = TPL_PAGE.format(content=content) else: markup = ''.join(rendered) @@ -63,12 +66,13 @@ class DependencyRenderer(object): self.id = render_id words = [self.render_word(w['text'], w['tag'], i) for i, w in enumerate(words)] - arcs = [self.render_arrow(a['label'], a['start'], a['end'], a['dir'], i) + arcs = [self.render_arrow(a['label'], a['start'], + a['end'], a['dir'], i) for i, a in enumerate(arcs)] content = ''.join(words) + ''.join(arcs) - return TPL_DEP_SVG.format(id=self.id, width=self.width, height=self.height, - color=self.color, bg=self.bg, font=self.font, - content=content) + return TPL_DEP_SVG.format(id=self.id, width=self.width, + height=self.height, color=self.color, + bg=self.bg, font=self.font, content=content) def render_word(self, text, tag, i): """Render individual word. 
@@ -96,7 +100,7 @@ class DependencyRenderer(object): x_start = self.offset_x+start*self.distance+self.arrow_spacing y = self.offset_y x_end = (self.offset_x+(end-start)*self.distance+start*self.distance - -self.arrow_spacing*(self.highest_level-level)/4) + - self.arrow_spacing*(self.highest_level-level)/4) y_curve = self.offset_y-level*self.distance/2 if self.compact: y_curve = self.offset_y-level*self.distance/6 @@ -133,8 +137,10 @@ class DependencyRenderer(object): if direction is 'left': pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2) else: - pos1, pos2, pos3 = (end, end+self.arrow_width-2, end-self.arrow_width+2) - arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, y-self.arrow_width) + pos1, pos2, pos3 = (end, end+self.arrow_width-2, + end-self.arrow_width+2) + arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, + y-self.arrow_width) return "M{},{} L{},{} {},{}".format(*arrowhead) def get_levels(self, arcs): @@ -159,9 +165,10 @@ class EntityRenderer(object): """ colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74', 'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb', - 'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LANGUAGE': '#ff8197', - 'WORK_OF_ART': '#f0d0ff', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9', - 'MONEY': '#e4e7d2', 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2', + 'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197', + 'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff', + 'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2', + 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2', 'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'} colors.update(options.get('colors', {})) self.default_color = '#ddd' @@ -176,9 +183,11 @@ class EntityRenderer(object): minify (bool): Minify HTML markup. RETURNS (unicode): Rendered HTML markup. """ - rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed] + rendered = [self.render_ents(p['text'], p['ents'], + p.get('title', None)) for p in parsed] if page: - docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered]) + docs = ''.join([TPL_FIGURE.format(content=doc) + for doc in rendered]) markup = TPL_PAGE.format(content=docs) else: markup = ''.join(rendered) diff --git a/spacy/glossary.py b/spacy/glossary.py index ed1c22c21..c17cb7467 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -3,6 +3,16 @@ from __future__ import unicode_literals def explain(term): + """Get a description for a given POS tag, dependency label or entity type. + + term (unicode): The term to explain. + RETURNS (unicode): The explanation, or `None` if not found in the glossary. + + EXAMPLE: + >>> spacy.explain(u'NORP') + >>> doc = nlp(u'Hello world') + >>> print([w.text, w.tag_, spacy.explain(w.tag_) for w in doc]) + """ if term in GLOSSARY: return GLOSSARY[term] @@ -254,7 +264,6 @@ GLOSSARY = { 'nk': 'noun kernel element', 'nmc': 'numerical component', 'oa': 'accusative object', - 'oa': 'second accusative object', 'oc': 'clausal object', 'og': 'genitive object', 'op': 'prepositional object', @@ -283,6 +292,7 @@ GLOSSARY = { 'PRODUCT': 'Objects, vehicles, foods, etc. 
(not services)', 'EVENT': 'Named hurricanes, battles, wars, sports events, etc.', 'WORK_OF_ART': 'Titles of books, songs, etc.', + 'LAW': 'Named documents made into laws.', 'LANGUAGE': 'Any named language', 'DATE': 'Absolute or relative dates or periods', 'TIME': 'Times smaller than a day', @@ -290,5 +300,15 @@ GLOSSARY = { 'MONEY': 'Monetary values, including unit', 'QUANTITY': 'Measurements, as of weight or distance', 'ORDINAL': '"first", "second", etc.', - 'CARDINAL': 'Numerals that do not fall under another type' + 'CARDINAL': 'Numerals that do not fall under another type', + + + # Named Entity Recognition + # Wikipedia + # http://www.sciencedirect.com/science/article/pii/S0004370212000276 + # https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf + + 'PER': 'Named person or family.', + 'MISC': ('Miscellaneous entities, e.g. events, nationalities, ' + 'products or works of art'), } diff --git a/spacy/gold.pyx b/spacy/gold.pyx index f00d04109..5adef7bf7 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -2,16 +2,15 @@ # coding: utf8 from __future__ import unicode_literals, print_function -import io import re import ujson import random import cytoolz +import itertools from .syntax import nonproj -from .util import ensure_path -from . import util from .tokens import Doc +from . import util def tags_to_entities(tags): @@ -53,7 +52,8 @@ def merge_sents(sents): m_deps[3].extend(head + i for head in heads) m_deps[4].extend(labels) m_deps[5].extend(ner) - m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) + for b in brackets) i += len(ids) return [(m_deps, m_brackets)] @@ -79,6 +79,8 @@ def align(cand_words, gold_words): punct_re = re.compile(r'\W') + + def _min_edit_path(cand_words, gold_words): cdef: Pool mem @@ -97,9 +99,9 @@ def _min_edit_path(cand_words, gold_words): mem = Pool() n_cand = len(cand_words) n_gold = len(gold_words) - # Levenshtein distance, except we need the history, and we may want different - # costs. - # Mark operations with a string, and score the history using _edit_cost. + # Levenshtein distance, except we need the history, and we may want + # different costs. Mark operations with a string, and score the history + # using _edit_cost. previous_row = [] prev_costs = mem.alloc(n_gold + 1, sizeof(int)) curr_costs = mem.alloc(n_gold + 1, sizeof(int)) @@ -143,12 +145,16 @@ def _min_edit_path(cand_words, gold_words): def minibatch(items, size=8): - '''Iterate over batches of items. `size` may be an iterator, + """Iterate over batches of items. `size` may be an iterator, so that batch-size can vary on each step. - ''' + """ + if isinstance(size, int): + size_ = itertools.repeat(8) + else: + size_ = size items = iter(items) while True: - batch_size = next(size) #if hasattr(size, '__next__') else size + batch_size = next(size_) batch = list(cytoolz.take(int(batch_size), items)) if len(batch) == 0: break @@ -163,6 +169,7 @@ class GoldCorpus(object): train_path (unicode or Path): File or directory of training data. dev_path (unicode or Path): File or directory of development data. + RETURNS (GoldCorpus): The newly created object. 
""" self.train_path = util.ensure_path(train_path) self.dev_path = util.ensure_path(dev_path) @@ -208,7 +215,7 @@ class GoldCorpus(object): train_tuples = self.train_tuples if projectivize: train_tuples = nonproj.preprocess_training_data( - self.train_tuples) + self.train_tuples, label_freq_cutoff=100) random.shuffle(train_tuples) gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, max_length=max_length, @@ -217,7 +224,6 @@ class GoldCorpus(object): def dev_docs(self, nlp, gold_preproc=False): gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc) - #gold_docs = nlp.preprocess_gold(gold_docs) yield from gold_docs @classmethod @@ -228,7 +234,6 @@ class GoldCorpus(object): raw_text = None else: paragraph_tuples = merge_sents(paragraph_tuples) - docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=noise_level) golds = cls._make_golds(docs, paragraph_tuples) @@ -243,17 +248,20 @@ class GoldCorpus(object): raw_text = add_noise(raw_text, noise_level) return [nlp.make_doc(raw_text)] else: - return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) - for (sent_tuples, brackets) in paragraph_tuples] + return [Doc(nlp.vocab, + words=add_noise(sent_tuples[1], noise_level)) + for (sent_tuples, brackets) in paragraph_tuples] @classmethod def _make_golds(cls, docs, paragraph_tuples): assert len(docs) == len(paragraph_tuples) if len(docs) == 1: - return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])] + return [GoldParse.from_annot_tuples(docs[0], + paragraph_tuples[0][0])] else: return [GoldParse.from_annot_tuples(doc, sent_tuples) - for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)] + for doc, (sent_tuples, brackets) + in zip(docs, paragraph_tuples)] @staticmethod def walk_corpus(path): @@ -300,7 +308,7 @@ def _corrupt(c, noise_level): def read_json_file(loc, docs_filter=None, limit=None): - loc = ensure_path(loc) + loc = util.ensure_path(loc) if loc.is_dir(): for filename in loc.iterdir(): yield from read_json_file(loc / filename, limit=limit) @@ -325,16 +333,16 @@ def read_json_file(loc, docs_filter=None, limit=None): for i, token in enumerate(sent['tokens']): words.append(token['orth']) ids.append(i) - tags.append(token.get('tag','-')) - heads.append(token.get('head',0) + i) - labels.append(token.get('dep','')) + tags.append(token.get('tag', '-')) + heads.append(token.get('head', 0) + i) + labels.append(token.get('dep', '')) # Ensure ROOT label is case-insensitive if labels[-1].lower() == 'root': labels[-1] = 'ROOT' ner.append(token.get('ner', '-')) sents.append([ [ids, words, tags, heads, labels, ner], - sent.get('brackets', [])]) + sent.get('brackets', [])]) if sents: yield [paragraph.get('raw', None), sents] @@ -377,28 +385,34 @@ cdef class GoldParse: @classmethod def from_annot_tuples(cls, doc, annot_tuples, make_projective=False): _, words, tags, heads, deps, entities = annot_tuples - return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities, - make_projective=make_projective) + return cls(doc, words=words, tags=tags, heads=heads, deps=deps, + entities=entities, make_projective=make_projective) - def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, - deps=None, entities=None, make_projective=False, - cats=tuple()): + def __init__(self, doc, annot_tuples=None, words=None, tags=None, + heads=None, deps=None, entities=None, make_projective=False, + cats=None): """Create a GoldParse. doc (Doc): The document the annotations refer to. 
words (iterable): A sequence of unicode word strings. tags (iterable): A sequence of strings, representing tag annotations. - heads (iterable): A sequence of integers, representing syntactic head offsets. - deps (iterable): A sequence of strings, representing the syntactic relation types. + heads (iterable): A sequence of integers, representing syntactic + head offsets. + deps (iterable): A sequence of strings, representing the syntactic + relation types. entities (iterable): A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. - cats (iterable): A sequence of labels for text classification. Each - label may be a string or an int, or a `(start_char, end_char, label)` + cats (dict): Labels for text classification. Each key in the dictionary + may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). Unlike entity annotations, label annotations can overlap, i.e. a single word can be covered by - multiple labelled spans. + multiple labelled spans. The TextCategorizer component expects + true examples of a label to have the value 1.0, and negative + examples of a label to have the value 0.0. Labels not in the + dictionary are treated as missing - the gradient for those labels + will be zero. RETURNS (GoldParse): The newly constructed object. """ if words is None: @@ -429,7 +443,7 @@ cdef class GoldParse: self.c.sent_start = self.mem.alloc(len(doc), sizeof(int)) self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) - self.cats = list(cats) + self.cats = {} if cats is None else dict(cats) self.words = [None] * len(doc) self.tags = [None] * len(doc) self.heads = [None] * len(doc) @@ -462,11 +476,11 @@ cdef class GoldParse: self.ner[i] = entities[gold_i] cycle = nonproj.contains_cycle(self.heads) - if cycle != None: + if cycle is not None: raise Exception("Cycle found: %s" % cycle) if make_projective: - proj_heads,_ = nonproj.projectivize(self.heads, self.labels) + proj_heads, _ = nonproj.projectivize(self.heads, self.labels) self.heads = proj_heads def __len__(self): @@ -489,20 +503,19 @@ cdef class GoldParse: def biluo_tags_from_offsets(doc, entities, missing='O'): - """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out - scheme (BILUO). + """Encode labelled spans into per-token tags, using the + Begin/In/Last/Unit/Out scheme (BILUO). doc (Doc): The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. - entities (iterable): A sequence of `(start, end, label)` triples. `start` and - `end` should be character-offset integers denoting the slice into the - original string. - + entities (iterable): A sequence of `(start, end, label)` triples. `start` + and `end` should be character-offset integers denoting the slice into + the original string. RETURNS (list): A list of unicode strings, describing the tags. Each tag string will be of the form either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". The string "-" is used where the - entity offsets don't align with the tokenization in the `Doc` object. The - training algorithm will view these as missing values. "O" denotes a + entity offsets don't align with the tokenization in the `Doc` object. + The training algorithm will view these as missing values. "O" denotes a non-entity token. 
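A short usage sketch of the helper documented here (the remaining tag letters are described just below); it assumes a spaCy v2.x install, where biluo_tags_from_offsets lives in spacy.gold, the module this hunk edits:

    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank('en')
    doc = nlp(u'I like London and Berlin.')
    # Character-offset annotations: (start_char, end_char, label) triples
    entities = [(7, 13, 'LOC'), (18, 24, 'LOC')]
    tags = biluo_tags_from_offsets(doc, entities)
    # Single-token entities come out as "U-LOC":
    # ['O', 'O', 'U-LOC', 'O', 'U-LOC', 'O']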
"B" denotes the beginning of a multi-token entity, "I" the inside of an entity of three or more tokens, and "L" the end of an entity of two or more tokens. "U" denotes a single-token entity. diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index c2cf12f12..ff560afae 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -16,15 +16,13 @@ from ...util import update_exc class BengaliDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'bn' - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS lemma_rules = LEMMA_RULES - - prefixes = tuple(TOKENIZER_PREFIXES) - suffixes = tuple(TOKENIZER_SUFFIXES) - infixes = tuple(TOKENIZER_INFIXES) + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES class Bengali(Language): diff --git a/spacy/lang/bn/morph_rules.py b/spacy/lang/bn/morph_rules.py index 8561f8676..6ca8fc097 100644 --- a/spacy/lang/bn/morph_rules.py +++ b/spacy/lang/bn/morph_rules.py @@ -12,11 +12,11 @@ MORPH_RULES = { 'কি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, 'সে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'}, 'কিসে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, - 'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, 'তাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'}, 'স্বয়ং': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'কোনগুলো': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'}, 'তুমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, + 'তুই': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, 'তাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'}, 'আমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One ', 'PronType': 'Prs', 'Case': 'Nom'}, 'যিনি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'}, @@ -24,12 +24,15 @@ MORPH_RULES = { 'কোন': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'}, 'কারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, 'তোমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, + 'তোকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, 'খোদ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'কে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'}, 'যারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'}, 'যে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'}, 'তোমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, + 'তোরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'}, 'তোমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, + 'তোদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'}, 'আপন': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, 'এ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'}, 'নিজ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'}, @@ 
-42,6 +45,10 @@ MORPH_RULES = { 'আমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'মোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, + 'মোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, 'তার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, 'তোমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', @@ -50,7 +57,13 @@ MORPH_RULES = { 'Case': 'Nom'}, 'তোমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'তোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, 'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'}, + 'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, + 'তোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes', + 'Case': 'Nom'}, + 'যাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'}, } } diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py index 5b513da7b..ca0ae934a 100644 --- a/spacy/lang/bn/stop_words.py +++ b/spacy/lang/bn/stop_words.py @@ -22,7 +22,7 @@ STOP_WORDS = set(""" টি ঠিক তখন তত তথা তবু তবে তা তাঁকে তাঁদের তাঁর তাঁরা তাঁহারা তাই তাও তাকে তাতে তাদের তার তারপর তারা তারই তাহলে তাহা তাহাতে তাহার তিনই -তিনি তিনিও তুমি তুলে তেমন তো তোমার +তিনি তিনিও তুমি তুলে তেমন তো তোমার তুই তোরা তোর তোমাদের তোদের থাকবে থাকবেন থাকা থাকায় থাকে থাকেন থেকে থেকেই থেকেও থাকায় দিকে দিতে দিয়ে দিয়েছে দিয়েছেন দিলেন দিয়ে দু দুটি দুটো দেওয়া দেওয়ার দেখতে দেখা দেখে দেন দেয় দেশের দ্বারা দিয়েছে দিয়েছেন দেয় দেওয়া দেওয়ার দিন দুই @@ -32,7 +32,7 @@ STOP_WORDS = set(""" ফলে ফিরে ফের বছর বদলে বরং বলতে বলল বললেন বলা বলে বলেছেন বলেন বসে বহু বা বাদে বার বিনা বিভিন্ন বিশেষ বিষয়টি বেশ ব্যবহার ব্যাপারে বক্তব্য বন বেশি ভাবে ভাবেই -মত মতো মতোই মধ্যভাগে মধ্যে মধ্যেই মধ্যেও মনে মাত্র মাধ্যমে মানুষ মানুষের মোট মোটেই +মত মতো মতোই মধ্যভাগে মধ্যে মধ্যেই মধ্যেও মনে মাত্র মাধ্যমে মানুষ মানুষের মোট মোটেই মোদের মোর যখন যত যতটা যথেষ্ট যদি যদিও যা যাঁর যাঁরা যাওয়া যাওয়ার যাকে যাচ্ছে যাতে যাদের যান যাবে যায় যার যারা যায় যিনি যে যেখানে যেতে যেন যেমন রকম রয়েছে রাখা রেখে রয়েছে diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 0698cfd43..7ec631c92 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -29,11 +29,19 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm 'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb ' 'TB T G M K %') _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' -_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &' -_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «' -_hyphens = '- – — -- ---' + +# These expressions contain various unicode variations, including characters +# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language +# conflicts, spaCy's base tokenizer should handle all of those by default +_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! 
, 、 ; : ~ · ।' +_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' +_hyphens = '- – — -- --- —— ~' + +# Various symbols like dingbats, but also emoji +# Details: https://www.compart.com/en/unicode/category/So _other_symbols = r'[\p{So}]' + UNITS = merge_chars(_units) CURRENCY = merge_chars(_currency) QUOTES = merge_chars(_quotes) diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 99babdc2c..45e5b89dd 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -3,6 +3,9 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .morph_rules import MORPH_RULES +from ..tag_map import TAG_MAP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -13,11 +16,13 @@ from ...util import update_exc, add_lookups class DanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'da' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + # morph_rules = MORPH_RULES + tag_map = TAG_MAP + stop_words = STOP_WORDS class Danish(Language): diff --git a/spacy/lang/da/lex_attrs.py b/spacy/lang/da/lex_attrs.py new file mode 100644 index 000000000..8152ad259 --- /dev/null +++ b/spacy/lang/da/lex_attrs.py @@ -0,0 +1,52 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +# Source http://fjern-uv.dk/tal.php + +_num_words = """nul +en et to tre fire fem seks syv otte ni ti +elleve tolv tretten fjorten femten seksten sytten atten nitten tyve +enogtyve toogtyve treogtyve fireogtyve femogtyve seksogtyve syvogtyve otteogtyve niogtyve tredive +enogtredive toogtredive treogtredive fireogtredive femogtredive seksogtredive syvogtredive otteogtredive niogtredive fyrre +enogfyrre toogfyrre treogfyrre fireogfyrre femgogfyrre seksogfyrre syvogfyrre otteogfyrre niogfyrre halvtreds +enoghalvtreds tooghalvtreds treoghalvtreds fireoghalvtreds femoghalvtreds seksoghalvtreds syvoghalvtreds otteoghalvtreds nioghalvtreds tres +enogtres toogtres treogtres fireogtres femogtres seksogtres syvogtres otteogtres niogtres halvfjerds +enoghalvfjerds tooghalvfjerds treoghalvfjerds fireoghalvfjerds femoghalvfjerds seksoghalvfjerds syvoghalvfjerds otteoghalvfjerds nioghalvfjerds firs +enogfirs toogfirs treogfirs fireogfirs femogfirs seksogfirs syvogfirs otteogfirs niogfirs halvfems +enoghalvfems tooghalvfems treoghalvfems fireoghalvfems femoghalvfems seksoghalvfems syvoghalvfems otteoghalvfems nioghalvfems hundrede +million milliard billion billiard trillion trilliard +""".split() + +# source http://www.duda.dk/video/dansk/grammatik/talord/talord.html + +_ordinal_words = """nulte +første anden tredje fjerde femte sjette syvende ottende niende tiende +elfte tolvte trettende fjortende femtende sekstende syttende attende nittende tyvende +enogtyvende toogtyvende treogtyvende fireogtyvende femogtyvende seksogtyvende syvogtyvende otteogtyvende niogtyvende tredivte enogtredivte toogtredivte treogtredivte fireogtredivte femogtredivte seksogtredivte syvogtredivte otteogtredivte niogtredivte fyrretyvende +enogfyrretyvende toogfyrretyvende treogfyrretyvende fireogfyrretyvende femogfyrretyvende seksogfyrretyvende syvogfyrretyvende otteogfyrretyvende 
niogfyrretyvende halvtredsindstyvende enoghalvtredsindstyvende +tooghalvtredsindstyvende treoghalvtredsindstyvende fireoghalvtredsindstyvende femoghalvtredsindstyvende seksoghalvtredsindstyvende syvoghalvtredsindstyvende otteoghalvtredsindstyvende nioghalvtredsindstyvende +tresindstyvende enogtresindstyvende toogtresindstyvende treogtresindstyvende fireogtresindstyvende femogtresindstyvende seksogtresindstyvende syvogtresindstyvende otteogtresindstyvende niogtresindstyvende halvfjerdsindstyvende +enoghalvfjerdsindstyvende tooghalvfjerdsindstyvende treoghalvfjerdsindstyvende fireoghalvfjerdsindstyvende femoghalvfjerdsindstyvende seksoghalvfjerdsindstyvende syvoghalvfjerdsindstyvende otteoghalvfjerdsindstyvende nioghalvfjerdsindstyvende firsindstyvende +enogfirsindstyvende toogfirsindstyvende treogfirsindstyvende fireogfirsindstyvende femogfirsindstyvende seksogfirsindstyvende syvogfirsindstyvende otteogfirsindstyvende niogfirsindstyvende halvfemsindstyvende +enoghalvfemsindstyvende tooghalvfemsindstyvende treoghalvfemsindstyvende fireoghalvfemsindstyvende femoghalvfemsindstyvende seksoghalvfemsindstyvende syvoghalvfemsindstyvende otteoghalvfemsindstyvende nioghalvfemsindstyvende +""".split() + +def like_num(text): + text = text.replace(',', '').replace('.', '') + if text.isdigit(): + return True + if text.count('/') == 1: + num, denom = text.split('/') + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + if text in _ordinal_words: + return True + return False + +LEX_ATTRS = { + LIKE_NUM: like_num +} diff --git a/spacy/lang/da/morph_rules.py b/spacy/lang/da/morph_rules.py new file mode 100644 index 000000000..b365bf871 --- /dev/null +++ b/spacy/lang/da/morph_rules.py @@ -0,0 +1,41 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import LEMMA +from ...deprecated import PRON_LEMMA + +MORPH_RULES = { + "PRON": { + "jeg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"}, + "mig": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"}, + "du": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"}, + "han": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"}, + "ham": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"}, + "hun": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"}, + "hende": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"}, + "den": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "det": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "vi": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"}, + "os": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"}, + "de": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"}, + "dem": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, + + "min": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, + "din": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, + "hans": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": 
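The new Danish like_num getter, as defined above, accepts plain digits (with ',' or '.' stripped), simple fractions, and the listed number and ordinal words; for example:

from spacy.lang.da.lex_attrs import like_num

for token in [u'10.000', u'3/4', u'fem', u'tredje', u'hest']:
    print(token, like_num(token))
# 10.000 True / 3/4 True / fem True / tredje True / hest False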
"Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"}, + "hendes": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"}, + "dens": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, + "dets": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, + "vores": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, + "deres": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, + }, + + "VERB": { + "er": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Pres"}, + "var": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Past"} + } +} + +for tag, rules in MORPH_RULES.items(): + for key, attrs in dict(rules).items(): + rules[key.title()] = attrs diff --git a/spacy/lang/da/stop_words.py b/spacy/lang/da/stop_words.py index ac2195f10..ba448f8f3 100644 --- a/spacy/lang/da/stop_words.py +++ b/spacy/lang/da/stop_words.py @@ -1,47 +1,46 @@ # encoding: utf8 from __future__ import unicode_literals - -# Source: https://github.com/stopwords-iso/stopwords-da +# Source: Handpicked by Jens Dahl Møllerhøj. STOP_WORDS = set(""" -ad af aldrig alle alt anden andet andre at +af aldrig alene alle allerede alligevel alt altid anden andet andre at -bare begge blev blive bliver +bag begge blandt blev blive bliver burde bør -da de dem den denne der deres det dette dig din dine disse dit dog du +da de dem den denne dens der derefter deres derfor derfra deri dermed derpå derved det dette dig din dine disse dog du -efter ej eller en end ene eneste enhver er et +efter egen eller ellers en end endnu ene eneste enhver ens enten er et -far fem fik fire flere fleste for fordi forrige fra få får før +flere flest fleste for foran fordi forrige fra få før først -god godt +gennem gjorde gjort god gør gøre gørende -ham han hans har havde have hej helt hende hendes her hos hun hvad hvem hver -hvilken hvis hvor hvordan hvorfor hvornår +ham han hans har havde have hel heller hen hende hendes henover her herefter heri hermed herpå hun hvad hvem hver hvilke hvilken hvilkes hvis hvor hvordan hvorefter hvorfor hvorfra hvorhen hvori hvorimod hvornår hvorved -i ikke ind ingen intet +i igen igennem ikke imellem imens imod ind indtil ingen intet -ja jeg jer jeres jo +jeg jer jeres jo -kan kom komme kommer kun kunne +kan kom kommer kun kunne -lad lav lidt lige lille +lad langs lav lave lavet lidt lige ligesom lille længere -man mand mange med meget men mens mere mig min mine mit mod må +man mange med meget mellem men mens mere mest mig min mindre mindst mine mit må måske -ned nej ni nogen noget nogle nu ny nyt når nær næste næsten +ned nemlig nogen nogensinde noget nogle nok nu ny nyt nær næste næsten -og også okay om op os otte over +og også om omkring op os over overalt på -se seks selv ser ses sig sige sin sine sit skal skulle som stor store syv så -sådan +samme sammen selv selvom senere ses siden sig sige skal skulle som stadig synes syntes så sådan således -tag tage thi ti til to tre +temmelig tidligere til tilbage tit -ud under +ud uden udover under undtagen -var ved vi vil ville vor vores være været +var ved vi via vil ville vore vores vær være været + +øvrigt """.split()) diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index fbfbbad86..6bf9ab669 100644 --- 
a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -1,11 +1,27 @@ # encoding: utf8 from __future__ import unicode_literals -from ...symbols import ORTH, LEMMA +from ...symbols import ORTH, LEMMA, NORM _exc = {} +for exc_data in [ + {ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, + + {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"}, + {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"}, + {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"}, + {ORTH: "Apr.", LEMMA: "april", NORM: "april"}, + {ORTH: "Maj.", LEMMA: "maj", NORM: "maj"}, + {ORTH: "Jun.", LEMMA: "juni", NORM: "juni"}, + {ORTH: "Jul.", LEMMA: "juli", NORM: "juli"}, + {ORTH: "Aug.", LEMMA: "august", NORM: "august"}, + {ORTH: "Sep.", LEMMA: "september", NORM: "september"}, + {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"}, + {ORTH: "Nov.", LEMMA: "november", NORM: "november"}, + {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]: + _exc[exc_data[ORTH]] = [dict(exc_data)] for orth in [ "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.", diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index b8a7580a0..e8e7a12db 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS +from .punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP @@ -11,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -21,15 +21,12 @@ class GermanDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: 'de' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - tag_map = dict(TAG_MAP) - stop_words = set(STOP_WORDS) - syntax_iterators = dict(SYNTAX_ITERATORS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + infixes = TOKENIZER_INFIXES + tag_map = TAG_MAP + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS + lemma_lookup = LOOKUP class German(Language): diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py new file mode 100644 index 000000000..7024ed118 --- /dev/null +++ b/spacy/lang/de/punctuation.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER + + +_quotes = QUOTES.replace("'", '') + +_infixes = (LIST_ELLIPSES + LIST_ICONS + + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), + r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes), + r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), + r'(?<=[0-9])-(?=[0-9])']) + + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py index d16bd17e0..730c15cfc 100644 --- a/spacy/lang/de/tag_map.py +++ b/spacy/lang/de/tag_map.py @@ -62,5 +62,5 @@ TAG_MAP = { "VVIZU": {POS: VERB, "VerbForm": "inf"}, "VVPP": {POS: VERB, 
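One of the new German infix patterns above can be checked in isolation with the re module: the lookbehind/lookahead only split a hyphen that sits between digits, so hyphenated words stay intact:

import re

infix = re.compile(r'(?<=[0-9])-(?=[0-9])')
print(infix.split(u'1980-2000'))   # ['1980', '2000']
print(infix.split(u'E-Mail'))      # ['E-Mail']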
"Aspect": "perf", "VerbForm": "part"}, "XY": {POS: X}, - "SP": {POS: SPACE} + "_SP": {POS: SPACE} } diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index ec14fecd0..a95e501e1 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -7,7 +7,7 @@ from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .morph_rules import MORPH_RULES -from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC +from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS @@ -16,22 +16,24 @@ from ...language import Language from ...attrs import LANG, NORM from ...util import update_exc, add_lookups +def _return_en(_): + return 'en' class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) - lex_attr_getters[LANG] = lambda text: 'en' + lex_attr_getters[LANG] = _return_en lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - tag_map = dict(TAG_MAP) - stop_words = set(STOP_WORDS) - morph_rules = dict(MORPH_RULES) - lemma_rules = dict(LEMMA_RULES) - lemma_index = dict(LEMMA_INDEX) - lemma_exc = dict(LEMMA_EXC) - syntax_iterators = dict(SYNTAX_ITERATORS) + tag_map = TAG_MAP + stop_words = STOP_WORDS + morph_rules = MORPH_RULES + lemma_rules = LEMMA_RULES + lemma_index = LEMMA_INDEX + lemma_exc = LEMMA_EXC + lemma_lookup = LOOKUP + syntax_iterators = SYNTAX_ITERATORS class English(Language): diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index 640940fea..394731ff1 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -16,7 +16,7 @@ call can cannot ca could did do does doing done down due during -each eight either eleven else elsewhere empty enough etc even ever every +each eight either eleven else elsewhere empty enough even ever every everyone everything everywhere except few fifteen fifty first five for former formerly forty four from front full @@ -27,7 +27,7 @@ get give go had has have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred -i if in inc indeed into is it its itself +i if in indeed into is it its itself keep diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 4240bd657..bb1a6b7f7 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -8,7 +8,7 @@ def noun_chunks(obj): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. """ - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos', 'attr', 'ROOT'] doc = obj.doc # Ensure works on both Doc and Span. 
np_deps = [doc.vocab.strings.add(label) for label in labels] diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index a674c17e3..fc3d2cc93 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -42,6 +42,7 @@ TAG_MAP = { "RBR": {POS: ADV, "Degree": "comp"}, "RBS": {POS: ADV, "Degree": "sup"}, "RP": {POS: PART}, + "SP": {POS: SPACE}, "SYM": {POS: SYM}, "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, "UH": {POS: INTJ}, @@ -55,11 +56,11 @@ TAG_MAP = { "WP": {POS: NOUN, "PronType": "int|rel"}, "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"}, - "SP": {POS: SPACE}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, "XX": {POS: X}, "BES": {POS: VERB}, - "HVS": {POS: VERB} + "HVS": {POS: VERB}, + "_SP": {POS: SPACE}, } diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 1e7f55be8..661f0bbec 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'es' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - tag_map = dict(TAG_MAP) - stop_words = set(STOP_WORDS) - sytax_iterators = dict(SYNTAX_ITERATORS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + tag_map = TAG_MAP + stop_words = STOP_WORDS + sytax_iterators = SYNTAX_ITERATORS + lemma_lookup = LOOKUP class Spanish(Language): diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py index 86dd48620..2095d23b1 100644 --- a/spacy/lang/es/tag_map.py +++ b/spacy/lang/es/tag_map.py @@ -303,5 +303,5 @@ TAG_MAP = { "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"}, "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"}, "X___": {"morph": "_", "pos": "X"}, - "SP": {"morph": "_", "pos": "SPACE"}, + "_SP": {"morph": "_", "pos": "SPACE"}, } diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 931ad5341..7f74495c5 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'fi' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Finnish(Language): diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index a243b6268..42acd0736 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -4,32 +4,29 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS from .lemmatizer import LOOKUP from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from 
...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups class FrenchDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'fr' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - infixes = tuple(TOKENIZER_INFIXES) - suffixes = tuple(TOKENIZER_SUFFIXES) + stop_words = STOP_WORDS + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES token_match = TOKEN_MATCH - syntax_iterators = dict(SYNTAX_ITERATORS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + syntax_iterators = SYNTAX_ITERATORS + lemma_lookup = LOOKUP class French(Language): diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py new file mode 100644 index 000000000..41c509dff --- /dev/null +++ b/spacy/lang/fr/lex_attrs.py @@ -0,0 +1,41 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + + +_num_words = set(""" +zero un deux trois quatre cinq six sept huit neuf dix +onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf +vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante +cent mille mil million milliard billion quadrillion quintillion +sextillion septillion octillion nonillion decillion +""".split()) + +_ordinal_words = set(""" +premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième +onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième +vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième +centième millième millionnième milliardième billionnième quadrillionnième quintillionnième +sextillionnième septillionnième octillionnième nonillionnième decillionnième +""".split()) + + +def like_num(text): + # Might require more work? 
+ # See this discussion: https://github.com/explosion/spaCy/pull/1161 + text = text.replace(',', '').replace('.', '') + if text.isdigit(): + return True + if text.count('/') == 1: + num, denom = text.split('/') + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = { + LIKE_NUM: like_num +} diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index a15dc9a05..807794fee 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -12,9 +12,8 @@ from ...util import update_exc class HebrewDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'he' - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Hebrew(Language): diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py new file mode 100644 index 000000000..0503b5b7f --- /dev/null +++ b/spacy/lang/hi/__init__.py @@ -0,0 +1,24 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS + +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG + + +class HindiDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: 'hi' + stop_words = STOP_WORDS + + +class Hindi(Language): + lang = 'hi' + Defaults = HindiDefaults + + +__all__ = ['Hindi'] diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py new file mode 100644 index 000000000..8886e26c3 --- /dev/null +++ b/spacy/lang/hi/lex_attrs.py @@ -0,0 +1,38 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..norm_exceptions import BASE_NORMS +from ...attrs import NORM +from ...util import add_lookups + + +_stem_suffixes = [ + ["ो","े","ू","ु","ी","ि","ा"], + ["कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें"], + ["ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं"], + ["ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां"], + ["ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"] +] + + +def norm(string): + # normalise base exceptions, e.g. 
punctuation or currency symbols + if string in BASE_NORMS: + return BASE_NORMS[string] + # set stem word as norm, if available, adapted from: + # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf + # http://research.variancia.com/hindi_stemmer/ + # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142 + for suffix_group in reversed(_stem_suffixes): + length = len(suffix_group[0]) + if len(string) <= length: + break + for suffix in suffix_group: + if string.endswith(suffix): + return string[:-length] + return string + + +LEX_ATTRS = { + NORM: norm +} diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py new file mode 100644 index 000000000..2ff27c015 --- /dev/null +++ b/spacy/lang/hi/stop_words.py @@ -0,0 +1,177 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt + +STOP_WORDS = set(""" +अत +अपना +अपनी +अपने +अभी +अंदर +आदि +आप +इत्यादि +इन +इनका +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हों +उस +उसके +उसी +उसे +एक +एवं +एस +ऐसे +और +कई +कर +करता +करते +करना +करने +करें +कहते +कहा +का +काफ़ी +कि +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गया +घर +जब +जहाँ +जा +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसे +जीधर +जैसा +जैसे +जो +तक +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तो +था +थी +थे +दबारा +दिया +दुसरा +दूसरे +दो +द्वारा +न +नके +नहीं +ना +निहायत +नीचे +ने +पर +पहले +पूरा +पे +फिर +बनी +बही +बहुत +बाद +बाला +बिलकुल +भी +भीतर +मगर +मानो +मे +में +यदि +यह +यहाँ +यही +या +यिह +ये +रखें +रहा +रहे +ऱ्वासा +लिए +लिये +लेकिन +व +वग़ैरह +वर्ग +वह +वहाँ +वहीं +वाले +वुह +वे +सकता +सकते +सबसे +सभी +साथ +साबुत +साभ +सारा +से +सो +संग +ही +हुआ +हुई +हुए +है +हैं +हो +होता +होती +होते +होना +होने +""".split()) diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index 0fe6a9f5c..35b047900 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'hu' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - prefixes = tuple(TOKENIZER_PREFIXES) - suffixes = tuple(TOKENIZER_SUFFIXES) - infixes = tuple(TOKENIZER_INFIXES) + stop_words = STOP_WORDS + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES token_match = TOKEN_MATCH - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + lemma_lookup = LOOKUP class Hungarian(Language): diff --git a/spacy/lang/hu/examples.py b/spacy/lang/hu/examples.py new file mode 100644 index 000000000..718d7d536 --- /dev/null +++ b/spacy/lang/hu/examples.py @@ -0,0 +1,17 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
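The Hindi NORM getter above falls back to a longest-suffix-first stemmer. A toy sketch of the same control flow, using hypothetical Latin-script suffix groups in place of the Devanagari tables from lex_attrs.py:

# Hypothetical suffix groups, shortest first, mirroring _stem_suffixes above.
_suffixes = [
    ['s', 'y'],        # length 1
    ['ed', 'er'],      # length 2
    ['ing'],           # length 3
]

def norm(string):
    for group in reversed(_suffixes):          # try the longest group first
        length = len(group[0])
        if len(string) <= length:
            break                              # word too short to strip further
        for suffix in group:
            if string.endswith(suffix):
                return string[:-length]
    return string

print(norm(u'walking'), norm(u'walked'), norm(u'cats'), norm(u'go'))
# walk walk cat go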
+ +>>> from spacy.lang.hu.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Az Apple egy brit startup vásárlását tervezi 1 milliárd dollár értékben.", + "San Francisco vezetése mérlegeli a járdát használó szállító robotok betiltását.", + "London az Egyesült Királyság egy nagy városa." +] diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index e0cfa941d..2f21e73cf 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG from ...util import update_exc @@ -19,19 +18,14 @@ from ...util import update_exc class IndonesianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'id' - lex_attr_getters.update(LEX_ATTRS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - prefixes = tuple(TOKENIZER_PREFIXES) - suffixes = tuple(TOKENIZER_SUFFIXES) - infixes = tuple(TOKENIZER_INFIXES) - syntax_iterators = dict(SYNTAX_ITERATORS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + stop_words = STOP_WORDS + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES + syntax_iterators = SYNTAX_ITERATORS + lemma_lookup = LOOKUP class Indonesian(Language): diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py index f6acd8508..fb6a31f99 100644 --- a/spacy/lang/id/lex_attrs.py +++ b/spacy/lang/id/lex_attrs.py @@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta', 'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun', 'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun', - 'noniliun', 'desiliun', - ] + 'noniliun', 'desiliun'] def like_num(text): diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 7cc717cb3..6bc47ce92 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'it' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) - stop_words = set(STOP_WORDS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + stop_words = STOP_WORDS + lemma_lookup = LOOKUP class Italian(Language): diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 09ad9945e..04cc013a4 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -4,19 +4,36 @@ from __future__ import unicode_literals, print_function from ...language import Language from ...attrs import LANG from ...tokens import Doc +from ...tokenizer import Tokenizer + + +class JapaneseTokenizer(object): + def __init__(self, cls, nlp=None): + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) + try: + from janome.tokenizer 
import Tokenizer + except ImportError: + raise ImportError("The Japanese tokenizer requires the Janome " + "library: https://github.com/mocobeta/janome") + self.tokenizer = Tokenizer() + + def __call__(self, text): + words = [x.surface for x in self.tokenizer.tokenize(text)] + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + + +class JapaneseDefaults(Language.Defaults): + @classmethod + def create_tokenizer(cls, nlp=None): + return JapaneseTokenizer(cls, nlp) class Japanese(Language): lang = 'ja' + Defaults = JapaneseDefaults def make_doc(self, text): - try: - from janome.tokenizer import Tokenizer - except ImportError: - raise ImportError("The Japanese tokenizer requires the Janome library: " - "https://github.com/mocobeta/janome") - words = [x.surface for x in Tokenizer().tokenize(text)] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) + return self.tokenizer(text) __all__ = ['Japanese'] diff --git a/spacy/lang/ja/examples.py b/spacy/lang/ja/examples.py new file mode 100644 index 000000000..623609205 --- /dev/null +++ b/spacy/lang/ja/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ja.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + 'アップルがイギリスの新興企業を10億ドルで購入を検討', + '自動運転車の損害賠償責任、自動車メーカーに一定の負担を求める', + '歩道を走る自動配達ロボ、サンフランシスコ市が走行禁止を検討', + 'ロンドンはイギリスの大都市です。' +] diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py index d4beebd26..f0363b05f 100644 --- a/spacy/lang/lex_attrs.py +++ b/spacy/lang/lex_attrs.py @@ -122,22 +122,35 @@ def word_shape(text): shape.append(shape_char) return ''.join(shape) +def lower(string): return string.lower() +def prefix(string): return string[0] +def suffix(string): return string[-3:] +def cluster(string): return 0 +def is_alpha(string): return string.isalpha() +def is_digit(string): return string.isdigit() +def is_lower(string): return string.islower() +def is_space(string): return string.isspace() +def is_title(string): return string.istitle() +def is_upper(string): return string.isupper() +def is_stop(string, stops=set()): return string in stops +def is_oov(string): return True +def get_prob(string): return -20. 
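The Japanese hunk above moves the Janome wrapper into a tokenizer object created via Defaults.create_tokenizer and returning a Doc directly. A minimal sketch of the same shape with a hypothetical WhitespaceTokenizer (made up for illustration):

from spacy.lang.en import English
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    # Callable that builds a Doc directly, like JapaneseTokenizer above.
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        return Doc(self.vocab, words=words, spaces=[True] * len(words))

nlp = English()
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)   # swap in the custom tokenizer
print([t.text for t in nlp(u'What about split tokens?')])
# ['What', 'about', 'split', 'tokens?']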
LEX_ATTRS = { - attrs.LOWER: lambda string: string.lower(), - attrs.NORM: lambda string: string.lower(), - attrs.PREFIX: lambda string: string[0], - attrs.SUFFIX: lambda string: string[-3:], - attrs.CLUSTER: lambda string: 0, - attrs.IS_ALPHA: lambda string: string.isalpha(), - attrs.IS_DIGIT: lambda string: string.isdigit(), - attrs.IS_LOWER: lambda string: string.islower(), - attrs.IS_SPACE: lambda string: string.isspace(), - attrs.IS_TITLE: lambda string: string.istitle(), - attrs.IS_UPPER: lambda string: string.isupper(), - attrs.IS_STOP: lambda string: False, - attrs.IS_OOV: lambda string: True, - attrs.PROB: lambda string: -20., + attrs.LOWER: lower, + attrs.NORM: lower, + attrs.PREFIX: prefix, + attrs.SUFFIX: suffix, + attrs.CLUSTER: cluster, + attrs.IS_ALPHA: is_alpha, + attrs.IS_DIGIT: is_digit, + attrs.IS_LOWER: is_lower, + attrs.IS_SPACE: is_space, + attrs.IS_TITLE: is_title, + attrs.IS_UPPER: is_upper, + attrs.IS_STOP: is_stop, + attrs.IS_OOV: is_oov, + attrs.PROB: get_prob, attrs.LIKE_EMAIL: like_email, attrs.LIKE_NUM: like_num, attrs.IS_PUNCT: is_punct, diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index c1b4af263..4250e6809 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'nb' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Norwegian(Language): diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 7b948f295..13786a7bc 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -12,11 +13,11 @@ from ...util import update_exc, add_lookups class DutchDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'nl' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Dutch(Language): diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py new file mode 100644 index 000000000..08b1df3be --- /dev/null +++ b/spacy/lang/nl/lex_attrs.py @@ -0,0 +1,40 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + + +_num_words = set(""" +nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien +veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd +duizend miljoen miljard biljoen biljard triljoen triljard +""".split()) + +_ordinal_words = set(""" +eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde +twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste +zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste +miljardste biljoenste biljardste triljoenste triljardste +""".split()) + + +def like_num(text): + # This only does the most basic check for whether a token is a digit + # or matches one of the number words. 
In order to handle numbers like + # "drieëntwintig", more work is required. + # See this discussion: https://github.com/explosion/spaCy/pull/1177 + text = text.replace(',', '').replace('.', '') + if text.isdigit(): + return True + if text.count('/') == 1: + num, denom = text.split('/') + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = { + LIKE_NUM: like_num +} diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py index b02dda2c8..7857a16bf 100644 --- a/spacy/lang/norm_exceptions.py +++ b/spacy/lang/norm_exceptions.py @@ -31,11 +31,21 @@ BASE_NORMS = { "„": '"', "»": '"', "«": '"', + "‘‘": '"', + "’’": '"', + "?": "?", + "!": "!", + ",": ",", + ";": ";", + ":": ":", + "。": ".", + "।": ".", "…": "...", "—": "-", "–": "-", "--": "-", "---": "-", + "——": "-", "€": "$", "£": "$", "¥": "$", diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 38a240598..80011f9d8 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'pl' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Polish(Language): diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index 67539034d..2a8323597 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: 'pt' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters.update(LEX_ATTRS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + stop_words = STOP_WORDS + lemma_lookup = LOOKUP class Portuguese(Language): diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 2d3a640c5..224c105d7 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'sv' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + stop_words = STOP_WORDS + lemma_rules = LEMMA_RULES + lemma_lookup = LOOKUP class Swedish(Language): diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py new file mode 100644 index 
000000000..bedec46c8 --- /dev/null +++ b/spacy/lang/th/__init__.py @@ -0,0 +1,38 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .tag_map import TAG_MAP +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...tokens import Doc +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class ThaiDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'th' + tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) + tag_map = TAG_MAP + stop_words = STOP_WORDS + + +class Thai(Language): + lang = 'th' + Defaults = ThaiDefaults + + def make_doc(self, text): + try: + from pythainlp.tokenize import word_tokenize + except ImportError: + raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " + "https://github.com/wannaphongcom/pythainlp/") + words = [x for x in list(word_tokenize(text,"newmm"))] + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + + +__all__ = ['Thai'] diff --git a/spacy/lang/th/stop_words.py b/spacy/lang/th/stop_words.py new file mode 100644 index 000000000..e13dec984 --- /dev/null +++ b/spacy/lang/th/stop_words.py @@ -0,0 +1,62 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt +# stop words as whitespace-separated list +STOP_WORDS = set(""" +นี้ นํา นั้น นัก นอกจาก ทุก ที่สุด ที่ ทําให้ ทํา ทาง ทั้งนี้ ดัง ซึ่ง ช่วง จาก จัด จะ คือ ความ ครั้ง คง ขึ้น ของ +ขอ รับ ระหว่าง รวม ยัง มี มาก มา พร้อม พบ ผ่าน ผล บาง น่า เปิดเผย เปิด เนื่องจาก เดียวกัน เดียว เช่น เฉพาะ เข้า ถ้า +ถูก ถึง ต้อง ต่างๆ ต่าง ต่อ ตาม ตั้งแต่ ตั้ง ด้าน ด้วย อีก อาจ ออก อย่าง อะไร อยู่ อยาก หาก หลาย หลังจาก แต่ เอง เห็น +เลย เริ่ม เรา เมื่อ เพื่อ เพราะ เป็นการ เป็น หลัง หรือ หนึ่ง ส่วน ส่ง สุด สําหรับ ว่า ลง ร่วม ราย ขณะ ก่อน ก็ การ กับ กัน +กว่า กล่าว จึง ไว้ ไป ได้ ให้ ใน โดย แห่ง แล้ว และ แรก แบบ ๆ ทั้ง วัน เขา เคย ไม่ อยาก เกิน เกินๆ เกี่ยวกัน เกี่ยวกับ +เกี่ยวข้อง เกี่ยวเนื่อง เกี่ยวๆ เกือบ เกือบจะ เกือบๆ แก แก่ แก้ไข ใกล้ ใกล้ๆ ไกล ไกลๆ ขณะเดียวกัน ขณะใด ขณะใดๆ ขณะที่ ขณะนั้น ขณะนี้ ขณะหนึ่ง ขวาง +ขวางๆ ขั้น ใคร ใคร่ ใคร่จะ ใครๆ ง่าย ง่ายๆ ไง จง จด จน จนกระทั่ง จนกว่า จนขณะนี้ จนตลอด จนถึง จนทั่ว จนบัดนี้ จนเมื่อ จนแม้ จนแม้น +จรด จรดกับ จริง จริงจัง จริงๆ จริงๆจังๆ จวน จวนจะ จวนเจียน จวบ ซึ่งก็ ซึ่งก็คือ ซึ่งกัน ซึ่งกันและกัน ซึ่งได้แก่ ซึ่งๆ ณ ด้วย ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยประการฉะนี้ +ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดั่ง ดังกล่าว ดังกับ ดั่งกับ ดังกับว่า ดั่งกับว่า ดังเก่า +ดั่งเก่า ดังเคย ใดๆ ได้ ได้แก่ ได้แต่ ได้ที่ ได้มา ได้รับ ตน ตนเอง ตนฯ ตรง ตรงๆ ตลอด ตลอดกาล ตลอดกาลนาน ตลอดจน ตลอดถึง ตลอดทั้ง +ตลอดทั่ว ตลอดทั่วถึง ตลอดทั่วทั้ง ตลอดปี ตลอดไป ตลอดมา ตลอดระยะเวลา ตลอดวัน ตลอดเวลา ตลอดศก ต่อ ต่อกัน ถึงแก่ ถึงจะ ถึงบัดนั้น ถึงบัดนี้ +ถึงเมื่อ ถึงเมื่อใด ถึงเมื่อไร ถึงแม้ ถึงแม้จะ ถึงแม้ว่า ถึงอย่างไร ถือ ถือว่า ถูกต้อง ถูกๆ เถอะ เถิด ทรง ทว่า ทั้งคน ทั้งตัว ทั้งที ทั้งที่ ทั้งนั้น ทั้งนั้นด้วย ทั้งนั้นเพราะ +นอก นอกจากที่ นอกจากนั้น นอกจากนี้ นอกจากว่า นอกนั้น นอกเหนือ นอกเหนือจาก น้อย น้อยกว่า น้อยๆ นะ น่ะ นักๆ นั่น นั่นไง นั่นเป็น นั่นแหละ +นั่นเอง นั้นๆ นับ นับจากนั้น นับจากนี้ นับตั้งแต่ นับแต่ นับแต่ที่ นับแต่นั้น เป็นต้น เป็นต้นไป เป็นต้นมา เป็นแต่ เป็นแต่เพียง เป็นที เป็นที่ เป็นที่สุด เป็นเพราะ 
+เป็นเพราะว่า เป็นเพียง เป็นเพียงว่า เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอันว่า เป็นอันๆ เป็นอาทิ เป็นๆ เปลี่ยน เปลี่ยนแปลง เปิด เปิดเผย ไป่ ผ่าน ผ่านๆ +ผิด ผิดๆ ผู้ เพียงเพื่อ เพียงไร เพียงไหน เพื่อที่ เพื่อที่จะ เพื่อว่า เพื่อให้ ภาค ภาคฯ ภาย ภายใต้ ภายนอก ภายใน ภายภาค ภายภาคหน้า ภายหน้า ภายหลัง +มอง มองว่า มัก มักจะ มัน มันๆ มั้ย มั้ยนะ มั้ยนั่น มั้ยเนี่ย มั้ยล่ะ ยืนนาน ยืนยง ยืนยัน ยืนยาว เยอะ เยอะแยะ เยอะๆ แยะ แยะๆ รวด รวดเร็ว ร่วม รวมกัน ร่วมกัน +รวมด้วย ร่วมด้วย รวมถึง รวมทั้ง ร่วมมือ รวมๆ ระยะ ระยะๆ ระหว่าง รับรอง รึ รึว่า รือ รือว่า สิ้นกาลนาน สืบเนื่อง สุดๆ สู่ สูง สูงกว่า สูงส่ง สูงสุด สูงๆ เสมือนกับ +เสมือนว่า เสร็จ เสร็จกัน เสร็จแล้ว เสร็จสมบูรณ์ เสร็จสิ้น เสีย เสียก่อน เสียจน เสียจนกระทั่ง เสียจนถึง เสียด้วย เสียนั่น เสียนั่นเอง เสียนี่ เสียนี่กระไร เสียยิ่ง +เสียยิ่งนัก เสียแล้ว ใหญ่ๆ ให้ดี ให้แด่ ให้ไป ใหม่ ให้มา ใหม่ๆ ไหน ไหนๆ อดีต อนึ่ง อย่าง อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น +อย่างนี้ อย่างโน้น ก็คือ ก็แค่ ก็จะ ก็ดี ก็ได้ ก็ต่อเมื่อ ก็ตาม ก็ตามแต่ ก็ตามที ก็แล้วแต่ กระทั่ง กระทำ กระนั้น กระผม กลับ กล่าวคือ กลุ่ม กลุ่มก้อน +กลุ่มๆ กว้าง กว้างขวาง กว้างๆ ก่อนหน้า ก่อนหน้านี้ ก่อนๆ กันดีกว่า กันดีไหม กันเถอะ กันนะ กันและกัน กันไหม กันเอง กำลัง กำลังจะ กำหนด กู เก็บ +เกิด เกี่ยวข้อง แก่ แก้ไข ใกล้ ใกล้ๆ ข้า ข้าง ข้างเคียง ข้างต้น ข้างบน ข้างล่าง ข้างๆ ขาด ข้าพเจ้า ข้าฯ เข้าใจ เขียน คงจะ คงอยู่ ครบ ครบครัน ครบถ้วน +ครั้งกระนั้น ครั้งก่อน ครั้งครา ครั้งคราว ครั้งใด ครั้งที่ ครั้งนั้น ครั้งนี้ ครั้งละ ครั้งหนึ่ง ครั้งหลัง ครั้งหลังสุด ครั้งไหน ครั้งๆ ครัน ครับ ครา คราใด คราที่ ครานั้น ครานี้ คราหนึ่ง +คราไหน คราว คราวก่อน คราวใด คราวที่ คราวนั้น คราวนี้ คราวโน้น คราวละ คราวหน้า คราวหนึ่ง คราวหลัง คราวไหน คราวๆ คล้าย คล้ายกัน คล้ายกันกับ +คล้ายกับ คล้ายกับว่า คล้ายว่า ควร ค่อน ค่อนข้าง ค่อนข้างจะ ค่อยไปทาง ค่อนมาทาง ค่อย ค่อยๆ คะ ค่ะ คำ คิด คิดว่า คุณ คุณๆ +เคยๆ แค่ แค่จะ แค่นั้น แค่นี้ แค่เพียง แค่ว่า แค่ไหน ใคร่ ใคร่จะ ง่าย ง่ายๆ จนกว่า จนแม้ จนแม้น จังๆ จวบกับ จวบจน จ้ะ จ๊ะ จะได้ จัง จัดการ จัดงาน จัดแจง +จัดตั้ง จัดทำ จัดหา จัดให้ จับ จ้า จ๋า จากนั้น จากนี้ จากนี้ไป จำ จำเป็น จำพวก จึงจะ จึงเป็น จู่ๆ ฉะนั้น ฉะนี้ ฉัน เฉกเช่น เฉย เฉยๆ ไฉน ช่วงก่อน +ช่วงต่อไป ช่วงถัดไป ช่วงท้าย ช่วงที่ ช่วงนั้น ช่วงนี้ ช่วงระหว่าง ช่วงแรก ช่วงหน้า ช่วงหลัง ช่วงๆ ช่วย ช้า ช้านาน ชาว ช้าๆ เช่นก่อน เช่นกัน เช่นเคย +เช่นดัง เช่นดังก่อน เช่นดังเก่า เช่นดังที่ เช่นดังว่า เช่นเดียวกัน เช่นเดียวกับ เช่นใด เช่นที่ เช่นที่เคย เช่นที่ว่า เช่นนั้น เช่นนั้นเอง เช่นนี้ เช่นเมื่อ เช่นไร เชื่อ +เชื่อถือ เชื่อมั่น เชื่อว่า ใช่ ใช่ไหม ใช้ ซะ ซะก่อน ซะจน ซะจนกระทั่ง ซะจนถึง ซึ่งได้แก่ ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น +ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดังกล่าว ดังกับว่า ดั่งกับว่า ดังเก่า ดั่งเก่า ดั่งเคย ต่างก็ ต่างหาก ตามด้วย ตามแต่ ตามที่ +ตามๆ เต็มไปด้วย เต็มไปหมด เต็มๆ แต่ก็ แต่ก่อน แต่จะ แต่เดิม แต่ต้อง แต่ถ้า แต่ทว่า แต่ที่ แต่นั้น แต่เพียง แต่เมื่อ แต่ไร แต่ละ แต่ว่า แต่ไหน แต่อย่างใด โต +โตๆ ใต้ ถ้าจะ ถ้าหาก ถึงแก่ ถึงแม้ ถึงแม้จะ ถึงแม้ว่า ถึงอย่างไร ถือว่า ถูกต้อง ทว่า ทั้งนั้นด้วย ทั้งปวง ทั้งเป็น ทั้งมวล ทั้งสิ้น ทั้งหมด ทั้งหลาย ทั้งๆ ทัน +ทันใดนั้น ทันที ทันทีทันใด ทั่ว ทำไม ทำไร ทำให้ ทำๆ ที ที่จริง ที่ซึ่ง ทีเดียว ทีใด ที่ใด ที่ได้ ทีเถอะ ที่แท้ ที่แท้จริง ที่นั้น ที่นี้ ทีไร ทีละ ที่ละ +ที่แล้ว ที่ว่า ที่แห่งนั้น ที่ไหน ทีๆ ที่ๆ ทุกคน ทุกครั้ง ทุกครา ทุกคราว ทุกชิ้น ทุกตัว ทุกทาง ทุกที ทุกที่ ทุกเมื่อ ทุกวัน ทุกวันนี้ ทุกสิ่ง ทุกหน ทุกแห่ง ทุกอย่าง +ทุกอัน ทุกๆ เท่า เท่ากัน เท่ากับ เท่าใด เท่าที่ เท่านั้น เท่านี้ เท่าไร เท่าไหร่ แท้ แท้จริง เธอ นอกจากว่า น้อย น้อยกว่า น้อยๆ น่ะ นั้นไว นับแต่นี้ นาง +นางสาว 
น่าจะ นาน นานๆ นาย นำ นำพา นำมา นิด นิดหน่อย นิดๆ นี่ นี่ไง นี่นา นี่แน่ะ นี่แหละ นี้แหล่ นี่เอง นี้เอง นู่น นู้น เน้น เนี่ย +เนี่ยเอง ในช่วง ในที่ ในเมื่อ ในระหว่าง บน บอก บอกแล้ว บอกว่า บ่อย บ่อยกว่า บ่อยครั้ง บ่อยๆ บัดดล บัดเดี๋ยวนี้ บัดนั้น บัดนี้ บ้าง บางกว่า +บางขณะ บางครั้ง บางครา บางคราว บางที บางที่ บางแห่ง บางๆ ปฏิบัติ ประกอบ ประการ ประการฉะนี้ ประการใด ประการหนึ่ง ประมาณ ประสบ ปรับ +ปรากฏ ปรากฏว่า ปัจจุบัน ปิด เป็นด้วย เป็นดัง เป็นต้น เป็นแต่ เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอาทิ ผ่านๆ ผู้ ผู้ใด เผื่อ เผื่อจะ เผื่อที่ เผื่อว่า ฝ่าย +ฝ่ายใด พบว่า พยายาม พร้อมกัน พร้อมกับ พร้อมด้วย พร้อมทั้ง พร้อมที่ พร้อมเพียง พวก พวกกัน พวกกู พวกแก พวกเขา พวกคุณ พวกฉัน พวกท่าน +พวกที่ พวกเธอ พวกนั้น พวกนี้ พวกนู้น พวกโน้น พวกมัน พวกมึง พอ พอกัน พอควร พอจะ พอดี พอตัว พอที พอที่ พอเพียง พอแล้ว พอสม พอสมควร +พอเหมาะ พอๆ พา พึง พึ่ง พื้นๆ พูด เพราะฉะนั้น เพราะว่า เพิ่ง เพิ่งจะ เพิ่ม เพิ่มเติม เพียง เพียงแค่ เพียงใด เพียงแต่ เพียงพอ เพียงเพราะ +เพื่อว่า เพื่อให้ ภายใต้ มองว่า มั๊ย มากกว่า มากมาย มิ มิฉะนั้น มิใช่ มิได้ มีแต่ มึง มุ่ง มุ่งเน้น มุ่งหมาย เมื่อก่อน เมื่อครั้ง เมื่อครั้งก่อน +เมื่อคราวก่อน เมื่อคราวที่ เมื่อคราว เมื่อคืน เมื่อเช้า เมื่อใด เมื่อนั้น เมื่อนี้ เมื่อเย็น เมื่อไร เมื่อวันวาน เมื่อวาน เมื่อไหร่ แม้ แม้กระทั่ง แม้แต่ แม้นว่า แม้ว่า +ไม่ค่อย ไม่ค่อยจะ ไม่ค่อยเป็น ไม่ใช่ ไม่เป็นไร ไม่ว่า ยก ยกให้ ยอม ยอมรับ ย่อม ย่อย ยังคง ยังงั้น ยังงี้ ยังโง้น ยังไง ยังจะ ยังแต่ ยาก +ยาว ยาวนาน ยิ่ง ยิ่งกว่า ยิ่งขึ้น ยิ่งขึ้นไป ยิ่งจน ยิ่งจะ ยิ่งนัก ยิ่งเมื่อ ยิ่งแล้ว ยิ่งใหญ่ ร่วมกัน รวมด้วย ร่วมด้วย รือว่า เร็ว เร็วๆ เราๆ เรียก เรียบ เรื่อย +เรื่อยๆ ไร ล้วน ล้วนจน ล้วนแต่ ละ ล่าสุด เล็ก เล็กน้อย เล็กๆ เล่าว่า แล้วกัน แล้วแต่ แล้วเสร็จ วันใด วันนั้น วันนี้ วันไหน สบาย สมัย สมัยก่อน +สมัยนั้น สมัยนี้ สมัยโน้น ส่วนเกิน ส่วนด้อย ส่วนดี ส่วนใด ส่วนที่ ส่วนน้อย ส่วนนั้น ส่วนมาก ส่วนใหญ่ สั้น สั้นๆ สามารถ สำคัญ สิ่ง +สิ่งใด สิ่งนั้น สิ่งนี้ สิ่งไหน สิ้น เสร็จแล้ว เสียด้วย เสียแล้ว แสดง แสดงว่า หน หนอ หนอย หน่อย หมด หมดกัน หมดสิ้น หรือไง หรือเปล่า หรือไม่ หรือยัง +หรือไร หากแม้ หากแม้น หากแม้นว่า หากว่า หาความ หาใช่ หารือ เหตุ เหตุผล เหตุนั้น เหตุนี้ เหตุไร เห็นแก่ เห็นควร เห็นจะ เห็นว่า เหลือ เหลือเกิน เหล่า +เหล่านั้น เหล่านี้ แห่งใด แห่งนั้น แห่งนี้ แห่งโน้น แห่งไหน แหละ ให้แก่ ใหญ่ ใหญ่โต อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น อย่างนี้ +อย่างโน้น อย่างมาก อย่างยิ่ง อย่างไร อย่างไรก็ อย่างไรก็ได้ อย่างไรเสีย อย่างละ อย่างหนึ่ง อย่างไหน อย่างๆ อัน อันจะ อันใด อันได้แก่ อันที่ +อันที่จริง อันที่จะ อันเนื่องมาจาก อันละ อันไหน อันๆ อาจจะ อาจเป็น อาจเป็นด้วย อื่น อื่นๆ เอ็ง เอา ฯ ฯล ฯลฯ +""".split()) \ No newline at end of file diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py new file mode 100644 index 000000000..570871820 --- /dev/null +++ b/spacy/lang/th/tag_map.py @@ -0,0 +1,82 @@ +# encoding: utf8 +# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) +from __future__ import unicode_literals + +from ...symbols import * + +TAG_MAP = { + #NOUN + "NOUN": {POS: NOUN}, + "NCMN": {POS: NOUN}, + "NTTL": {POS: NOUN}, + "CNIT": {POS: NOUN}, + "CLTV": {POS: NOUN}, + "CMTR": {POS: NOUN}, + "CFQC": {POS: NOUN}, + "CVBL": {POS: NOUN}, + #PRON + "PRON": {POS: PRON}, + "NPRP": {POS: PRON}, + # ADJ + "ADJ": {POS: ADJ}, + "NONM": {POS: ADJ}, + "VATT": {POS: ADJ}, + "DONM": {POS: ADJ}, + # ADV + "ADV": {POS: ADV}, + "ADVN": {POS: ADV}, + "ADVI": {POS: ADV}, + "ADVP": {POS: ADV}, + "ADVS": {POS: ADV}, + # INT + "INT": {POS: INTJ}, + # PRON + "PROPN": {POS: PROPN}, + "PPRS": {POS: PROPN}, + 
"PDMN": {POS: PROPN}, + "PNTR": {POS: PROPN}, + # DET + "DET": {POS: DET}, + "DDAN": {POS: DET}, + "DDAC": {POS: DET}, + "DDBQ": {POS: DET}, + "DDAQ": {POS: DET}, + "DIAC": {POS: DET}, + "DIBQ": {POS: DET}, + "DIAQ": {POS: DET}, + "DCNM": {POS: DET}, + # NUM + "NUM": {POS: NUM}, + "NCNM": {POS: NUM}, + "NLBL": {POS: NUM}, + "DCNM": {POS: NUM}, + # AUX + "AUX": {POS: AUX}, + "XVBM": {POS: AUX}, + "XVAM": {POS: AUX}, + "XVMM": {POS: AUX}, + "XVBB": {POS: AUX}, + "XVAE": {POS: AUX}, + # ADP + "ADP": {POS: ADP}, + "RPRE": {POS: ADP}, + # CCONJ + "CCONJ": {POS: CCONJ}, + "JCRG": {POS: CCONJ}, + # SCONJ + "SCONJ": {POS: SCONJ}, + "PREL": {POS: SCONJ}, + "JSBR": {POS: SCONJ}, + "JCMP": {POS: SCONJ}, + # PART + "PART": {POS: PART}, + "FIXN": {POS: PART}, + "FIXV": {POS: PART}, + "EAFF": {POS: PART}, + "AITT": {POS: PART}, + "NEG": {POS: PART}, + # PUNCT + "PUNCT": {POS: PUNCT}, + "PUNC": {POS: PUNCT}, + "_SP": {POS: SPACE} +} diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py new file mode 100644 index 000000000..c31595893 --- /dev/null +++ b/spacy/lang/th/tokenizer_exceptions.py @@ -0,0 +1,43 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import * + +TOKENIZER_EXCEPTIONS = { + "ม.ค.": [ + {ORTH: "ม.ค.", LEMMA: "มกราคม"} + ], + "ก.พ.": [ + {ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"} + ], + "มี.ค.": [ + {ORTH: "มี.ค.", LEMMA: "มีนาคม"} + ], + "เม.ย.": [ + {ORTH: "เม.ย.", LEMMA: "เมษายน"} + ], + "พ.ค.": [ + {ORTH: "พ.ค.", LEMMA: "พฤษภาคม"} + ], + "มิ.ย.": [ + {ORTH: "มิ.ย.", LEMMA: "มิถุนายน"} + ], + "ก.ค.": [ + {ORTH: "ก.ค.", LEMMA: "กรกฎาคม"} + ], + "ส.ค.": [ + {ORTH: "ส.ค.", LEMMA: "สิงหาคม"} + ], + "ก.ย.": [ + {ORTH: "ก.ย.", LEMMA: "กันยายน"} + ], + "ต.ค.": [ + {ORTH: "ต.ค.", LEMMA: "ตุลาคม"} + ], + "พ.ย.": [ + {ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"} + ], + "ธ.ค.": [ + {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"} + ] +} diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 6a7c09a44..73ad88d08 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -36,11 +36,11 @@ URL_PATTERN = ( r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"|" # host name - r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)" + r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)" # domain name - r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*" + r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*" # TLD identifier - r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" + r"(?:\.(?:[a-z]{2,}))" r")" # port number r"(?::\d{2,5})?" 
diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index dc63ee33f..017f55ecc 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'xx' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 3f68336f8..6246fa7ea 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -10,12 +10,12 @@ class Chinese(Language): def make_doc(self, text): try: - from jieba + import jieba except ImportError: raise ImportError("The Chinese tokenizer requires the Jieba library: " "https://github.com/fxsjy/jieba") - words = list(jieba.cut(text, cut_all=True)) - words=[x for x in words if x] + words = list(jieba.cut(text, cut_all=False)) + words = [x for x in words if x] return Doc(self.vocab, words=words, spaces=[False]*len(words)) diff --git a/spacy/lang/zh/examples.py b/spacy/lang/zh/examples.py new file mode 100644 index 000000000..5e8a36119 --- /dev/null +++ b/spacy/lang/zh/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.zh.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "蘋果公司正考量用一億元買下英國的新創公司", + "自駕車將保險責任歸屬轉移至製造商", + "舊金山考慮禁止送貨機器人在人行道上行駛", + "倫敦是英國的大城市" +] diff --git a/spacy/language.py b/spacy/language.py index 66b42ff94..1ce74b265 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,52 +1,48 @@ # coding: utf8 from __future__ import absolute_import, unicode_literals -from contextlib import contextmanager -import dill -import numpy -from thinc.neural import Model -from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.optimizers import Adam, SGD import random import ujson -from collections import OrderedDict import itertools +import weakref +import functools +from collections import OrderedDict +from contextlib import contextmanager +from copy import copy +from thinc.neural import Model +from thinc.neural.optimizers import Adam from .tokenizer import Tokenizer from .vocab import Vocab -from .tagger import Tagger from .lemmatizer import Lemmatizer -from .syntax.parser import get_templates -from .syntax import nonproj - -from .pipeline import NeuralDependencyParser, EntityRecognizer -from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer -from .pipeline import NeuralLabeller -from .pipeline import SimilarityHook -from .pipeline import TextCategorizer -from . import about - +from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer +from .pipeline import SimilarityHook, TextCategorizer from .compat import json_dumps, izip +from .scorer import Scorer +from ._ml import link_vectors_to_models from .attrs import IS_STOP -from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES +from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES +from .lang.punctuation import TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP -from .lang.lex_attrs import LEX_ATTRS +from .lang.lex_attrs import LEX_ATTRS, is_stop from . import util -from .scorer import Scorer +from . 
import about class BaseDefaults(object): @classmethod def create_lemmatizer(cls, nlp=None): - return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules) + return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules, + cls.lemma_lookup) @classmethod def create_vocab(cls, nlp=None): lemmatizer = cls.create_lemmatizer(nlp) lex_attr_getters = dict(cls.lex_attr_getters) # This is messy, but it's the minimal working fix to Issue #639. - lex_attr_getters[IS_STOP] = lambda string: string.lower() in cls.stop_words + lex_attr_getters[IS_STOP] = functools.partial(is_stop, + stops=cls.stop_words) vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=cls.tag_map, lemmatizer=lemmatizer) for tag_str, exc in cls.morph_rules.items(): @@ -58,83 +54,31 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match - prefix_search = util.compile_prefix_regex(cls.prefixes).search \ - if cls.prefixes else None - suffix_search = util.compile_suffix_regex(cls.suffixes).search \ - if cls.suffixes else None - infix_finditer = util.compile_infix_regex(cls.infixes).finditer \ - if cls.infixes else None + prefix_search = (util.compile_prefix_regex(cls.prefixes).search + if cls.prefixes else None) + suffix_search = (util.compile_suffix_regex(cls.suffixes).search + if cls.suffixes else None) + infix_finditer = (util.compile_infix_regex(cls.infixes).finditer + if cls.infixes else None) vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) return Tokenizer(vocab, rules=rules, - prefix_search=prefix_search, suffix_search=suffix_search, - infix_finditer=infix_finditer, token_match=token_match) - - @classmethod - def create_tagger(cls, nlp=None, **cfg): - if nlp is None: - return NeuralTagger(cls.create_vocab(nlp), **cfg) - else: - return NeuralTagger(nlp.vocab, **cfg) - - @classmethod - def create_parser(cls, nlp=None, **cfg): - if nlp is None: - return NeuralDependencyParser(cls.create_vocab(nlp), **cfg) - else: - return NeuralDependencyParser(nlp.vocab, **cfg) - - @classmethod - def create_entity(cls, nlp=None, **cfg): - if nlp is None: - return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg) - else: - return NeuralEntityRecognizer(nlp.vocab, **cfg) - - @classmethod - def create_pipeline(cls, nlp=None, disable=tuple()): - meta = nlp.meta if nlp is not None else {} - # Resolve strings, like "cnn", "lstm", etc - pipeline = [] - for entry in meta.get('pipeline', []): - if entry in disable or getattr(entry, 'name', entry) in disable: - continue - factory = cls.Defaults.factories[entry] - pipeline.append(factory(nlp, **meta.get(entry, {}))) - return pipeline - - factories = { - 'make_doc': create_tokenizer, - 'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], - 'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], - 'parser': lambda nlp, **cfg: [ - NeuralDependencyParser(nlp.vocab, **cfg), - nonproj.deprojectivize], - 'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], - 'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)], - 'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)], - # Temporary compatibility -- delete after pivot - 'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], - 'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], - 'dependencies': lambda nlp, **cfg: [ - NeuralDependencyParser(nlp.vocab, **cfg), - nonproj.deprojectivize, - ], - 'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], 
- } + prefix_search=prefix_search, + suffix_search=suffix_search, + infix_finditer=infix_finditer, + token_match=token_match) + pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] token_match = TOKEN_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) tag_map = dict(TAG_MAP) tokenizer_exceptions = {} - parser_features = get_templates('parser') - entity_features = get_templates('ner') - tagger_features = Tagger.feature_templates # TODO -- fix this stop_words = set() lemma_rules = {} lemma_exc = {} lemma_index = {} + lemma_lookup = {} morph_rules = {} lex_attr_getters = LEX_ATTRS syntax_iterators = {} @@ -151,8 +95,17 @@ class Language(object): Defaults = BaseDefaults lang = None - def __init__(self, vocab=True, make_doc=True, pipeline=None, - meta={}, disable=tuple(), **kwargs): + factories = { + 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), + 'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg), + 'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), + 'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), + 'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), + 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), + 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) + } + + def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): """Initialise a Language object. vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via @@ -170,6 +123,7 @@ class Language(object): RETURNS (Language): The newly constructed object. """ self._meta = dict(meta) + self._path = None if vocab is True: factory = self.Defaults.create_vocab vocab = factory(self, **meta.get('vocab', {})) @@ -178,34 +132,21 @@ class Language(object): factory = self.Defaults.create_tokenizer make_doc = factory(self, **meta.get('tokenizer', {})) self.tokenizer = make_doc - if pipeline is True: - self.pipeline = self.Defaults.create_pipeline(self, disable) - elif pipeline: - # Careful not to do getattr(p, 'name', None) here - # If we had disable=[None], we'd disable everything! 
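Taken together with the class-level `factories` table above, pipeline construction is now explicit instead of being driven by `Defaults.create_pipeline`. A rough usage sketch against the `create_pipe`/`add_pipe`/`pipe_names` API defined further down in this diff (the custom component and its name are made up for illustration):

    >>> from spacy.lang.en import English
    >>> nlp = English()                          # the pipeline now starts out empty
    >>> def print_length(doc):
    ...     print('tokens:', len(doc))
    ...     return doc
    >>> nlp.add_pipe(print_length, name='print_length', last=True)
    >>> nlp.pipe_names
    ['print_length']
    >>> doc = nlp(u'This is a sentence.')
    tokens: 5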
- self.pipeline = [p for p in pipeline - if p not in disable - and getattr(p, 'name', p) not in disable] - # Resolve strings, like "cnn", "lstm", etc - for i, entry in enumerate(self.pipeline): - if entry in self.Defaults.factories: - factory = self.Defaults.factories[entry] - self.pipeline[i] = factory(self, **meta.get(entry, {})) - else: - self.pipeline = [] - flat_list = [] - for pipe in self.pipeline: - if isinstance(pipe, list): - flat_list.extend(pipe) - else: - flat_list.append(pipe) - self.pipeline = flat_list + self.pipeline = [] self._optimizer = None + def __reduce__(self): + bytes_data = self.to_bytes(vocab=False) + return (unpickle_language, (self.vocab, self.meta, bytes_data)) + + @property + def path(self): + return self._path + @property def meta(self): self._meta.setdefault('lang', self.vocab.lang) - self._meta.setdefault('name', '') + self._meta.setdefault('name', 'model') self._meta.setdefault('version', '0.0.0') self._meta.setdefault('spacy_version', about.__version__) self._meta.setdefault('description', '') @@ -213,11 +154,9 @@ class Language(object): self._meta.setdefault('email', '') self._meta.setdefault('url', '') self._meta.setdefault('license', '') - pipeline = [] - for component in self.pipeline: - if hasattr(component, 'name'): - pipeline.append(component.name) - self._meta['pipeline'] = pipeline + self._meta['vectors'] = {'width': self.vocab.vectors_length, + 'entries': len(self.vocab.vectors)} + self._meta['pipeline'] = self.pipe_names return self._meta @meta.setter @@ -227,34 +166,154 @@ class Language(object): # Conveniences to access pipeline components @property def tensorizer(self): - return self.get_component('tensorizer') + return self.get_pipe('tensorizer') @property def tagger(self): - return self.get_component('tagger') + return self.get_pipe('tagger') @property def parser(self): - return self.get_component('parser') + return self.get_pipe('parser') @property def entity(self): - return self.get_component('ner') + return self.get_pipe('ner') @property def matcher(self): - return self.get_component('matcher') + return self.get_pipe('matcher') - def get_component(self, name): - if self.pipeline in (True, None): - return None - for proc in self.pipeline: - if hasattr(proc, 'name') and proc.name.endswith(name): - return proc - return None + @property + def pipe_names(self): + """Get names of available pipeline components. + + RETURNS (list): List of component name strings, in order. + """ + return [pipe_name for pipe_name, _ in self.pipeline] + + def get_pipe(self, name): + """Get a pipeline component for a given component name. + + name (unicode): Name of pipeline component to get. + RETURNS (callable): The pipeline component. + """ + for pipe_name, component in self.pipeline: + if pipe_name == name: + return component + msg = "No component '{}' found in pipeline. Available names: {}" + raise KeyError(msg.format(name, self.pipe_names)) + + def create_pipe(self, name, config=dict()): + """Create a pipeline component from a factory. + + name (unicode): Factory name to look up in `Language.factories`. + config (dict): Configuration parameters to initialise component. + RETURNS (callable): Pipeline component. + """ + if name not in self.factories: + raise KeyError("Can't find factory for '{}'.".format(name)) + factory = self.factories[name] + return factory(self, **config) + + def add_pipe(self, component, name=None, before=None, after=None, + first=None, last=None): + """Add a component to the processing pipeline. 
Valid components are + callables that take a `Doc` object, modify it and return it. Only one + of before/after/first/last can be set. Default behaviour is "last". + + component (callable): The pipeline component. + name (unicode): Name of pipeline component. Overwrites existing + component.name attribute if available. If no name is set and + the component exposes no name attribute, component.__name__ is + used. An error is raised if a name already exists in the pipeline. + before (unicode): Component name to insert component directly before. + after (unicode): Component name to insert component directly after. + first (bool): Insert component first / not first in the pipeline. + last (bool): Insert component last / not last in the pipeline. + + EXAMPLE: + >>> nlp.add_pipe(component, before='ner') + >>> nlp.add_pipe(component, name='custom_name', last=True) + """ + if name is None: + if hasattr(component, 'name'): + name = component.name + elif hasattr(component, '__name__'): + name = component.__name__ + elif (hasattr(component, '__class__') and + hasattr(component.__class__, '__name__')): + name = component.__class__.__name__ + else: + name = repr(component) + if name in self.pipe_names: + raise ValueError("'{}' already exists in pipeline.".format(name)) + if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: + msg = ("Invalid constraints. You can only set one of the " + "following: before, after, first, last.") + raise ValueError(msg) + pipe = (name, component) + if last or not any([first, before, after]): + self.pipeline.append(pipe) + elif first: + self.pipeline.insert(0, pipe) + elif before and before in self.pipe_names: + self.pipeline.insert(self.pipe_names.index(before), pipe) + elif after and after in self.pipe_names: + self.pipeline.insert(self.pipe_names.index(after), pipe) + else: + msg = "Can't find '{}' in pipeline. Available names: {}" + unfound = before or after + raise ValueError(msg.format(unfound, self.pipe_names)) + + def has_pipe(self, name): + """Check if a component name is present in the pipeline. Equivalent to + `name in nlp.pipe_names`. + + name (unicode): Name of the component. + RETURNS (bool): Whether a component of the name exists in the pipeline. + """ + return name in self.pipe_names + + def replace_pipe(self, name, component): + """Replace a component in the pipeline. + + name (unicode): Name of the component to replace. + component (callable): Pipeline component. + """ + if name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(name, self.pipe_names)) + self.pipeline[self.pipe_names.index(name)] = (name, component) + + def rename_pipe(self, old_name, new_name): + """Rename a pipeline component. + + old_name (unicode): Name of the component to rename. + new_name (unicode): New name of the component. + """ + if old_name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(old_name, self.pipe_names)) + if new_name in self.pipe_names: + msg = "'{}' already exists in pipeline. Existing names: {}" + raise ValueError(msg.format(new_name, self.pipe_names)) + i = self.pipe_names.index(old_name) + self.pipeline[i] = (new_name, self.pipeline[i][1]) + + def remove_pipe(self, name): + """Remove a component from the pipeline. + + name (unicode): Name of the component to remove. + RETURNS (tuple): A `(name, component)` tuple of the removed component. + """ + if name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. 
Available names: {}" + raise ValueError(msg.format(name, self.pipe_names)) + return self.pipeline.pop(self.pipe_names.index(name)) def __call__(self, text, disable=[]): - """'Apply the pipeline to some text. The text can span multiple sentences, + """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. @@ -268,18 +327,36 @@ class Language(object): ('An', 'NN') """ doc = self.make_doc(text) - for proc in self.pipeline: - name = getattr(proc, 'name', None) + for name, proc in self.pipeline: if name in disable: continue doc = proc(doc) return doc + def disable_pipes(self, *names): + """Disable one or more pipeline components. If used as a context + manager, the pipeline will be restored to the initial state at the end + of the block. Otherwise, a DisabledPipes object is returned, that has + a `.restore()` method you can use to undo your changes. + + EXAMPLE: + >>> nlp.add_pipe('parser') + >>> nlp.add_pipe('tagger') + >>> with nlp.disable_pipes('parser', 'tagger'): + >>> assert not nlp.has_pipe('parser') + >>> assert nlp.has_pipe('parser') + >>> disabled = nlp.disable_pipes('parser') + >>> assert len(disabled) == 1 + >>> assert not nlp.has_pipe('parser') + >>> disabled.restore() + >>> assert nlp.has_pipe('parser') + """ + return DisabledPipes(self, *names) + def make_doc(self, text): return self.tokenizer(text) - def update(self, docs, golds, drop=0., sgd=None, losses=None, - update_shared=False): + def update(self, docs, golds, drop=0., sgd=None, losses=None): """Update the models in the pipeline. docs (iterable): A batch of `Doc` objects. @@ -289,46 +366,33 @@ class Language(object): RETURNS (dict): Results from the update. EXAMPLE: - >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): + >>> with nlp.begin_training(gold) as (trainer, optimizer): >>> for epoch in trainer.epochs(gold): >>> for docs, golds in epoch: >>> state = nlp.update(docs, golds, sgd=optimizer) """ if len(docs) != len(golds): raise IndexError("Update expects same number of docs and golds " - "Got: %d, %d" % (len(docs), len(golds))) + "Got: %d, %d" % (len(docs), len(golds))) if len(docs) == 0: return if sgd is None: if self._optimizer is None: self._optimizer = Adam(Model.ops, 0.001) sgd = self._optimizer - tok2vec = self.pipeline[0] - feats = tok2vec.doc2feats(docs) grads = {} + def get_grads(W, dW, key=None): grads[key] = (W, dW) - pipes = list(self.pipeline[1:]) + + pipes = list(self.pipeline) random.shuffle(pipes) - tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) - all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] - for proc in pipes: + for name, proc in pipes: if not hasattr(proc, 'update'): continue - d_tokvecses = proc.update((docs, tokvecses), golds, - drop=drop, sgd=get_grads, losses=losses) - if update_shared and d_tokvecses is not None: - for i, d_tv in enumerate(d_tokvecses): - all_d_tokvecses[i] += d_tv - if update_shared and bp_tokvecses is not None: - bp_tokvecses(all_d_tokvecses, sgd=sgd) + proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) - # Clear the tensor variable, to free GPU memory. - # If we don't do this, the memory leak gets pretty - # bad, because we may be holding part of a batch. - for doc in docs: - doc.tensor = None def preprocess_gold(self, docs_golds): """Can be called before training to pre-process gold data. 
By default, @@ -337,43 +401,56 @@ class Language(object): docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. """ - for proc in self.pipeline: + for name, proc in self.pipeline: if hasattr(proc, 'preprocess_gold'): docs_golds = proc.preprocess_gold(docs_golds) for doc, gold in docs_golds: yield doc, gold - def begin_training(self, get_gold_tuples, **cfg): + def resume_training(self, **cfg): + if cfg.get('device', -1) >= 0: + device = util.use_gpu(cfg['device']) + if self.vocab.vectors.data.shape[1] >= 1: + self.vocab.vectors.data = Model.ops.asarray( + self.vocab.vectors.data) + else: + device = None + learn_rate = util.env_opt('learn_rate', 0.001) + beta1 = util.env_opt('optimizer_B1', 0.9) + beta2 = util.env_opt('optimizer_B2', 0.999) + eps = util.env_opt('optimizer_eps', 1e-08) + L2 = util.env_opt('L2_penalty', 1e-6) + max_grad_norm = util.env_opt('grad_norm_clip', 1.) + self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, + beta2=beta2, eps=eps) + self._optimizer.max_grad_norm = max_grad_norm + self._optimizer.device = device + return self._optimizer + + def begin_training(self, get_gold_tuples=None, **cfg): """Allocate models, pre-process training data and acquire a trainer and optimizer. Used as a contextmanager. - gold_tuples (iterable): Gold-standard training data. + get_gold_tuples (function): Function returning gold data **cfg: Config parameters. - YIELDS (tuple): A trainer and an optimizer. - - EXAMPLE: - >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): - >>> for epoch in trainer.epochs(gold): - >>> for docs, golds in epoch: - >>> state = nlp.update(docs, golds, sgd=optimizer) + RETURNS: An optimizer """ - if self.parser: - self.pipeline.append(NeuralLabeller(self.vocab)) # Populate vocab - for _, annots_brackets in get_gold_tuples(): - for annots, _ in annots_brackets: - for word in annots[1]: - _ = self.vocab[word] + if get_gold_tuples is not None: + for _, annots_brackets in get_gold_tuples(): + for annots, _ in annots_brackets: + for word in annots[1]: + _ = self.vocab[word] contexts = [] if cfg.get('device', -1) >= 0: - import cupy.cuda.device - device = cupy.cuda.device.Device(cfg['device']) - device.use() - Model.ops = CupyOps() - Model.Ops = CupyOps + device = util.use_gpu(cfg['device']) + if self.vocab.vectors.data.shape[1] >= 1: + self.vocab.vectors.data = Model.ops.asarray( + self.vocab.vectors.data) else: device = None - for proc in self.pipeline: + link_vectors_to_models(self.vocab) + for name, proc in self.pipeline: if hasattr(proc, 'begin_training'): context = proc.begin_training(get_gold_tuples(), pipeline=self.pipeline) @@ -385,26 +462,25 @@ class Language(object): L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) 
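For reference, a bare-bones training loop against the `begin_training`/`update` API defined here could look like the sketch below. `gold_tuples` (spaCy's nested gold-tuple format) and `train_data` (a list of `(Doc, GoldParse)` pairs) are assumed to exist already and are not part of this diff:

    >>> import random
    >>> optimizer = nlp.begin_training(lambda: gold_tuples)
    >>> for epoch in range(10):
    ...     random.shuffle(train_data)
    ...     for doc, gold in train_data:
    ...         nlp.update([doc], [gold], drop=0.2, sgd=optimizer)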
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, - beta2=beta2, eps=eps) + beta2=beta2, eps=eps) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device return self._optimizer - def evaluate(self, docs_golds): + def evaluate(self, docs_golds, verbose=False): scorer = Scorer() docs, golds = zip(*docs_golds) docs = list(docs) golds = list(golds) - for pipe in self.pipeline: + for name, pipe in self.pipeline: if not hasattr(pipe, 'pipe'): - for doc in docs: - pipe(doc) + docs = (pipe(doc) for doc in docs) else: - docs = list(pipe.pipe(docs)) - assert len(docs) == len(golds) + docs = pipe.pipe(docs, batch_size=256) for doc, gold in zip(docs, golds): - scorer.score(doc, gold) - doc.tensor = None + if verbose: + print(doc) + scorer.score(doc, gold, verbose=verbose) return scorer @contextmanager @@ -420,7 +496,7 @@ class Language(object): >>> with nlp.use_params(optimizer.averages): >>> nlp.to_disk('/tmp/checkpoint') """ - contexts = [pipe.use_params(params) for pipe + contexts = [pipe.use_params(params) for name, pipe in self.pipeline if hasattr(pipe, 'use_params')] # TODO: Having trouble with contextlib # Workaround: these aren't actually context managers atm. @@ -437,17 +513,17 @@ class Language(object): pass def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000, - disable=[]): - """Process texts as a stream, and yield `Doc` objects in order. Supports - GIL-free multi-threading. + disable=[]): + """Process texts as a stream, and yield `Doc` objects in order. + Supports GIL-free multi-threading. texts (iterator): A sequence of texts to process. as_tuples (bool): If set to True, inputs should be a sequence of (text, context) tuples. Output will then be a sequence of (doc, context) tuples. Defaults to False. - n_threads (int): The number of worker threads to use. If -1, OpenMP will - decide how many to use at run time. Default is 2. + n_threads (int): The number of worker threads to use. If -1, OpenMP + will decide how many to use at run time. Default is 2. batch_size (int): The number of texts to buffer. disable (list): Names of the pipeline components to disable. YIELDS (Doc): Documents in the order of the original text. @@ -467,24 +543,49 @@ class Language(object): yield (doc, context) return docs = (self.make_doc(text) for text in texts) - for proc in self.pipeline: - name = getattr(proc, 'name', None) + for name, proc in self.pipeline: if name in disable: continue if hasattr(proc, 'pipe'): - docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) + docs = proc.pipe(docs, n_threads=n_threads, + batch_size=batch_size) else: # Apply the function, but yield the doc docs = _pipe(proc, docs) + # Track weakrefs of "recent" documents, so that we can see when they + # expire from memory. When they do, we know we don't need old strings. + # This way, we avoid maintaining an unbounded growth in string entries + # in the string store. + recent_refs = weakref.WeakSet() + old_refs = weakref.WeakSet() + original_strings_data = self.vocab.strings.to_bytes() + StringStore = self.vocab.strings.__class__ + recent_strings = StringStore().from_bytes(original_strings_data) + nr_seen = 0 for doc in docs: yield doc + for word in doc: + recent_strings.add(word.text) + recent_refs.add(doc) + if nr_seen < 10000: + old_refs.add(doc) + nr_seen += 1 + elif len(old_refs) == 0: + # All the docs in the 'old' set have expired, so the only + # difference between the backup strings and the current + # string-store should be obsolete. 
We therefore swap out the + # old strings data. + old_refs, recent_refs = recent_refs, old_refs + self.vocab.strings._reset_and_load(recent_strings) + recent_strings = StringStore().from_bytes(original_strings_data) + nr_seen = 0 def to_disk(self, path, disable=tuple()): """Save the current state to a directory. If a model is loaded, this will include the model. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be strings or `Path`-like objects. disable (list): Names of pipeline components to disable and prevent from being saved. @@ -493,18 +594,18 @@ class Language(object): """ path = util.ensure_path(path) serializers = OrderedDict(( - ('vocab', lambda p: self.vocab.to_disk(p)), ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) )) - for proc in self.pipeline: + for name, proc in self.pipeline: if not hasattr(proc, 'name'): continue - if proc.name in disable: + if name in disable: continue if not hasattr(proc, 'to_disk'): continue - serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) + serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) + serializers['vocab'] = lambda p: self.vocab.to_disk(p) util.to_disk(path, serializers, {p: False for p in disable}) def from_disk(self, path, disable=tuple()): @@ -525,23 +626,22 @@ class Language(object): deserializers = OrderedDict(( ('vocab', lambda p: self.vocab.from_disk(p)), ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), - ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) + ('meta.json', lambda p: self.meta.update(ujson.load(p.open('r')))) )) - for proc in self.pipeline: - if not hasattr(proc, 'name'): - continue - if proc.name in disable: + for name, proc in self.pipeline: + if name in disable: continue if not hasattr(proc, 'to_disk'): continue - deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) + deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) exclude = {p: False for p in disable} if not (path / 'vocab').exists(): exclude['vocab'] = True util.from_disk(path, deserializers, exclude) + self._path = path return self - def to_bytes(self, disable=[]): + def to_bytes(self, disable=[], **exclude): """Serialize the current state to a binary string. disable (list): Nameds of pipeline components to disable and prevent @@ -551,15 +651,15 @@ class Language(object): serializers = OrderedDict(( ('vocab', lambda: self.vocab.to_bytes()), ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), - ('meta', lambda: ujson.dumps(self.meta)) + ('meta', lambda: json_dumps(self.meta)) )) - for i, proc in enumerate(self.pipeline): - if getattr(proc, 'name', None) in disable: + for i, (name, proc) in enumerate(self.pipeline): + if name in disable: continue if not hasattr(proc, 'to_bytes'): continue serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False) - return util.to_bytes(serializers, {}) + return util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, disable=[]): """Load state from a binary string. 
@@ -573,8 +673,8 @@ class Language(object): ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), ('meta', lambda b: self.meta.update(ujson.loads(b))) )) - for i, proc in enumerate(self.pipeline): - if getattr(proc, 'name', None) in disable: + for i, (name, proc) in enumerate(self.pipeline): + if name in disable: continue if not hasattr(proc, 'from_bytes'): continue @@ -583,6 +683,49 @@ class Language(object): return self +class DisabledPipes(list): + """Manager for temporary pipeline disabling.""" + def __init__(self, nlp, *names): + self.nlp = nlp + self.names = names + # Important! Not deep copy -- we just want the container (but we also + # want to support people providing arbitrarily typed nlp.pipeline + # objects.) + self.original_pipeline = copy(nlp.pipeline) + list.__init__(self) + self.extend(nlp.remove_pipe(name) for name in names) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.restore() + + def restore(self): + '''Restore the pipeline to its state when DisabledPipes was created.''' + current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline + unexpected = [name for name, pipe in current + if not self.nlp.has_pipe(name)] + if unexpected: + # Don't change the pipeline if we're raising an error. + self.nlp.pipeline = current + msg = ( + "Some current components would be lost when restoring " + "previous pipeline state. If you added components after " + "calling nlp.disable_pipes(), you should remove them " + "explicitly with nlp.remove_pipe() before the pipeline is " + "restore. Names of the new components: %s" + ) + raise ValueError(msg % unexpected) + self[:] = [] + + +def unpickle_language(vocab, meta, bytes_data): + lang = Language(vocab=vocab) + lang.from_bytes(bytes_data) + return lang + + def _pipe(func, docs): for doc in docs: func(doc) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 3a04a471d..40cd995e2 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -7,27 +7,29 @@ from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos class Lemmatizer(object): @classmethod - def load(cls, path, index=None, exc=None, rules=None): - return cls(index or {}, exc or {}, rules or {}) + def load(cls, path, index=None, exc=None, rules=None, lookup=None): + return cls(index or {}, exc or {}, rules or {}, lookup or {}) - def __init__(self, index, exceptions, rules): - self.index = index - self.exc = exceptions - self.rules = rules + def __init__(self, index=None, exceptions=None, rules=None, lookup=None): + self.index = index if index is not None else {} + self.exc = exceptions if exceptions is not None else {} + self.rules = rules if rules is not None else {} + self.lookup_table = lookup if lookup is not None else {} def __call__(self, string, univ_pos, morphology=None): - if univ_pos == NOUN: + if univ_pos in (NOUN, 'NOUN', 'noun'): univ_pos = 'noun' - elif univ_pos == VERB: + elif univ_pos in (VERB, 'VERB', 'verb'): univ_pos = 'verb' - elif univ_pos == ADJ: + elif univ_pos in (ADJ, 'ADJ', 'adj'): univ_pos = 'adj' - elif univ_pos == PUNCT: + elif univ_pos in (PUNCT, 'PUNCT', 'punct'): univ_pos = 'punct' + else: + return list(set([string.lower()])) # See Issue #435 for example of where this logic is requied. 
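With the reworked `Lemmatizer.__init__` and `__call__` above, a lemmatizer can be built from plain dicts, accepts string POS labels alongside the symbol constants, and returns a list. A toy example (the index/rule data is invented for illustration and is not shipped with spaCy):

    >>> from spacy.lemmatizer import Lemmatizer
    >>> lemmatizer = Lemmatizer(index={'noun': {'duck'}}, exceptions={},
    ...                         rules={'noun': [['s', '']]})
    >>> lemmatizer('ducks', 'NOUN')      # string POS labels now work
    ['duck']
    >>> lemmatizer('ducks', 'SCONJ')     # unknown POS falls back to the lowercased form
    ['ducks']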
- print("Check base form", string) if self.is_base_form(univ_pos, morphology): - return set([string.lower()]) + return list(set([string.lower()])) lemmas = lemmatize(string, self.index.get(univ_pos, {}), self.exc.get(univ_pos, {}), self.rules.get(univ_pos, [])) @@ -41,16 +43,15 @@ class Lemmatizer(object): morphology = {} if morphology is None else morphology others = [key for key in morphology if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')] - true_morph_key = morphology.get('morph', 0) if univ_pos == 'noun' and morphology.get('Number') == 'sing': return True elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf': return True # This maps 'VBP' to base form -- probably just need 'IS_BASE' # morphology - elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ - morphology.get('Tense') == 'pres' and \ - morphology.get('Number') is None and \ + elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and + morphology.get('Tense') == 'pres' and + morphology.get('Number') is None and not others): return True elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': @@ -78,26 +79,29 @@ class Lemmatizer(object): def punct(self, string, morphology=None): return self(string, 'punct', morphology) + def lookup(self, string): + if string in self.lookup_table: + return self.lookup_table[string] + return string + def lemmatize(string, index, exceptions, rules): string = string.lower() forms = [] - # TODO: Is this correct? See discussion in Issue #435. - #if string in index: - # forms.append(string) forms.extend(exceptions.get(string, [])) oov_forms = [] - for old, new in rules: - if string.endswith(old): - form = string[:len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) if not forms: forms.extend(oov_forms) if not forms: forms.append(string) - return set(forms) + return list(set(forms)) diff --git a/spacy/lemmatizerlookup.py b/spacy/lemmatizerlookup.py deleted file mode 100644 index 0c0c693c1..000000000 --- a/spacy/lemmatizerlookup.py +++ /dev/null @@ -1,19 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from .lemmatizer import Lemmatizer - - -class Lemmatizer(Lemmatizer): - @classmethod - def load(cls, path, lookup): - return cls(lookup or {}) - - def __init__(self, lookup): - self.lookup = lookup - - def __call__(self, string, univ_pos, morphology=None): - try: - return set([self.lookup[string]]) - except: - return set([string]) \ No newline at end of file diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f0f5c6398..10c934ba4 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -2,27 +2,19 @@ # coding: utf8 from __future__ import unicode_literals, print_function -from libc.math cimport sqrt -from cpython.ref cimport Py_INCREF -from cymem.cymem cimport Pool -from murmurhash.mrmr cimport hash64 - # Compiler crashes on memory view coercion without this. Should report bug. 
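The `Lexeme` properties documented throughout the rest of this file are thin, writable views over the underlying C struct. Roughly (assuming `nlp` is any loaded English pipeline):

    >>> lex = nlp.vocab[u'apples']
    >>> assert lex.is_alpha and not lex.is_digit and not lex.like_num
    >>> assert lex.lower_ == u'apples' and lex.suffix_ == u'les'
    >>> lex.is_stop = True               # boolean flags can also be set
    >>> nlp.vocab[u'apples'].is_stop
    True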
from cython.view cimport array as cvarray cimport numpy as np np.import_array() - from libc.string cimport memset import numpy from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP -from .attrs cimport IS_BRACKET -from .attrs cimport IS_QUOTE -from .attrs cimport IS_LEFT_PUNCT -from .attrs cimport IS_RIGHT_PUNCT -from .attrs cimport IS_OOV +from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV +from .attrs cimport PROB +from .attrs import intify_attrs from . import about @@ -32,8 +24,8 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) cdef class Lexeme: """An entry in the vocabulary. A `Lexeme` has no string context – it's a word-type, as opposed to a word token. It therefore has no part-of-speech - tag, dependency parse, or lemma (lemmatization depends on the part-of-speech - tag). + tag, dependency parse, or lemma (lemmatization depends on the + part-of-speech tag). """ def __init__(self, Vocab vocab, attr_t orth): """Create a Lexeme object. @@ -60,17 +52,17 @@ cdef class Lexeme: else: a = 0 b = 1 - if op == 2: # == + if op == 2: # == return a == b - elif op == 3: # != + elif op == 3: # != return a != b - elif op == 0: # < + elif op == 0: # < return a < b - elif op == 1: # <= + elif op == 1: # <= return a <= b - elif op == 4: # > + elif op == 4: # > return a > b - elif op == 5: # >= + elif op == 5: # >= return a >= b else: raise NotImplementedError(op) @@ -78,6 +70,19 @@ cdef class Lexeme: def __hash__(self): return self.c.orth + def set_attrs(self, **attrs): + cdef attr_id_t attr + attrs = intify_attrs(attrs) + for attr, value in attrs.items(): + if attr == PROB: + self.c.prob = value + elif attr == CLUSTER: + self.c.cluster = int(value) + elif isinstance(value, int) or isinstance(value, long): + Lexeme.set_struct_attr(self.c, attr, value) + else: + Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value)) + def set_flag(self, attr_id_t flag_id, bint value): """Change the value of a boolean flag. @@ -104,7 +109,8 @@ cdef class Lexeme: """ if self.vector_norm == 0 or other.vector_norm == 0: return 0.0 - return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + return (numpy.dot(self.vector, other.vector) / + (self.vector_norm * other.vector_norm)) def to_bytes(self): lex_data = Lexeme.c_to_bytes(self.c) @@ -130,19 +136,13 @@ cdef class Lexeme: self.orth = self.c.orth property has_vector: - """A boolean value indicating whether a word vector is associated with - the object. - - RETURNS (bool): Whether a word vector is associated with the object. + """RETURNS (bool): Whether a word vector is associated with the object. """ def __get__(self): return self.vocab.has_vector(self.c.orth) property vector_norm: - """The L2 norm of the lexeme's vector representation. - - RETURNS (float): The L2 norm of the vector representation. - """ + """RETURNS (float): The L2 norm of the vector representation.""" def __get__(self): vector = self.vector return numpy.sqrt((vector**2).sum()) @@ -169,149 +169,320 @@ cdef class Lexeme: self.vocab.set_vector(self.c.orth, vector) property rank: + """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used + to index into tables, e.g. 
for word vectors.""" def __get__(self): return self.c.id + def __set__(self, value): self.c.id = value property sentiment: + """RETURNS (float): A scalar value indicating the positivity or + negativity of the lexeme.""" def __get__(self): return self.c.sentiment + def __set__(self, float sentiment): self.c.sentiment = sentiment property orth_: + """RETURNS (unicode): The original verbatim text of the lexeme + (identical to `Lexeme.text`). Exists mostly for consistency with + the other attributes.""" def __get__(self): return self.vocab.strings[self.c.orth] property text: - """A unicode representation of the token text. - - RETURNS (unicode): The original verbatim text of the token. - """ + """RETURNS (unicode): The original verbatim text of the lexeme.""" def __get__(self): return self.orth_ property lower: - def __get__(self): return self.c.lower - def __set__(self, attr_t x): self.c.lower = x + """RETURNS (unicode): Lowercase form of the lexeme.""" + def __get__(self): + return self.c.lower + + def __set__(self, attr_t x): + self.c.lower = x property norm: - def __get__(self): return self.c.norm - def __set__(self, attr_t x): self.c.norm = x + """RETURNS (uint64): The lexemes's norm, i.e. a normalised form of the + lexeme text. + """ + def __get__(self): + return self.c.norm + + def __set__(self, attr_t x): + self.c.norm = x property shape: - def __get__(self): return self.c.shape - def __set__(self, attr_t x): self.c.shape = x + """RETURNS (uint64): Transform of the word's string, to show + orthographic features. + """ + def __get__(self): + return self.c.shape + + def __set__(self, attr_t x): + self.c.shape = x property prefix: - def __get__(self): return self.c.prefix - def __set__(self, attr_t x): self.c.prefix = x + """RETURNS (uint64): Length-N substring from the start of the word. + Defaults to `N=1`. + """ + def __get__(self): + return self.c.prefix + + def __set__(self, attr_t x): + self.c.prefix = x property suffix: - def __get__(self): return self.c.suffix - def __set__(self, attr_t x): self.c.suffix = x + """RETURNS (uint64): Length-N substring from the end of the word. + Defaults to `N=3`. 
+ """ + def __get__(self): + return self.c.suffix + + def __set__(self, attr_t x): + self.c.suffix = x property cluster: - def __get__(self): return self.c.cluster - def __set__(self, attr_t x): self.c.cluster = x + """RETURNS (int): Brown cluster ID.""" + def __get__(self): + return self.c.cluster + + def __set__(self, attr_t x): + self.c.cluster = x property lang: - def __get__(self): return self.c.lang - def __set__(self, attr_t x): self.c.lang = x + """RETURNS (uint64): Language of the parent vocabulary.""" + def __get__(self): + return self.c.lang + + def __set__(self, attr_t x): + self.c.lang = x property prob: - def __get__(self): return self.c.prob - def __set__(self, float x): self.c.prob = x + """RETURNS (float): Smoothed log probability estimate of the lexeme's + type.""" + def __get__(self): + return self.c.prob + + def __set__(self, float x): + self.c.prob = x property lower_: - def __get__(self): return self.vocab.strings[self.c.lower] - def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x) + """RETURNS (unicode): Lowercase form of the word.""" + def __get__(self): + return self.vocab.strings[self.c.lower] + + def __set__(self, unicode x): + self.c.lower = self.vocab.strings.add(x) property norm_: - def __get__(self): return self.vocab.strings[self.c.norm] - def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x) + """RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the + lexeme text. + """ + def __get__(self): + return self.vocab.strings[self.c.norm] + + def __set__(self, unicode x): + self.c.norm = self.vocab.strings.add(x) property shape_: - def __get__(self): return self.vocab.strings[self.c.shape] - def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x) + """RETURNS (unicode): Transform of the word's string, to show + orthographic features. + """ + def __get__(self): + return self.vocab.strings[self.c.shape] + + def __set__(self, unicode x): + self.c.shape = self.vocab.strings.add(x) property prefix_: - def __get__(self): return self.vocab.strings[self.c.prefix] - def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x) + """RETURNS (unicode): Length-N substring from the start of the word. + Defaults to `N=1`. + """ + def __get__(self): + return self.vocab.strings[self.c.prefix] + + def __set__(self, unicode x): + self.c.prefix = self.vocab.strings.add(x) property suffix_: - def __get__(self): return self.vocab.strings[self.c.suffix] - def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x) + """RETURNS (unicode): Length-N substring from the end of the word. + Defaults to `N=3`. 
+ """ + def __get__(self): + return self.vocab.strings[self.c.suffix] + + def __set__(self, unicode x): + self.c.suffix = self.vocab.strings.add(x) property lang_: - def __get__(self): return self.vocab.strings[self.c.lang] - def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x) + """RETURNS (unicode): Language of the parent vocabulary.""" + def __get__(self): + return self.vocab.strings[self.c.lang] + + def __set__(self, unicode x): + self.c.lang = self.vocab.strings.add(x) property flags: - def __get__(self): return self.c.flags - def __set__(self, flags_t x): self.c.flags = x + """RETURNS (uint64): Container of the lexeme's binary flags.""" + def __get__(self): + return self.c.flags + + def __set__(self, flags_t x): + self.c.flags = x property is_oov: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV) - def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x) + """RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_OOV) + + def __set__(self, attr_t x): + Lexeme.c_set_flag(self.c, IS_OOV, x) property is_stop: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x) + """RETURNS (bool): Whether the lexeme is a stop word.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_STOP) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_STOP, x) property is_alpha: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x) + """RETURNS (bool): Whether the lexeme consists of alphanumeric + characters. Equivalent to `lexeme.text.isalpha()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_ALPHA) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_ALPHA, x) property is_ascii: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x) + """RETURNS (bool): Whether the lexeme consists of ASCII characters. + Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_ASCII) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_ASCII, x) property is_digit: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x) + """RETURNS (bool): Whether the lexeme consists of digits. Equivalent + to `lexeme.text.isdigit()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_DIGIT) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_DIGIT, x) property is_lower: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x) + """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to + `lexeme.text.islower()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_LOWER) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_LOWER, x) + + property is_upper: + """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to + `lexeme.text.isupper()`. 
+ """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_UPPER) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_UPPER, x) property is_title: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x) + """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to + `lexeme.text.istitle()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_TITLE) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_TITLE, x) property is_punct: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x) + """RETURNS (bool): Whether the lexeme is punctuation.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_PUNCT) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_PUNCT, x) property is_space: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x) + """RETURNS (bool): Whether the lexeme consist of whitespace characters. + Equivalent to `lexeme.text.isspace()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_SPACE) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_SPACE, x) property is_bracket: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x) + """RETURNS (bool): Whether the lexeme is a bracket.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_BRACKET) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_BRACKET, x) property is_quote: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x) + """RETURNS (bool): Whether the lexeme is a quotation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_QUOTE) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_QUOTE, x) property is_left_punct: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) + """RETURNS (bool): Whether the lexeme is left punctuation, e.g. ).""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) property is_right_punct: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) + """RETURNS (bool): Whether the lexeme is right punctuation, e.g. ).""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) property like_url: - def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x) + """RETURNS (bool): Whether the lexeme resembles a URL.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, LIKE_URL) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x) + """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9", + "10", "ten", etc. 
+ """ + def __get__(self): + return Lexeme.c_check_flag(self.c, LIKE_NUM) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_NUM, x) property like_email: - def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) + """RETURNS (bool): Whether the lexeme resembles an email address.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, LIKE_EMAIL) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index c75d23957..a6b02ba2c 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -4,12 +4,6 @@ from __future__ import unicode_literals import ujson - -from .typedefs cimport attr_t -from .typedefs cimport hash_t -from .attrs cimport attr_id_t -from .structs cimport TokenC - from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from libcpp.vector cimport vector @@ -17,14 +11,15 @@ from libcpp.pair cimport pair from murmurhash.mrmr cimport hash64 from libc.stdint cimport int32_t -from .attrs cimport ID, ENT_TYPE -from . import attrs -from .tokens.doc cimport get_token_attr -from .tokens.doc cimport Doc +from .typedefs cimport attr_t +from .typedefs cimport hash_t +from .structs cimport TokenC +from .tokens.doc cimport Doc, get_token_attr from .vocab cimport Vocab +from .attrs import IDS +from .attrs cimport attr_id_t, ID, NULL_ATTR from .attrs import FLAG61 as U_ENT - from .attrs import FLAG60 as B2_ENT from .attrs import FLAG59 as B3_ENT from .attrs import FLAG58 as B4_ENT @@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT from .attrs import FLAG54 as B8_ENT from .attrs import FLAG53 as B9_ENT from .attrs import FLAG52 as B10_ENT - from .attrs import FLAG51 as I3_ENT from .attrs import FLAG50 as I4_ENT from .attrs import FLAG49 as I5_ENT @@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT from .attrs import FLAG46 as I8_ENT from .attrs import FLAG45 as I9_ENT from .attrs import FLAG44 as I10_ENT - from .attrs import FLAG43 as L2_ENT from .attrs import FLAG42 as L3_ENT from .attrs import FLAG41 as L4_ENT @@ -69,8 +62,14 @@ cdef enum action_t: REPEAT ACCEPT ADVANCE_ZERO + ACCEPT_PREV PANIC +# A "match expression" conists of one or more token patterns +# Each token pattern consists of a quantifier and 0+ (attr, value) pairs. +# A state is an (int, pattern pointer) pair, where the int is the start +# position, and the pattern pointer shows where we're up to +# in the pattern. 
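Before the struct and action definitions, a quick sketch of what a match expression looks like from the Python API; the quantifier keys and semantics are documented in `Matcher.add` further below (`nlp` is assumed to be any loaded pipeline, and the pattern contents are illustrative):

    >>> from spacy.matcher import Matcher
    >>> matcher = Matcher(nlp.vocab)
    >>> pattern = [{'LOWER': 'hello'},
    ...            {'IS_PUNCT': True, 'OP': '?'},   # optional token (ZERO_ONE)
    ...            {'LOWER': 'world'}]
    >>> matcher.add('HelloWorld', None, pattern)
    >>> matches = matcher(nlp(u'Hello, world!'))    # list of (match_id, start, end) tuples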
cdef struct AttrValueC: attr_id_t attr @@ -115,22 +114,31 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0: cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: + lookahead = &pattern[1] for attr in pattern.attrs[:pattern.nr_attr]: if get_token_attr(token, attr.attr) != attr.value: if pattern.quantifier == ONE: return REJECT elif pattern.quantifier == ZERO: - return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE + return ACCEPT if lookahead.nr_attr == 0 else ADVANCE elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS): - return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE_ZERO + return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO else: return PANIC if pattern.quantifier == ZERO: return REJECT + elif lookahead.nr_attr == 0: + return ACCEPT elif pattern.quantifier in (ONE, ZERO_ONE): - return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE + return ADVANCE elif pattern.quantifier == ZERO_PLUS: - return REPEAT + # This is a bandaid over the 'shadowing' problem described here: + # https://github.com/explosion/spaCy/issues/864 + next_action = get_action(lookahead, token) + if next_action is REJECT: + return REPEAT + else: + return ADVANCE_ZERO else: return PANIC @@ -138,10 +146,14 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: def _convert_strings(token_specs, string_store): # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), - '?': (ZERO_ONE,), '1': (ONE,)} + '?': (ZERO_ONE,), '1': (ONE,)} tokens = [] op = ONE for spec in token_specs: + if not spec: + # Signifier for 'any token' + tokens.append((ONE, [(NULL_ATTR, 0)])) + continue token = [] ops = (ONE,) for attr, value in spec.items(): @@ -149,10 +161,10 @@ def _convert_strings(token_specs, string_store): if value in operators: ops = operators[value] else: - raise KeyError( - "Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys()))) + msg = "Unknown operator '%s'. Options: %s" + raise KeyError(msg % (value, ', '.join(operators.keys()))) if isinstance(attr, basestring): - attr = attrs.IDS.get(attr.upper()) + attr = IDS.get(attr.upper()) if isinstance(value, basestring): value = string_store.add(value) if isinstance(value, bool): @@ -167,7 +179,7 @@ def _convert_strings(token_specs, string_store): def merge_phrase(matcher, doc, i, matches): """Callback to merge a phrase on match.""" ent_id, label, start, end = matches[i] - span = doc[start : end] + span = doc[start:end] span.merge(ent_type=label, ent_id=ent_id) @@ -179,7 +191,6 @@ cdef class Matcher: cdef public object _patterns cdef public object _entities cdef public object _callbacks - cdef public object _acceptors def __init__(self, vocab): """Create the Matcher. @@ -190,7 +201,6 @@ cdef class Matcher: """ self._patterns = {} self._entities = {} - self._acceptors = {} self._callbacks = {} self.vocab = vocab self.mem = Pool() @@ -213,19 +223,35 @@ cdef class Matcher: key (unicode): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ - return len(self._patterns) + return self._normalize_key(key) in self._patterns def add(self, key, on_match, *patterns): - """Add a match-rule to the matcher. - A match-rule consists of: an ID key, an on_match callback, and one or - more patterns. If the key exists, the patterns are appended to the - previous ones, and the previous on_match callback is replaced. 
The
-        `on_match` callback will receive the arguments `(matcher, doc, i,
-        matches)`. You can also set `on_match` to `None` to not perform any
-        actions. A pattern consists of one or more `token_specs`, where a
-        `token_spec` is a dictionary mapping attribute IDs to values. Token
-        descriptors can also include quantifiers. There are currently important
-        known problems with the quantifiers – see the docs.
+        """Add a match-rule to the matcher. A match-rule consists of: an ID
+        key, an on_match callback, and one or more patterns.
+
+        If the key exists, the patterns are appended to the previous ones, and
+        the previous on_match callback is replaced. The `on_match` callback
+        will receive the arguments `(matcher, doc, i, matches)`. You can also
+        set `on_match` to `None` to not perform any actions.
+
+        A pattern consists of one or more `token_specs`, where a `token_spec`
+        is a dictionary mapping attribute IDs to values, and optionally a
+        quantifier operator under the key "op". The available quantifiers are:
+
+        '!': Negate the pattern, by requiring it to match exactly 0 times.
+        '?': Make the pattern optional, by allowing it to match 0 or 1 times.
+        '+': Require the pattern to match 1 or more times.
+        '*': Allow the pattern to match zero or more times.
+
+        The + and * operators are usually interpreted "greedily", i.e. longer
+        matches are returned where possible. However, if you specify two '+'
+        or '*' patterns in a row and their matches overlap, the first
+        operator will behave non-greedily. This quirk in the semantics makes
+        the matcher more efficient, by avoiding the need for back-tracking.
+
+        key (unicode): The match ID.
+        on_match (callable): Callback executed on match.
+        *patterns (list): List of token descriptions.
         """
         for pattern in patterns:
             if len(pattern) == 0:
@@ -235,7 +261,6 @@ cdef class Matcher:
         key = self._normalize_key(key)
         self._patterns.setdefault(key, [])
         self._callbacks[key] = on_match
-
         for pattern in patterns:
             specs = _convert_strings(pattern, self.vocab.strings)
             self.patterns.push_back(init_pattern(self.mem, key, specs))
@@ -282,9 +307,9 @@ cdef class Matcher:
         """Match a stream of documents, yielding them in turn.

         docs (iterable): A stream of documents.
-        batch_size (int): The number of documents to accumulate into a working set.
+        batch_size (int): Number of documents to accumulate into a working set.
         n_threads (int): The number of threads with which to work on the buffer
-            in parallel, if the `Matcher` implementation supports multi-threading.
+            in parallel, if the implementation supports multi-threading.
         YIELDS (Doc): Documents, in order.
         """
         for doc in docs:
@@ -292,10 +317,10 @@ cdef class Matcher:
             yield doc

     def __call__(self, Doc doc):
-        """Find all token sequences matching the supplied patterns on the `Doc`.
+        """Find all token sequences matching the supplied pattern.

         doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, label_id, start, end)` tuples,
+        RETURNS (list): A list of `(key, start, end)` tuples,
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `label_id` and `key` are both integers.
         """
@@ -309,8 +334,8 @@ cdef class Matcher:
         for token_i in range(doc.length):
             token = &doc.c[token_i]
             q = 0
-            # Go over the open matches, extending or finalizing if able. Otherwise,
-            # we over-write them (q doesn't advance)
+            # Go over the open matches, extending or finalizing if able.
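To make the `add()` docstring above concrete, here is a minimal, hypothetical usage sketch (not part of the patch; it assumes a bare `Vocab` and uses only the standard `ORTH` attribute):

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.matcher import Matcher

vocab = Vocab()
matcher = Matcher(vocab)

def on_match(matcher, doc, i, matches):
    # The callback receives (matcher, doc, i, matches); matches[i] is (key, start, end).
    key, start, end = matches[i]
    print('matched:', doc[start:end].text)

# "hello", an optional comma, then "world"
pattern = [{'ORTH': 'hello'}, {'ORTH': ',', 'OP': '?'}, {'ORTH': 'world'}]
matcher.add('HelloWorld', on_match, pattern)

doc = Doc(vocab, words=['hello', ',', 'world'])
matches = matcher(doc)   # list of (key, start, end) tuples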
+ # Otherwise, we over-write them (q doesn't advance) for state in partials: action = get_action(state.second, token) if action == PANIC: @@ -318,10 +343,13 @@ cdef class Matcher: while action == ADVANCE_ZERO: state.second += 1 action = get_action(state.second, token) + if action == PANIC: + raise Exception("Error selecting action in matcher") + if action == REPEAT: # Leave the state in the queue, and advance to next slot - # (i.e. we don't overwrite -- we want to greedily match more - # pattern. + # (i.e. we don't overwrite -- we want to greedily match + # more pattern. q += 1 elif action == REJECT: pass @@ -329,14 +357,15 @@ cdef class Matcher: partials[q] = state partials[q].second += 1 q += 1 - elif action == ACCEPT: - # TODO: What to do about patterns starting with ZERO? Need to - # adjust the start position. + elif action in (ACCEPT, ACCEPT_PREV): + # TODO: What to do about patterns starting with ZERO? Need + # to adjust the start position. start = state.first - end = token_i+1 + end = token_i+1 if action == ACCEPT else token_i ent_id = state.second[1].attrs[0].value label = state.second[1].attrs[1].value matches.append((ent_id, start, end)) + partials.resize(q) # Check whether we open any new patterns on this token for pattern in self.patterns: @@ -351,20 +380,20 @@ cdef class Matcher: state.second = pattern partials.push_back(state) elif action == ADVANCE: - # TODO: What to do about patterns starting with ZERO? Need to - # adjust the start position. + # TODO: What to do about patterns starting with ZERO? Need + # to adjust the start position. state.first = token_i state.second = pattern + 1 partials.push_back(state) - elif action == ACCEPT: + elif action in (ACCEPT, ACCEPT_PREV): start = token_i - end = token_i+1 + end = token_i+1 if action == ACCEPT else token_i ent_id = pattern[1].attrs[0].value label = pattern[1].attrs[1].value matches.append((ent_id, start, end)) # Look for open patterns that are actually satisfied for state in partials: - while state.second.quantifier in (ZERO, ZERO_PLUS): + while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS): state.second += 1 if state.second.nr_attr == 0: start = state.first @@ -376,7 +405,6 @@ cdef class Matcher: on_match = self._callbacks.get(ent_id) if on_match is not None: on_match(self, doc, i, matches) - # TODO: only return (match_id, start, end) return matches def _normalize_key(self, key): @@ -404,7 +432,8 @@ def get_bilou(length): elif length == 8: return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] elif length == 9: - return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] + return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, + L9_ENT] elif length == 10: return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, L10_ENT] @@ -417,67 +446,123 @@ cdef class PhraseMatcher: cdef Vocab vocab cdef Matcher matcher cdef PreshMap phrase_ids - cdef int max_length cdef attr_t* _phrase_key + cdef public object _callbacks + cdef public object _patterns - def __init__(self, Vocab vocab, phrases, max_length=10): + def __init__(self, Vocab vocab, max_length=10): self.mem = Pool() self._phrase_key = self.mem.alloc(max_length, sizeof(attr_t)) self.max_length = max_length self.vocab = vocab - self.matcher = Matcher(self.vocab, {}) + self.matcher = Matcher(self.vocab) self.phrase_ids = PreshMap() - for phrase in phrases: - if len(phrase) < max_length: - self.add(phrase) - abstract_patterns = [] for length in range(1, max_length): - 
abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) - self.matcher.add('Candidate', 'MWE', {}, abstract_patterns, acceptor=self.accept_match) + abstract_patterns.append([{tag: True} + for tag in get_bilou(length)]) + self.matcher.add('Candidate', None, *abstract_patterns) + self._callbacks = {} - def add(self, Doc tokens): - cdef int length = tokens.length - assert length < self.max_length - tags = get_bilou(length) - assert len(tags) == length, length + def __len__(self): + """Get the number of rules added to the matcher. Note that this only + returns the number of rules (identical with the number of IDs), not the + number of individual patterns. + RETURNS (int): The number of rules. + """ + return len(self.phrase_ids) + + def __contains__(self, key): + """Check whether the matcher contains rules for a match ID. + + key (unicode): The match ID. + RETURNS (bool): Whether the matcher contains rules for this match ID. + """ + cdef hash_t ent_id = self.matcher._normalize_key(key) + return ent_id in self._callbacks + + def __reduce__(self): + return (self.__class__, (self.vocab,), None, None) + + def add(self, key, on_match, *docs): + """Add a match-rule to the matcher. A match-rule consists of: an ID + key, an on_match callback, and one or more patterns. + + key (unicode): The match ID. + on_match (callable): Callback executed on match. + *docs (Doc): `Doc` objects representing match patterns. + """ + cdef Doc doc + for doc in docs: + if len(doc) >= self.max_length: + msg = ( + "Pattern length (%d) >= phrase_matcher.max_length (%d). " + "Length can be set on initialization, up to 10." + ) + raise ValueError(msg % (len(doc), self.max_length)) + cdef hash_t ent_id = self.matcher._normalize_key(key) + self._callbacks[ent_id] = on_match + cdef int length cdef int i - for i in range(self.max_length): - self._phrase_key[i] = 0 - for i, tag in enumerate(tags): - lexeme = self.vocab[tokens.c[i].lex.orth] - lexeme.set_flag(tag, True) - self._phrase_key[i] = lexeme.orth - cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) - self.phrase_ids[key] = True + cdef hash_t phrase_hash + for doc in docs: + length = doc.length + tags = get_bilou(length) + for i in range(self.max_length): + self._phrase_key[i] = 0 + for i, tag in enumerate(tags): + lexeme = self.vocab[doc.c[i].lex.orth] + lexeme.set_flag(tag, True) + self._phrase_key[i] = lexeme.orth + phrase_hash = hash64(self._phrase_key, + self.max_length * sizeof(attr_t), 0) + self.phrase_ids.set(phrase_hash, ent_id) def __call__(self, Doc doc): + """Find all sequences matching the supplied patterns on the `Doc`. + + doc (Doc): The document to match over. + RETURNS (list): A list of `(key, start, end)` tuples, + describing the matches. A match tuple describes a span + `doc[start:end]`. The `label_id` and `key` are both integers. 
+ """ matches = [] - for ent_id, label, start, end in self.matcher(doc): - cand = doc[start : end] - start = cand[0].idx - end = cand[-1].idx + len(cand[-1]) - matches.append((start, end, cand.root.tag_, cand.text, 'MWE')) - for match in matches: - doc.merge(*match) + for _, start, end in self.matcher(doc): + ent_id = self.accept_match(doc, start, end) + if ent_id is not None: + matches.append((ent_id, start, end)) + for i, (ent_id, start, end) in enumerate(matches): + on_match = self._callbacks.get(ent_id) + if on_match is not None: + on_match(self, doc, i, matches) return matches def pipe(self, stream, batch_size=1000, n_threads=2): + """Match a stream of documents, yielding them in turn. + + docs (iterable): A stream of documents. + batch_size (int): Number of documents to accumulate into a working set. + n_threads (int): The number of threads with which to work on the buffer + in parallel, if the implementation supports multi-threading. + YIELDS (Doc): Documents, in order. + """ for doc in stream: self(doc) yield doc - def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end): + def accept_match(self, Doc doc, int start, int end): assert (end - start) < self.max_length cdef int i, j for i in range(self.max_length): self._phrase_key[i] = 0 for i, j in enumerate(range(start, end)): self._phrase_key[i] = doc.c[j].lex.orth - cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) - if self.phrase_ids.get(key): - return (ent_id, label, start, end) + cdef hash_t key = hash64(self._phrase_key, + self.max_length * sizeof(attr_t), 0) + ent_id = self.phrase_ids.get(key) + if ent_id == 0: + return None else: - return False + return ent_id diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 922843d6d..9192f351f 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -35,6 +35,8 @@ cdef class Morphology: cdef RichTagC* rich_tags cdef PreshMapArray _cache + cdef int assign_untagged(self, TokenC* token) except -1 + cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 @@ -42,7 +44,7 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 -cpdef enum univ_morph_t: +cdef enum univ_morph_t: NIL = 0 Animacy_anim = symbols.Animacy_anim Animacy_inam diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 13a0ed8e3..b3989839d 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -4,17 +4,15 @@ from __future__ import unicode_literals from libc.string cimport memset -from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT from .attrs cimport POS, IS_SPACE +from .attrs import LEMMA, intify_attrs +from .parts_of_speech cimport SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme -from .attrs import LEMMA, intify_attrs def _normalize_props(props): - """ - Transform deprecated string keys to correct names. - """ + """Transform deprecated string keys to correct names.""" out = {} for key, value in props.items(): if key == POS: @@ -36,14 +34,22 @@ cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): self.mem = Pool() self.strings = string_store + # Add special space symbol. We prefix with underscore, to make sure it + # always sorts to the end. 
+ space_attrs = tag_map.get('SP', {POS: SPACE}) + if '_SP' not in tag_map: + self.strings.add('_SP') + tag_map = dict(tag_map) + tag_map['_SP'] = space_attrs + self.tag_names = tuple(sorted(tag_map.keys())) self.tag_map = {} self.lemmatizer = lemmatizer self.n_tags = len(tag_map) - self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} - self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) + self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): + self.strings.add(tag_str) self.tag_map[tag_str] = dict(attrs) attrs = _normalize_props(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) @@ -52,6 +58,10 @@ cdef class Morphology: self.rich_tags[i].morph = 0 self.rich_tags[i].pos = attrs[POS] self.reverse_index[self.rich_tags[i].name] = i + # Add a 'null' tag, which we can reference when assign morphology to + # untagged tokens. + self.rich_tags[self.n_tags].id = self.n_tags + self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: @@ -62,6 +72,16 @@ cdef class Morphology: return (Morphology, (self.strings, self.tag_map, self.lemmatizer, self.exc), None, None) + cdef int assign_untagged(self, TokenC* token) except -1: + """Set morphological attributes on a token without a POS tag. Uses + the lemmatizer's lookup() method, which looks up the string in the + table provided by the language data as lemma_lookup (if available). + """ + if token.lemma == 0: + orth_str = self.strings[token.lex.orth] + lemma = self.lemmatizer.lookup(orth_str) + token.lemma = self.strings.add(lemma) + cdef int assign_tag(self, TokenC* token, tag) except -1: if isinstance(tag, basestring): tag = self.strings.add(tag) @@ -72,15 +92,14 @@ cdef class Morphology: token.tag = tag cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: - if tag_id >= self.n_tags: + if tag_id > self.n_tags: raise ValueError("Unknown tag ID: %s" % tag_id) - # TODO: It's pretty arbitrary to put this logic here. I guess the justification - # is that this is where the specific word and the tag interact. Still, - # we should have a better way to enforce this rule, or figure out why - # the statistical model fails. - # Related to Issue #220 + # TODO: It's pretty arbitrary to put this logic here. I guess the + # justification is that this is where the specific word and the tag + # interact. Still, we should have a better way to enforce this rule, or + # figure out why the statistical model fails. Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): - tag_id = self.reverse_index[self.strings.add('SP')] + tag_id = self.reverse_index[self.strings.add('_SP')] rich_tag = self.rich_tags[tag_id] analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: @@ -102,14 +121,13 @@ cdef class Morphology: else: flags[0] &= ~(one << flag_id) - def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False): - """ - Add a special-case rule to the morphological analyser. Tokens whose + def add_special_case(self, unicode tag_str, unicode orth_str, attrs, + force=False): + """Add a special-case rule to the morphological analyser. Tokens whose tag and orth match the rule will receive the specified properties. - Arguments: - tag (unicode): The part-of-speech tag to key the exception. - orth (unicode): The word-form to key the exception. + tag (unicode): The part-of-speech tag to key the exception. + orth (unicode): The word-form to key the exception. 
""" self.exc[(tag_str, orth_str)] = dict(attrs) tag = self.strings.add(tag_str) @@ -123,10 +141,9 @@ cdef class Morphology: elif force: memset(cached, 0, sizeof(cached[0])) else: - msg = ("Conflicting morphology exception for (%s, %s). Use force=True " - "to overwrite.") - msg = msg % (tag_str, orth_str) - raise ValueError(msg) + raise ValueError( + "Conflicting morphology exception for (%s, %s). Use " + "force=True to overwrite." % (tag_str, orth_str)) cached.tag = rich_tag # TODO: Refactor this to take arbitrary attributes. @@ -146,12 +163,12 @@ cdef class Morphology: self.add_special_case(tag_str, form_str, attrs) def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): + if orth not in self.strings: + return orth cdef unicode py_string = self.strings[orth] if self.lemmatizer is None: return self.strings.add(py_string.lower()) - if univ_pos not in (NOUN, VERB, ADJ, PUNCT): - return self.strings.add(py_string.lower()) - cdef set lemma_strings + cdef list lemma_strings cdef unicode lemma_string lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) lemma_string = sorted(lemma_strings)[0] @@ -197,7 +214,7 @@ IDS = { "Definite_two": Definite_two, "Definite_def": Definite_def, "Definite_red": Definite_red, - "Definite_cons": Definite_cons, # U20 + "Definite_cons": Definite_cons, # U20 "Definite_ind": Definite_ind, "Degree_cmp": Degree_cmp, "Degree_comp": Degree_comp, @@ -206,7 +223,7 @@ IDS = { "Degree_sup": Degree_sup, "Degree_abs": Degree_abs, "Degree_com": Degree_com, - "Degree_dim ": Degree_dim, # du + "Degree_dim ": Degree_dim, # du "Gender_com": Gender_com, "Gender_fem": Gender_fem, "Gender_masc": Gender_masc, @@ -221,15 +238,15 @@ IDS = { "Negative_neg": Negative_neg, "Negative_pos": Negative_pos, "Negative_yes": Negative_yes, - "Polarity_neg": Polarity_neg, # U20 - "Polarity_pos": Polarity_pos, # U20 + "Polarity_neg": Polarity_neg, # U20 + "Polarity_pos": Polarity_pos, # U20 "Number_com": Number_com, "Number_dual": Number_dual, "Number_none": Number_none, "Number_plur": Number_plur, "Number_sing": Number_sing, - "Number_ptan ": Number_ptan, # bg - "Number_count ": Number_count, # bg + "Number_ptan ": Number_ptan, # bg + "Number_count ": Number_count, # bg "NumType_card": NumType_card, "NumType_dist": NumType_dist, "NumType_frac": NumType_frac, @@ -255,7 +272,7 @@ IDS = { "PronType_rel": PronType_rel, "PronType_tot": PronType_tot, "PronType_clit": PronType_clit, - "PronType_exc ": PronType_exc, # es, ca, it, fa, + "PronType_exc ": PronType_exc, # es, ca, it, fa, "Reflex_yes": Reflex_yes, "Tense_fut": Tense_fut, "Tense_imp": Tense_imp, @@ -271,19 +288,19 @@ IDS = { "VerbForm_partPres": VerbForm_partPres, "VerbForm_sup": VerbForm_sup, "VerbForm_trans": VerbForm_trans, - "VerbForm_conv": VerbForm_conv, # U20 - "VerbForm_gdv ": VerbForm_gdv, # la, + "VerbForm_conv": VerbForm_conv, # U20 + "VerbForm_gdv ": VerbForm_gdv, # la, "Voice_act": Voice_act, "Voice_cau": Voice_cau, "Voice_pass": Voice_pass, - "Voice_mid ": Voice_mid, # gkc, - "Voice_int ": Voice_int, # hb, - "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, - "AdpType_prep ": AdpType_prep, # cz, U, - "AdpType_post ": AdpType_post, # U, - "AdpType_voc ": AdpType_voc, # cz, - "AdpType_comprep ": AdpType_comprep, # cz, - "AdpType_circ ": AdpType_circ, # U, + "Voice_mid ": Voice_mid, # gkc, + "Voice_int ": Voice_int, # hb, + "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, + "AdpType_prep ": AdpType_prep, # cz, U, + "AdpType_post ": AdpType_post, # U, + "AdpType_voc ": AdpType_voc, # cz, + "AdpType_comprep ": AdpType_comprep, # 
cz, + "AdpType_circ ": AdpType_circ, # U, "AdvType_man": AdvType_man, "AdvType_loc": AdvType_loc, "AdvType_tim": AdvType_tim, @@ -293,123 +310,127 @@ IDS = { "AdvType_sta": AdvType_sta, "AdvType_ex": AdvType_ex, "AdvType_adadj": AdvType_adadj, - "ConjType_oper ": ConjType_oper, # cz, U, - "ConjType_comp ": ConjType_comp, # cz, U, - "Connegative_yes ": Connegative_yes, # fi, - "Derivation_minen ": Derivation_minen, # fi, - "Derivation_sti ": Derivation_sti, # fi, - "Derivation_inen ": Derivation_inen, # fi, - "Derivation_lainen ": Derivation_lainen, # fi, - "Derivation_ja ": Derivation_ja, # fi, - "Derivation_ton ": Derivation_ton, # fi, - "Derivation_vs ": Derivation_vs, # fi, - "Derivation_ttain ": Derivation_ttain, # fi, - "Derivation_ttaa ": Derivation_ttaa, # fi, - "Echo_rdp ": Echo_rdp, # U, - "Echo_ech ": Echo_ech, # U, - "Foreign_foreign ": Foreign_foreign, # cz, fi, U, - "Foreign_fscript ": Foreign_fscript, # cz, fi, U, - "Foreign_tscript ": Foreign_tscript, # cz, U, - "Foreign_yes ": Foreign_yes, # sl, - "Gender_dat_masc ": Gender_dat_masc, # bq, U, - "Gender_dat_fem ": Gender_dat_fem, # bq, U, - "Gender_erg_masc ": Gender_erg_masc, # bq, - "Gender_erg_fem ": Gender_erg_fem, # bq, - "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, - "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, - "Gender_psor_neut ": Gender_psor_neut, # sl, - "Hyph_yes ": Hyph_yes, # cz, U, - "InfForm_one ": InfForm_one, # fi, - "InfForm_two ": InfForm_two, # fi, - "InfForm_three ": InfForm_three, # fi, - "NameType_geo ": NameType_geo, # U, cz, - "NameType_prs ": NameType_prs, # U, cz, - "NameType_giv ": NameType_giv, # U, cz, - "NameType_sur ": NameType_sur, # U, cz, - "NameType_nat ": NameType_nat, # U, cz, - "NameType_com ": NameType_com, # U, cz, - "NameType_pro ": NameType_pro, # U, cz, - "NameType_oth ": NameType_oth, # U, cz, - "NounType_com ": NounType_com, # U, - "NounType_prop ": NounType_prop, # U, - "NounType_class ": NounType_class, # U, - "Number_abs_sing ": Number_abs_sing, # bq, U, - "Number_abs_plur ": Number_abs_plur, # bq, U, - "Number_dat_sing ": Number_dat_sing, # bq, U, - "Number_dat_plur ": Number_dat_plur, # bq, U, - "Number_erg_sing ": Number_erg_sing, # bq, U, - "Number_erg_plur ": Number_erg_plur, # bq, U, - "Number_psee_sing ": Number_psee_sing, # U, - "Number_psee_plur ": Number_psee_plur, # U, - "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, - "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, - "NumForm_digit ": NumForm_digit, # cz, sl, U, - "NumForm_roman ": NumForm_roman, # cz, sl, U, - "NumForm_word ": NumForm_word, # cz, sl, U, - "NumValue_one ": NumValue_one, # cz, U, - "NumValue_two ": NumValue_two, # cz, U, - "NumValue_three ": NumValue_three, # cz, U, - "PartForm_pres ": PartForm_pres, # fi, - "PartForm_past ": PartForm_past, # fi, - "PartForm_agt ": PartForm_agt, # fi, - "PartForm_neg ": PartForm_neg, # fi, - "PartType_mod ": PartType_mod, # U, - "PartType_emp ": PartType_emp, # U, - "PartType_res ": PartType_res, # U, - "PartType_inf ": PartType_inf, # U, - "PartType_vbp ": PartType_vbp, # U, - "Person_abs_one ": Person_abs_one, # bq, U, - "Person_abs_two ": Person_abs_two, # bq, U, - "Person_abs_three ": Person_abs_three, # bq, U, - "Person_dat_one ": Person_dat_one, # bq, U, - "Person_dat_two ": Person_dat_two, # bq, U, - "Person_dat_three ": Person_dat_three, # bq, U, - "Person_erg_one ": Person_erg_one, # bq, U, - "Person_erg_two ": Person_erg_two, # bq, U, - "Person_erg_three ": Person_erg_three, # bq, U, - "Person_psor_one ": Person_psor_one, # 
fi, U, - "Person_psor_two ": Person_psor_two, # fi, U, - "Person_psor_three ": Person_psor_three, # fi, U, - "Polite_inf ": Polite_inf, # bq, U, - "Polite_pol ": Polite_pol, # bq, U, - "Polite_abs_inf ": Polite_abs_inf, # bq, U, - "Polite_abs_pol ": Polite_abs_pol, # bq, U, - "Polite_erg_inf ": Polite_erg_inf, # bq, U, - "Polite_erg_pol ": Polite_erg_pol, # bq, U, - "Polite_dat_inf ": Polite_dat_inf, # bq, U, - "Polite_dat_pol ": Polite_dat_pol, # bq, U, - "Prefix_yes ": Prefix_yes, # U, - "PrepCase_npr ": PrepCase_npr, # cz, - "PrepCase_pre ": PrepCase_pre, # U, - "PunctSide_ini ": PunctSide_ini, # U, - "PunctSide_fin ": PunctSide_fin, # U, - "PunctType_peri ": PunctType_peri, # U, - "PunctType_qest ": PunctType_qest, # U, - "PunctType_excl ": PunctType_excl, # U, - "PunctType_quot ": PunctType_quot, # U, - "PunctType_brck ": PunctType_brck, # U, - "PunctType_comm ": PunctType_comm, # U, - "PunctType_colo ": PunctType_colo, # U, - "PunctType_semi ": PunctType_semi, # U, - "PunctType_dash ": PunctType_dash, # U, - "Style_arch ": Style_arch, # cz, fi, U, - "Style_rare ": Style_rare, # cz, fi, U, - "Style_poet ": Style_poet, # cz, U, - "Style_norm ": Style_norm, # cz, U, - "Style_coll ": Style_coll, # cz, U, - "Style_vrnc ": Style_vrnc, # cz, U, - "Style_sing ": Style_sing, # cz, U, - "Style_expr ": Style_expr, # cz, U, - "Style_derg ": Style_derg, # cz, U, - "Style_vulg ": Style_vulg, # cz, U, - "Style_yes ": Style_yes, # fi, U, - "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, - "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, - "VerbType_aux ": VerbType_aux, # U, - "VerbType_cop ": VerbType_cop, # U, - "VerbType_mod ": VerbType_mod, # U, - "VerbType_light ": VerbType_light, # U, + "ConjType_oper ": ConjType_oper, # cz, U, + "ConjType_comp ": ConjType_comp, # cz, U, + "Connegative_yes ": Connegative_yes, # fi, + "Derivation_minen ": Derivation_minen, # fi, + "Derivation_sti ": Derivation_sti, # fi, + "Derivation_inen ": Derivation_inen, # fi, + "Derivation_lainen ": Derivation_lainen, # fi, + "Derivation_ja ": Derivation_ja, # fi, + "Derivation_ton ": Derivation_ton, # fi, + "Derivation_vs ": Derivation_vs, # fi, + "Derivation_ttain ": Derivation_ttain, # fi, + "Derivation_ttaa ": Derivation_ttaa, # fi, + "Echo_rdp ": Echo_rdp, # U, + "Echo_ech ": Echo_ech, # U, + "Foreign_foreign ": Foreign_foreign, # cz, fi, U, + "Foreign_fscript ": Foreign_fscript, # cz, fi, U, + "Foreign_tscript ": Foreign_tscript, # cz, U, + "Foreign_yes ": Foreign_yes, # sl, + "Gender_dat_masc ": Gender_dat_masc, # bq, U, + "Gender_dat_fem ": Gender_dat_fem, # bq, U, + "Gender_erg_masc ": Gender_erg_masc, # bq, + "Gender_erg_fem ": Gender_erg_fem, # bq, + "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, + "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, + "Gender_psor_neut ": Gender_psor_neut, # sl, + "Hyph_yes ": Hyph_yes, # cz, U, + "InfForm_one ": InfForm_one, # fi, + "InfForm_two ": InfForm_two, # fi, + "InfForm_three ": InfForm_three, # fi, + "NameType_geo ": NameType_geo, # U, cz, + "NameType_prs ": NameType_prs, # U, cz, + "NameType_giv ": NameType_giv, # U, cz, + "NameType_sur ": NameType_sur, # U, cz, + "NameType_nat ": NameType_nat, # U, cz, + "NameType_com ": NameType_com, # U, cz, + "NameType_pro ": NameType_pro, # U, cz, + "NameType_oth ": NameType_oth, # U, cz, + "NounType_com ": NounType_com, # U, + "NounType_prop ": NounType_prop, # U, + "NounType_class ": NounType_class, # U, + "Number_abs_sing ": Number_abs_sing, # bq, U, + "Number_abs_plur ": Number_abs_plur, # bq, U, + 
"Number_dat_sing ": Number_dat_sing, # bq, U, + "Number_dat_plur ": Number_dat_plur, # bq, U, + "Number_erg_sing ": Number_erg_sing, # bq, U, + "Number_erg_plur ": Number_erg_plur, # bq, U, + "Number_psee_sing ": Number_psee_sing, # U, + "Number_psee_plur ": Number_psee_plur, # U, + "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, + "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, + "NumForm_digit ": NumForm_digit, # cz, sl, U, + "NumForm_roman ": NumForm_roman, # cz, sl, U, + "NumForm_word ": NumForm_word, # cz, sl, U, + "NumValue_one ": NumValue_one, # cz, U, + "NumValue_two ": NumValue_two, # cz, U, + "NumValue_three ": NumValue_three, # cz, U, + "PartForm_pres ": PartForm_pres, # fi, + "PartForm_past ": PartForm_past, # fi, + "PartForm_agt ": PartForm_agt, # fi, + "PartForm_neg ": PartForm_neg, # fi, + "PartType_mod ": PartType_mod, # U, + "PartType_emp ": PartType_emp, # U, + "PartType_res ": PartType_res, # U, + "PartType_inf ": PartType_inf, # U, + "PartType_vbp ": PartType_vbp, # U, + "Person_abs_one ": Person_abs_one, # bq, U, + "Person_abs_two ": Person_abs_two, # bq, U, + "Person_abs_three ": Person_abs_three, # bq, U, + "Person_dat_one ": Person_dat_one, # bq, U, + "Person_dat_two ": Person_dat_two, # bq, U, + "Person_dat_three ": Person_dat_three, # bq, U, + "Person_erg_one ": Person_erg_one, # bq, U, + "Person_erg_two ": Person_erg_two, # bq, U, + "Person_erg_three ": Person_erg_three, # bq, U, + "Person_psor_one ": Person_psor_one, # fi, U, + "Person_psor_two ": Person_psor_two, # fi, U, + "Person_psor_three ": Person_psor_three, # fi, U, + "Polite_inf ": Polite_inf, # bq, U, + "Polite_pol ": Polite_pol, # bq, U, + "Polite_abs_inf ": Polite_abs_inf, # bq, U, + "Polite_abs_pol ": Polite_abs_pol, # bq, U, + "Polite_erg_inf ": Polite_erg_inf, # bq, U, + "Polite_erg_pol ": Polite_erg_pol, # bq, U, + "Polite_dat_inf ": Polite_dat_inf, # bq, U, + "Polite_dat_pol ": Polite_dat_pol, # bq, U, + "Prefix_yes ": Prefix_yes, # U, + "PrepCase_npr ": PrepCase_npr, # cz, + "PrepCase_pre ": PrepCase_pre, # U, + "PunctSide_ini ": PunctSide_ini, # U, + "PunctSide_fin ": PunctSide_fin, # U, + "PunctType_peri ": PunctType_peri, # U, + "PunctType_qest ": PunctType_qest, # U, + "PunctType_excl ": PunctType_excl, # U, + "PunctType_quot ": PunctType_quot, # U, + "PunctType_brck ": PunctType_brck, # U, + "PunctType_comm ": PunctType_comm, # U, + "PunctType_colo ": PunctType_colo, # U, + "PunctType_semi ": PunctType_semi, # U, + "PunctType_dash ": PunctType_dash, # U, + "Style_arch ": Style_arch, # cz, fi, U, + "Style_rare ": Style_rare, # cz, fi, U, + "Style_poet ": Style_poet, # cz, U, + "Style_norm ": Style_norm, # cz, U, + "Style_coll ": Style_coll, # cz, U, + "Style_vrnc ": Style_vrnc, # cz, U, + "Style_sing ": Style_sing, # cz, U, + "Style_expr ": Style_expr, # cz, U, + "Style_derg ": Style_derg, # cz, U, + "Style_vulg ": Style_vulg, # cz, U, + "Style_yes ": Style_yes, # fi, U, + "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, + "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, + "VerbType_aux ": VerbType_aux, # U, + "VerbType_cop ": VerbType_cop, # U, + "VerbType_mod ": VerbType_mod, # U, + "VerbType_light ": VerbType_light, # U, } NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] +# Unfortunate hack here, to work around problem with long cpdef enum +# (which is generating an enormous amount of C++ in Cython 0.24+) +# We keep the enum cdef, and just make sure the names are available to Python +locals().update(IDS) diff --git 
a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 38d5959b6..3925a6738 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -8,7 +8,7 @@ IDS = { "ADP": ADP, "ADV": ADV, "AUX": AUX, - "CONJ": CONJ, # U20 + "CONJ": CONJ, # U20 "CCONJ": CCONJ, "DET": DET, "INTJ": INTJ, diff --git a/spacy/pipeline.pxd b/spacy/pipeline.pxd index e9b7f0f73..e69de29bb 100644 --- a/spacy/pipeline.pxd +++ b/spacy/pipeline.pxd @@ -1,21 +0,0 @@ -from .syntax.parser cimport Parser -#from .syntax.beam_parser cimport BeamParser -from .syntax.ner cimport BiluoPushDown -from .syntax.arc_eager cimport ArcEager -from .tagger cimport Tagger - - -cdef class EntityRecognizer(Parser): - pass - - -cdef class DependencyParser(Parser): - pass - - -#cdef class BeamEntityRecognizer(BeamParser): -# pass -# -# -#cdef class BeamDependencyParser(BeamParser): -# pass diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 7e00a443d..842e27069 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -3,60 +3,44 @@ # coding: utf8 from __future__ import unicode_literals -from thinc.api import chain, layerize, with_getitem -from thinc.neural import Model, Softmax import numpy cimport numpy as np import cytoolz -import util from collections import OrderedDict import ujson import msgpack -from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.neural import Model, Maxout, Softmax, Affine -from thinc.neural._classes.hash_embed import HashEmbed +from thinc.api import chain +from thinc.v2v import Softmax +from thinc.t2v import Pooling, max_pool, mean_pool from thinc.neural.util import to_categorical - -from thinc.neural.pooling import Pooling, max_pool, mean_pool from thinc.neural._classes.difference import Siamese, CauchySimilarity -from thinc.neural._classes.convolution import ExtractWindow -from thinc.neural._classes.resnet import Residual -from thinc.neural._classes.batchnorm import BatchNorm as BN - from .tokens.doc cimport Doc -from .syntax.parser cimport Parser as LinearParser -from .syntax.nn_parser cimport Parser as NeuralParser -from .syntax.parser import get_templates as get_feature_templates -from .syntax.beam_parser cimport BeamParser +from .syntax.nn_parser cimport Parser +from .syntax import nonproj from .syntax.ner cimport BiluoPushDown from .syntax.arc_eager cimport ArcEager -from .tagger import Tagger -from .syntax.stateclass cimport StateClass -from .gold cimport GoldParse from .morphology cimport Morphology from .vocab cimport Vocab from .syntax import nonproj from .compat import json_dumps -from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS -from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats -from ._ml import build_text_classifier, build_tagger_model +from .attrs import POS from .parts_of_speech import X +from ._ml import Tok2Vec, build_text_classifier, build_tagger_model +from ._ml import link_vectors_to_models +from . import util class SentenceSegmenter(object): - '''A simple spaCy hook, to allow custom sentence boundary detection logic - (that doesn't require the dependency parse). - - To change the sentence boundary detection strategy, pass a generator - function `strategy` on initialization, or assign a new strategy to - the .strategy attribute. - + """A simple spaCy hook, to allow custom sentence boundary detection logic + (that doesn't require the dependency parse). 
To change the sentence + boundary detection strategy, pass a generator function `strategy` on + initialization, or assign a new strategy to the .strategy attribute. Sentence detection strategies should be generators that take `Doc` objects and yield `Span` objects for each sentence. - ''' + """ name = 'sbd' def __init__(self, vocab, strategy=None): @@ -67,6 +51,7 @@ class SentenceSegmenter(object): def __call__(self, doc): doc.user_hooks['sents'] = self.strategy + return doc @staticmethod def split_on_punct(doc): @@ -74,31 +59,48 @@ class SentenceSegmenter(object): seen_period = False for i, word in enumerate(doc): if seen_period and not word.is_punct: - yield doc[start : word.i] + yield doc[start:word.i] start = word.i seen_period = False elif word.text in ['.', '!', '?']: seen_period = True if start < len(doc): - yield doc[start : len(doc)] + yield doc[start:len(doc)] -class BaseThincComponent(object): +class Pipe(object): + """This class is not instantiated directly. Components inherit from it, and + it defines the interface that components should follow to function as + components in a spaCy analysis pipeline. + """ name = None @classmethod def Model(cls, *shape, **kwargs): + """Initialize a model for the pipe.""" raise NotImplementedError def __init__(self, vocab, model=True, **cfg): + """Create a new pipe instance.""" raise NotImplementedError def __call__(self, doc): + """Apply the pipe to one document. The document is + modified in-place, and returned. + + Both __call__ and pipe should delegate to the `predict()` + and `set_annotations()` methods. + """ scores = self.predict([doc]) self.set_annotations([doc], scores) return doc def pipe(self, stream, batch_size=128, n_threads=-1): + """Apply the pipe to a stream of documents. + + Both __call__ and pipe should delegate to the `predict()` + and `set_annotations()` methods. + """ for docs in cytoolz.partition_all(batch_size, stream): docs = list(docs) scores = self.predict(docs) @@ -106,66 +108,88 @@ class BaseThincComponent(object): yield from docs def predict(self, docs): + """Apply the pipeline's model to a batch of docs, without + modifying them. + """ raise NotImplementedError def set_annotations(self, docs, scores): + """Modify a batch of documents, using pre-computed scores.""" raise NotImplementedError - def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None): + """Learn from a batch of documents and gold-standard information, + updating the pipe's model. + + Delegates to predict() and get_loss(). + """ raise NotImplementedError def get_loss(self, docs, golds, scores): + """Find the loss and gradient of loss for the batch of + documents and their predicted scores.""" raise NotImplementedError def begin_training(self, gold_tuples=tuple(), pipeline=None): - token_vector_width = pipeline[0].model.nO + """Initialize the pipe for training, using data exampes if available. 
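Circling back to the SentenceSegmenter introduced above: its contract (a strategy is a generator that takes a `Doc` and yields `Span` objects) is easiest to see with a small, hypothetical strategy; this sketch is not part of the patch:

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    # A strategy takes a Doc and yields one Span per "sentence".
    start = 0
    for i, word in enumerate(doc):
        if word.text == '\n':
            yield doc[start:i]
            start = i + 1
    if start < len(doc):
        yield doc[start:len(doc)]

vocab = Vocab()
doc = Doc(vocab, words=['Hello', 'world', '\n', 'Goodbye', 'world'])
seg = SentenceSegmenter(vocab, strategy=split_on_newlines)
doc = seg(doc)   # installs the doc.user_hooks['sents'] hook and returns the doc
print([span.text for span in doc.sents])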
+ If no model has been initialized yet, the model is added.""" if self.model is True: - self.model = self.Model(1, token_vector_width) + self.model = self.Model(**self.cfg) + link_vectors_to_models(self.vocab) def use_params(self, params): + """Modify the pipe's model, to use the given parameter values.""" with self.model.use_params(params): yield def to_bytes(self, **exclude): - serialize = OrderedDict(( - ('cfg', lambda: json_dumps(self.cfg)), - ('model', lambda: self.model.to_bytes()), - ('vocab', lambda: self.vocab.to_bytes()) - )) + """Serialize the pipe to a bytestring.""" + serialize = OrderedDict() + serialize['cfg'] = lambda: json_dumps(self.cfg) + if self.model in (True, False, None): + serialize['model'] = lambda: self.model + else: + serialize['model'] = self.model.to_bytes + serialize['vocab'] = self.vocab.to_bytes return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): + """Load the pipe from a bytestring.""" def load_model(b): if self.model is True: + self.cfg['pretrained_dims'] = self.vocab.vectors_length self.model = self.Model(**self.cfg) self.model.from_bytes(b) deserialize = OrderedDict(( ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('vocab', lambda b: self.vocab.from_bytes(b)), ('model', load_model), - ('vocab', lambda b: self.vocab.from_bytes(b)) )) util.from_bytes(bytes_data, deserialize, exclude) return self def to_disk(self, path, **exclude): - serialize = OrderedDict(( - ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), - ('model', lambda p: p.open('wb').write(self.model.to_bytes())), - ('vocab', lambda p: self.vocab.to_disk(p)) - )) + """Serialize the pipe to disk.""" + serialize = OrderedDict() + serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg)) + serialize['vocab'] = lambda p: self.vocab.to_disk(p) + if self.model not in (None, True, False): + serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes()) util.to_disk(path, serialize, exclude) def from_disk(self, path, **exclude): + """Load the pipe from disk.""" def load_model(p): if self.model is True: + self.cfg['pretrained_dims'] = self.vocab.vectors_length self.model = self.Model(**self.cfg) self.model.from_bytes(p.open('rb').read()) deserialize = OrderedDict(( ('cfg', lambda p: self.cfg.update(_load_cfg(p))), - ('model', load_model), ('vocab', lambda p: self.vocab.from_disk(p)), + ('model', load_model), )) util.from_disk(path, deserialize, exclude) return self @@ -178,7 +202,7 @@ def _load_cfg(path): return {} -class TokenVectorEncoder(BaseThincComponent): +class Tensorizer(Pipe): """Assign position-sensitive vectors to tokens, using a CNN or RNN.""" name = 'tensorizer' @@ -193,14 +217,14 @@ class TokenVectorEncoder(BaseThincComponent): """ width = util.env_opt('token_vector_width', width) embed_size = util.env_opt('embed_size', embed_size) - return Tok2Vec(width, embed_size, preprocess=None) + return Tok2Vec(width, embed_size, **cfg) def __init__(self, vocab, model=True, **cfg): """Construct a new statistical model. Weights are not allocated on initialisation. - vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` - instance with the `Doc` objects it will process. + vocab (Vocab): A `Vocab` instance. The model must share the same + `Vocab` instance with the `Doc` objects it will process. model (Model): A `Model` instance or `True` allocate one later. **cfg: Config parameters. 
@@ -210,9 +234,10 @@ class TokenVectorEncoder(BaseThincComponent): >>> tok2vec.model = tok2vec.Model(128, 5000) """ self.vocab = vocab - self.doc2feats = doc2feats() self.model = model self.cfg = dict(cfg) + self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] + self.cfg.setdefault('cnn_maxout_pieces', 3) def __call__(self, doc): """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM @@ -243,17 +268,16 @@ class TokenVectorEncoder(BaseThincComponent): """Return a single tensor for a batch of documents. docs (iterable): A sequence of `Doc` objects. - RETURNS (object): Vector representations for each token in the documents. + RETURNS (object): Vector representations for each token in the docs. """ - feats = self.doc2feats(docs) - tokvecs = self.model(feats) + tokvecs = self.model(docs) return tokvecs def set_annotations(self, docs, tokvecses): """Set the tensor attribute for a batch of documents. docs (iterable): A sequence of `Doc` objects. - tokvecs (object): Vector representation for each token in the documents. + tokvecs (object): Vector representation for each token in the docs. """ for doc, tokvecs in zip(docs, tokvecses): assert tokvecs.shape[0] == len(doc) @@ -270,8 +294,7 @@ class TokenVectorEncoder(BaseThincComponent): """ if isinstance(docs, Doc): docs = [docs] - feats = self.doc2feats(docs) - tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop) + tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop) return tokvecs, bp_tokvecs def get_loss(self, docs, golds, scores): @@ -285,40 +308,42 @@ class TokenVectorEncoder(BaseThincComponent): gold_tuples (iterable): Gold-standard training data. pipeline (list): The pipeline the model is part of. """ - self.doc2feats = doc2feats() if self.model is True: - self.model = self.Model() + self.cfg['pretrained_dims'] = self.vocab.vectors_length + self.model = self.Model(**self.cfg) + link_vectors_to_models(self.vocab) -class NeuralTagger(BaseThincComponent): +class Tagger(Pipe): name = 'tagger' + def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model self.cfg = dict(cfg) + self.cfg.setdefault('cnn_maxout_pieces', 2) + self.cfg.setdefault('pretrained_dims', + self.vocab.vectors.data.shape[1]) def __call__(self, doc): - tags = self.predict(([doc], [doc.tensor])) + tags = self.predict([doc]) self.set_annotations([doc], tags) return doc def pipe(self, stream, batch_size=128, n_threads=-1): for docs in cytoolz.partition_all(batch_size, stream): docs = list(docs) - tokvecs = [d.tensor for d in docs] - tag_ids = self.predict((docs, tokvecs)) + tag_ids = self.predict(docs) self.set_annotations(docs, tag_ids) yield from docs - def predict(self, docs_tokvecs): - scores = self.model(docs_tokvecs) + def predict(self, docs): + scores = self.model(docs) scores = self.model.ops.flatten(scores) guesses = scores.argmax(axis=1) if not isinstance(guesses, numpy.ndarray): guesses = guesses.get() - tokvecs = docs_tokvecs[1] - guesses = self.model.ops.unflatten(guesses, - [tv.shape[0] for tv in tokvecs]) + guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs]) return guesses def set_annotations(self, docs, batch_tag_ids): @@ -338,25 +363,21 @@ class NeuralTagger(BaseThincComponent): idx += 1 doc.is_tagged = True - def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None): if losses is not None and self.name not in losses: losses[self.name] = 0. 
- docs, tokvecs = docs_tokvecs - if self.model.nI is None: - self.model.nI = tokvecs[0].shape[1] - tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) + tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) + bp_tag_scores(d_tag_scores, sgd=sgd) - d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) if losses is not None: losses[self.name] += loss - return d_tokvecs def get_loss(self, docs, golds, scores): scores = self.model.ops.flatten(scores) - tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)} - + tag_index = {tag: i + for i, tag in enumerate(self.vocab.morphology.tag_names)} cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype='i') guesses = scores.argmax(axis=1) @@ -385,41 +406,44 @@ class NeuralTagger(BaseThincComponent): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} - if 'SP' not in new_tag_map: - new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X}) cdef Vocab vocab = self.vocab if new_tag_map: vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer, exc=vocab.morphology.exc) - token_vector_width = pipeline[0].model.nO if self.model is True: - self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) + self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] + self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + link_vectors_to_models(self.vocab) @classmethod - def Model(cls, n_tags, token_vector_width): - return build_tagger_model(n_tags, token_vector_width) - + def Model(cls, n_tags, **cfg): + return build_tagger_model(n_tags, **cfg) + def use_params(self, params): with self.model.use_params(params): yield def to_bytes(self, **exclude): - serialize = OrderedDict(( - ('model', lambda: self.model.to_bytes()), - ('vocab', lambda: self.vocab.to_bytes()), - ('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map, - use_bin_type=True, - encoding='utf8')) - )) + serialize = OrderedDict() + if self.model in (None, True, False): + serialize['model'] = lambda: self.model + else: + serialize['model'] = self.model.to_bytes + serialize['vocab'] = self.vocab.to_bytes + + serialize['tag_map'] = lambda: msgpack.dumps( + self.vocab.morphology.tag_map, use_bin_type=True, encoding='utf8') return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): def load_model(b): if self.model is True: - token_vector_width = util.env_opt('token_vector_width', - self.cfg.get('token_vector_width', 128)) - self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) + token_vector_width = util.env_opt( + 'token_vector_width', + self.cfg.get('token_vector_width', 128)) + self.model = self.Model(self.vocab.morphology.n_tags, + **self.cfg) self.model.from_bytes(b) def load_tag_map(b): @@ -428,7 +452,7 @@ class NeuralTagger(BaseThincComponent): self.vocab.strings, tag_map=tag_map, lemmatizer=self.vocab.morphology.lemmatizer, exc=self.vocab.morphology.exc) - + deserialize = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), ('tag_map', load_tag_map), @@ -438,6 +462,7 @@ class NeuralTagger(BaseThincComponent): return self def to_disk(self, path, **exclude): + self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] serialize = OrderedDict(( ('vocab', lambda p: self.vocab.to_disk(p)), ('tag_map', lambda p: p.open('wb').write(msgpack.dumps( @@ -452,9 +477,7 @@ class NeuralTagger(BaseThincComponent): def from_disk(self, path, **exclude): 
def load_model(p): if self.model is True: - token_vector_width = util.env_opt('token_vector_width', - self.cfg.get('token_vector_width', 128)) - self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) + self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) self.model.from_bytes(p.open('rb').read()) def load_tag_map(p): @@ -466,21 +489,43 @@ class NeuralTagger(BaseThincComponent): exc=self.vocab.morphology.exc) deserialize = OrderedDict(( + ('cfg', lambda p: self.cfg.update(_load_cfg(p))), ('vocab', lambda p: self.vocab.from_disk(p)), ('tag_map', load_tag_map), ('model', load_model), - ('cfg', lambda p: self.cfg.update(_load_cfg(p))) )) util.from_disk(path, deserialize, exclude) return self -class NeuralLabeller(NeuralTagger): +class MultitaskObjective(Tagger): + """Experimental: Assist training of a parser or tagger, by training a + side-objective. + """ name = 'nn_labeller' - def __init__(self, vocab, model=True, **cfg): + + def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): self.vocab = vocab self.model = model + if target == 'dep': + self.make_label = self.make_dep + elif target == 'tag': + self.make_label = self.make_tag + elif target == 'ent': + self.make_label = self.make_ent + elif target == 'dep_tag_offset': + self.make_label = self.make_dep_tag_offset + elif target == 'ent_tag': + self.make_label = self.make_ent_tag + elif hasattr(target, '__call__'): + self.make_label = target + else: + raise ValueError("MultitaskObjective target should be function or " + "one of: dep, tag, ent, dep_tag_offset, ent_tag.") self.cfg = dict(cfg) + self.cfg.setdefault('cnn_maxout_pieces', 2) + self.cfg.setdefault('pretrained_dims', + self.vocab.vectors.data.shape[1]) @property def labels(self): @@ -493,58 +538,94 @@ class NeuralLabeller(NeuralTagger): def set_annotations(self, docs, dep_ids): pass - def begin_training(self, gold_tuples=tuple(), pipeline=None): + def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None): gold_tuples = nonproj.preprocess_training_data(gold_tuples) for raw_text, annots_brackets in gold_tuples: for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots - for dep in deps: - if dep not in self.labels: - self.labels[dep] = len(self.labels) - token_vector_width = pipeline[0].model.nO + for i in range(len(ids)): + label = self.make_label(i, words, tags, heads, deps, ents) + if label is not None and label not in self.labels: + self.labels[label] = len(self.labels) if self.model is True: - self.model = self.Model(len(self.labels), token_vector_width) + token_vector_width = util.env_opt('token_vector_width') + self.model = chain( + tok2vec, + Softmax(len(self.labels), token_vector_width) + ) + link_vectors_to_models(self.vocab) @classmethod - def Model(cls, n_tags, token_vector_width): - return build_tagger_model(n_tags, token_vector_width) - + def Model(cls, n_tags, tok2vec=None, **cfg): + return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg) + def get_loss(self, docs, golds, scores): - scores = self.model.ops.flatten(scores) cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype='i') guesses = scores.argmax(axis=1) for gold in golds: - for tag in gold.labels: - if tag is None or tag not in self.labels: + for i in range(len(gold.labels)): + label = self.make_label(i, gold.words, gold.tags, gold.heads, + gold.labels, gold.ents) + if label is None or label not in self.labels: correct[idx] = guesses[idx] else: - correct[idx] = self.labels[tag] + correct[idx] = 
self.labels[label] idx += 1 correct = self.model.ops.xp.array(correct, dtype='i') d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) d_scores /= d_scores.shape[0] loss = (d_scores**2).sum() - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores + @staticmethod + def make_dep(i, words, tags, heads, deps, ents): + if deps[i] is None or heads[i] is None: + return None + return deps[i] -class SimilarityHook(BaseThincComponent): + @staticmethod + def make_tag(i, words, tags, heads, deps, ents): + return tags[i] + + @staticmethod + def make_ent(i, words, tags, heads, deps, ents): + if ents is None: + return None + return ents[i] + + @staticmethod + def make_dep_tag_offset(i, words, tags, heads, deps, ents): + if deps[i] is None or heads[i] is None: + return None + offset = heads[i] - i + offset = min(offset, 2) + offset = max(offset, -2) + return '%s-%s:%d' % (deps[i], tags[i], offset) + + @staticmethod + def make_ent_tag(i, words, tags, heads, deps, ents): + if ents is None or ents[i] is None: + return None + else: + return '%s-%s' % (tags[i], ents[i]) + + +class SimilarityHook(Pipe): """ - Experimental + Experimental: A pipeline component to install a hook for supervised + similarity into `Doc` objects. Requires a `Tensorizer` to pre-process + documents. The similarity model can be any object obeying the Thinc `Model` + interface. By default, the model concatenates the elementwise mean and + elementwise max of the two tensors, and compares them using the + Cauchy-like similarity function from Chen (2013): - A pipeline component to install a hook for supervised similarity into - Doc objects. Requires a Tensorizer to pre-process documents. The similarity - model can be any object obeying the Thinc Model interface. By default, - the model concatenates the elementwise mean and elementwise max of the two - tensors, and compares them using the Cauchy-like similarity function - from Chen (2013): - - similarity = 1. / (1. + (W * (vec1-vec2)**2).sum()) + >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum()) Where W is a vector of dimension weights, initialized to 1. """ name = 'similarity' + def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model @@ -555,7 +636,7 @@ class SimilarityHook(BaseThincComponent): return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) def __call__(self, doc): - '''Install similarity hook''' + """Install similarity hook""" doc.user_hooks['similarity'] = self.predict return doc @@ -564,28 +645,23 @@ class SimilarityHook(BaseThincComponent): yield self(doc) def predict(self, doc1, doc2): - return self.model.predict([(doc1.tensor, doc2.tensor)]) + return self.model.predict([(doc1, doc2)]) - def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.): - doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2 - sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s), - drop=drop) - d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd) - - return d_tensor1s, d_tensor2s + def update(self, doc1_doc2, golds, sgd=None, drop=0.): + sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) def begin_training(self, _=tuple(), pipeline=None): - """ - Allocate model, using width from tensorizer in pipeline. + """Allocate model, using width from tensorizer in pipeline. gold_tuples (iterable): Gold-standard training data. pipeline (list): The pipeline the model is part of. 
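The Cauchy-like similarity quoted in the SimilarityHook docstring above can be sanity-checked numerically; a tiny sketch with numpy (illustrative only, with the weight vector W set to ones, as in the docstring's initialization):

import numpy as np

def cauchy_similarity(vec1, vec2, W):
    # similarity = 1. / (1. + (W * (vec1 - vec2) ** 2).sum())
    return 1. / (1. + (W * (vec1 - vec2) ** 2).sum())

vec1 = np.array([1.0, 2.0, 3.0])
vec2 = np.array([1.0, 2.5, 2.0])
W = np.ones(3)
print(cauchy_similarity(vec1, vec1, W))   # identical vectors -> 1.0
print(cauchy_similarity(vec1, vec2, W))   # 1 / (1 + 0.25 + 1.0) ~= 0.444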
""" if self.model is True: self.model = self.Model(pipeline[0].model.nO) + link_vectors_to_models(self.vocab) -class TextCategorizer(BaseThincComponent): +class TextCategorizer(Pipe): name = 'textcat' @classmethod @@ -627,23 +703,27 @@ class TextCategorizer(BaseThincComponent): for j, label in enumerate(self.labels): doc.cats[label] = float(scores[i, j]) - def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): - docs, tensors = docs_tensors + def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): scores, bp_scores = self.model.begin_update(docs, drop=drop) loss, d_scores = self.get_loss(docs, golds, scores) - d_tensors = bp_scores(d_scores, sgd=sgd) + bp_scores(d_scores, sgd=sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += loss - return d_tensors def get_loss(self, docs, golds, scores): truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') + not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f') for i, gold in enumerate(golds): for j, label in enumerate(self.labels): - truths[i, j] = label in gold.cats + if label in gold.cats: + truths[i, j] = gold.cats[label] + else: + not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) + not_missing = self.model.ops.asarray(not_missing) d_scores = (scores-truths) / scores.shape[0] + d_scores *= not_missing mean_square_error = ((scores-truths)**2).sum(axis=1).mean() return mean_square_error, d_scores @@ -653,80 +733,52 @@ class TextCategorizer(BaseThincComponent): else: token_vector_width = 64 if self.model is True: + self.cfg['pretrained_dims'] = self.vocab.vectors_length self.model = self.Model(len(self.labels), token_vector_width, **self.cfg) + link_vectors_to_models(self.vocab) -cdef class EntityRecognizer(LinearParser): - """Annotate named entities on Doc objects.""" - TransitionSystem = BiluoPushDown - - feature_templates = get_feature_templates('ner') - - def add_label(self, label): - LinearParser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - -cdef class BeamEntityRecognizer(BeamParser): - """Annotate named entities on Doc objects.""" - TransitionSystem = BiluoPushDown - - feature_templates = get_feature_templates('ner') - - def add_label(self, label): - LinearParser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - -cdef class DependencyParser(LinearParser): - TransitionSystem = ArcEager - feature_templates = get_feature_templates('basic') - - def add_label(self, label): - LinearParser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - -cdef class NeuralDependencyParser(NeuralParser): +cdef class DependencyParser(Parser): name = 'parser' TransitionSystem = ArcEager + @property + def postprocesses(self): + return [nonproj.deprojectivize] + + def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): + for target in []: + labeller = MultitaskObjective(self.vocab, target=target) + tok2vec = self.model[0] + labeller.begin_training(gold_tuples, pipeline=pipeline, + tok2vec=tok2vec) + pipeline.append(labeller) + self._multitasks.append(labeller) + def __reduce__(self): - return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) + return (DependencyParser, (self.vocab, self.moves, self.model), + None, None) -cdef class NeuralEntityRecognizer(NeuralParser): +cdef class EntityRecognizer(Parser): name = 'ner' TransitionSystem = BiluoPushDown nr_feature = 6 - def 
predict_confidences(self, docs): - tensors = [d.tensor for d in docs] - samples = [] - for i in range(10): - states = self.parse_batch(docs, tensors, drop=0.3) - for state in states: - samples.append(self._get_entities(state)) + def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): + for target in []: + labeller = MultitaskObjective(self.vocab, target=target) + tok2vec = self.model[0] + labeller.begin_training(gold_tuples, pipeline=pipeline, + tok2vec=tok2vec) + pipeline.append(labeller) + self._multitasks.append(labeller) def __reduce__(self): - return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) + return (EntityRecognizer, (self.vocab, self.moves, self.model), + None, None) -cdef class BeamDependencyParser(BeamParser): - TransitionSystem = ArcEager - - feature_templates = get_feature_templates('basic') - - def add_label(self, label): - Parser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - -__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', - 'BeamEntityRecognizer', 'TokenVectorEnoder'] +__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer'] diff --git a/spacy/scorer.py b/spacy/scorer.py index b1ce3faa4..673df132c 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -74,18 +74,21 @@ class Scorer(object): @property def scores(self): return { - 'uas': self.uas, 'las': self.las, - 'ents_p': self.ents_p, 'ents_r': self.ents_r, 'ents_f': self.ents_f, + 'uas': self.uas, + 'las': self.las, + 'ents_p': self.ents_p, + 'ents_r': self.ents_r, + 'ents_f': self.ents_f, 'tags_acc': self.tags_acc, 'token_acc': self.token_acc } def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): assert len(tokens) == len(gold) - gold_deps = set() gold_tags = set() - gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot])) + gold_ents = set(tags_to_entities([annot[-1] + for annot in gold.orig_annot])) for id_, word, tag, head, dep, ner in gold.orig_annot: gold_tags.add((id_, tag)) if dep not in (None, "") and dep.lower() not in punct_labels: diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 0ad403cf1..4f987baed 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -21,11 +21,9 @@ ctypedef union Utf8Str: cdef class StringStore: cdef Pool mem - cdef bint is_frozen cdef vector[hash_t] keys cdef public PreshMap _map - cdef public PreshMap _oov cdef const Utf8Str* intern_unicode(self, unicode py_string) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 6f676c79a..647f140bb 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import cimport cython from libc.string cimport memcpy -from libc.stdint cimport uint64_t, uint32_t -from murmurhash.mrmr cimport hash64, hash32 -from preshed.maps cimport map_iter, key_t from libc.stdint cimport uint32_t +from murmurhash.mrmr cimport hash64, hash32 import ujson -import dill from .symbols import IDS as SYMBOLS_BY_STR from .symbols import NAMES as SYMBOLS_BY_INT - from .typedefs cimport hash_t -from . import util from .compat import json_dumps +from . 
import util cpdef hash_t hash_string(unicode string) except 0: @@ -86,8 +82,6 @@ cdef class StringStore: """ self.mem = Pool() self._map = PreshMap() - self._oov = PreshMap() - self.is_frozen = freeze if strings is not None: for string in strings: self.add(string) @@ -197,7 +191,7 @@ cdef class StringStore: """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be either strings or Path-like objects. """ path = util.ensure_path(path) strings = list(self) @@ -227,7 +221,7 @@ cdef class StringStore: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `StringStore` object. """ - return ujson.dumps(list(self)) + return json_dumps(list(self)) def from_bytes(self, bytes_data, **exclude): """Load state from a binary string. @@ -243,21 +237,12 @@ cdef class StringStore: self.add(word) return self - def set_frozen(self, bint is_frozen): - # TODO - self.is_frozen = is_frozen - - def flush_oov(self): - self._oov = PreshMap() - - def _reset_and_load(self, strings, freeze=False): + def _reset_and_load(self, strings): self.mem = Pool() self._map = PreshMap() - self._oov = PreshMap() self.keys.clear() for string in strings: self.add(string) - self.is_frozen = freeze cdef const Utf8Str* intern_unicode(self, unicode py_string): # 0 means missing, but we don't bother offsetting the index. @@ -272,18 +257,6 @@ cdef class StringStore: cdef Utf8Str* value = self._map.get(key) if value is not NULL: return value - value = self._oov.get(key) - if value is not NULL: - return value - if self.is_frozen: - # OOV store uses 32 bit hashes. Pretty ugly :( - key32 = hash32_utf8(utf8_string, length) - # Important: Make the OOV store own the memory. That way it's trivial - # to flush them all. - value = _allocate(self._oov.mem, utf8_string, length) - self._oov.set(key32, value) - return NULL - value = _allocate(self.mem, utf8_string, length) self._map.set(key, value) self.keys.push_back(key) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 3c60cd87f..cfcadc3d0 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -61,13 +61,13 @@ cdef struct TokenC: attr_t sense int head attr_t dep - bint sent_start uint32_t l_kids uint32_t r_kids uint32_t l_edge uint32_t r_edge + int sent_start int ent_iob attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. hash_t ent_id diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 0b713cb21..6960681a3 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -1,4 +1,4 @@ -cpdef enum symbol_t: +cdef enum symbol_t: NIL IS_ALPHA IS_ASCII @@ -13,12 +13,12 @@ cpdef enum symbol_t: LIKE_EMAIL IS_STOP IS_OOV + IS_BRACKET + IS_QUOTE + IS_LEFT_PUNCT + IS_RIGHT_PUNCT - FLAG14 = 14 - FLAG15 - FLAG16 - FLAG17 - FLAG18 + FLAG18 = 18 FLAG19 FLAG20 FLAG21 @@ -455,15 +455,5 @@ cpdef enum symbol_t: root xcomp -# Move these up to FLAG14--FLAG18 once we finish the functionality -# and are ready to regenerate the model. -#IS_BRACKET -#IS_QUOTE -#IS_LEFT_PUNCT -#IS_RIGHT_PUNCT - -# These symbols are currently missing. However, if we add them currently, -# we'll throw off the integer index and the model will have to be retrained. -# We therefore wait until the next data version to add them. 
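The comments removed above explain why IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT and IS_RIGHT_PUNCT could not simply be appended earlier: inserting enum members shifts every later integer and would force models to be retrained. The hunk instead reuses the FLAG14–FLAG17 slots and pins FLAG18 = 18. A rough plain-Python sketch of that numbering logic follows; the real file is a Cython cdef enum where unassigned members auto-increment from the previous value, and the concrete integers here are only inferred from the enum order shown in the diff.

```python
from enum import IntEnum

class Sym(IntEnum):
    IS_OOV = 13          # last symbol before the reshuffled block (inferred value)
    IS_BRACKET = 14      # takes the slot formerly named FLAG14
    IS_QUOTE = 15        # formerly FLAG15
    IS_LEFT_PUNCT = 16   # formerly FLAG16
    IS_RIGHT_PUNCT = 17  # formerly FLAG17
    FLAG18 = 18          # pinned explicitly; later members continue from here
    FLAG19 = 19

# Every symbol from FLAG18 onwards keeps the integer it had before the change,
# so models that stored these IDs stay compatible.
assert Sym.FLAG19 == 19
```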
-# acl - + acl + LAW diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 9f4009579..56422771a 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,6 +1,8 @@ # coding: utf8 +#cython: optimize.unpack_method_calls=False from __future__ import unicode_literals + IDS = { "": NIL, "IS_ALPHA": IS_ALPHA, @@ -16,10 +18,11 @@ IDS = { "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, "IS_OOV": IS_OOV, - "FLAG14": FLAG14, - "FLAG15": FLAG15, - "FLAG16": FLAG16, - "FLAG17": FLAG17, + "IS_BRACKET": IS_BRACKET, + "IS_QUOTE": IS_QUOTE, + "IS_LEFT_PUNCT": IS_LEFT_PUNCT, + "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT, + "FLAG18": FLAG18, "FLAG19": FLAG19, "FLAG20": FLAG20, @@ -161,7 +164,7 @@ IDS = { "Degree_sup": Degree_sup, "Degree_abs": Degree_abs, "Degree_com": Degree_com, - "Degree_dim ": Degree_dim, # du + "Degree_dim": Degree_dim, # du "Degree_equ": Degree_equ, # U20 "Evident_nfh": Evident_nfh, # U20 "Gender_com": Gender_com, @@ -187,8 +190,8 @@ IDS = { "Number_none": Number_none, "Number_plur": Number_plur, "Number_sing": Number_sing, - "Number_ptan ": Number_ptan, # bg - "Number_count ": Number_count, # bg, U20 + "Number_ptan": Number_ptan, # bg + "Number_count": Number_count, # bg, U20 "Number_tri": Number_tri, # U20 "NumType_card": NumType_card, "NumType_dist": NumType_dist, @@ -233,22 +236,22 @@ IDS = { "VerbForm_sup": VerbForm_sup, "VerbForm_trans": VerbForm_trans, "VerbForm_conv": VerbForm_conv, # U20 - "VerbForm_gdv ": VerbForm_gdv, # la, + "VerbForm_gdv": VerbForm_gdv, # la, "VerbForm_vnoun": VerbForm_vnoun, # U20 "Voice_act": Voice_act, "Voice_cau": Voice_cau, "Voice_pass": Voice_pass, - "Voice_mid ": Voice_mid, # gkc, U20 - "Voice_int ": Voice_int, # hb, + "Voice_mid": Voice_mid, # gkc, U20 + "Voice_int": Voice_int, # hb, "Voice_antip": Voice_antip, # U20 "Voice_dir": Voice_dir, # U20 "Voice_inv": Voice_inv, # U20 - "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, - "AdpType_prep ": AdpType_prep, # cz, U, - "AdpType_post ": AdpType_post, # U, - "AdpType_voc ": AdpType_voc, # cz, - "AdpType_comprep ": AdpType_comprep, # cz, - "AdpType_circ ": AdpType_circ, # U, + "Abbr_yes": Abbr_yes, # cz, fi, sl, U, + "AdpType_prep": AdpType_prep, # cz, U, + "AdpType_post": AdpType_post, # U, + "AdpType_voc": AdpType_voc, # cz, + "AdpType_comprep": AdpType_comprep, # cz, + "AdpType_circ": AdpType_circ, # U, "AdvType_man": AdvType_man, "AdvType_loc": AdvType_loc, "AdvType_tim": AdvType_tim, @@ -258,56 +261,56 @@ IDS = { "AdvType_sta": AdvType_sta, "AdvType_ex": AdvType_ex, "AdvType_adadj": AdvType_adadj, - "ConjType_oper ": ConjType_oper, # cz, U, - "ConjType_comp ": ConjType_comp, # cz, U, - "Connegative_yes ": Connegative_yes, # fi, - "Derivation_minen ": Derivation_minen, # fi, - "Derivation_sti ": Derivation_sti, # fi, - "Derivation_inen ": Derivation_inen, # fi, - "Derivation_lainen ": Derivation_lainen, # fi, - "Derivation_ja ": Derivation_ja, # fi, - "Derivation_ton ": Derivation_ton, # fi, - "Derivation_vs ": Derivation_vs, # fi, - "Derivation_ttain ": Derivation_ttain, # fi, - "Derivation_ttaa ": Derivation_ttaa, # fi, - "Echo_rdp ": Echo_rdp, # U, - "Echo_ech ": Echo_ech, # U, - "Foreign_foreign ": Foreign_foreign, # cz, fi, U, - "Foreign_fscript ": Foreign_fscript, # cz, fi, U, - "Foreign_tscript ": Foreign_tscript, # cz, U, - "Foreign_yes ": Foreign_yes, # sl, - "Gender_dat_masc ": Gender_dat_masc, # bq, U, - "Gender_dat_fem ": Gender_dat_fem, # bq, U, - "Gender_erg_masc ": Gender_erg_masc, # bq, - "Gender_erg_fem ": Gender_erg_fem, # bq, - "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, - 
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, - "Gender_psor_neut ": Gender_psor_neut, # sl, - "Hyph_yes ": Hyph_yes, # cz, U, - "InfForm_one ": InfForm_one, # fi, - "InfForm_two ": InfForm_two, # fi, - "InfForm_three ": InfForm_three, # fi, - "NameType_geo ": NameType_geo, # U, cz, - "NameType_prs ": NameType_prs, # U, cz, - "NameType_giv ": NameType_giv, # U, cz, - "NameType_sur ": NameType_sur, # U, cz, - "NameType_nat ": NameType_nat, # U, cz, - "NameType_com ": NameType_com, # U, cz, - "NameType_pro ": NameType_pro, # U, cz, - "NameType_oth ": NameType_oth, # U, cz, - "NounType_com ": NounType_com, # U, - "NounType_prop ": NounType_prop, # U, - "NounType_class ": NounType_class, # U, - "Number_abs_sing ": Number_abs_sing, # bq, U, - "Number_abs_plur ": Number_abs_plur, # bq, U, - "Number_dat_sing ": Number_dat_sing, # bq, U, - "Number_dat_plur ": Number_dat_plur, # bq, U, - "Number_erg_sing ": Number_erg_sing, # bq, U, - "Number_erg_plur ": Number_erg_plur, # bq, U, - "Number_psee_sing ": Number_psee_sing, # U, - "Number_psee_plur ": Number_psee_plur, # U, - "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, - "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, + "ConjType_oper": ConjType_oper, # cz, U, + "ConjType_comp": ConjType_comp, # cz, U, + "Connegative_yes": Connegative_yes, # fi, + "Derivation_minen": Derivation_minen, # fi, + "Derivation_sti": Derivation_sti, # fi, + "Derivation_inen": Derivation_inen, # fi, + "Derivation_lainen": Derivation_lainen, # fi, + "Derivation_ja": Derivation_ja, # fi, + "Derivation_ton": Derivation_ton, # fi, + "Derivation_vs": Derivation_vs, # fi, + "Derivation_ttain": Derivation_ttain, # fi, + "Derivation_ttaa": Derivation_ttaa, # fi, + "Echo_rdp": Echo_rdp, # U, + "Echo_ech": Echo_ech, # U, + "Foreign_foreign": Foreign_foreign, # cz, fi, U, + "Foreign_fscript": Foreign_fscript, # cz, fi, U, + "Foreign_tscript": Foreign_tscript, # cz, U, + "Foreign_yes": Foreign_yes, # sl, + "Gender_dat_masc": Gender_dat_masc, # bq, U, + "Gender_dat_fem": Gender_dat_fem, # bq, U, + "Gender_erg_masc": Gender_erg_masc, # bq, + "Gender_erg_fem": Gender_erg_fem, # bq, + "Gender_psor_masc": Gender_psor_masc, # cz, sl, U, + "Gender_psor_fem": Gender_psor_fem, # cz, sl, U, + "Gender_psor_neut": Gender_psor_neut, # sl, + "Hyph_yes": Hyph_yes, # cz, U, + "InfForm_one": InfForm_one, # fi, + "InfForm_two": InfForm_two, # fi, + "InfForm_three": InfForm_three, # fi, + "NameType_geo": NameType_geo, # U, cz, + "NameType_prs": NameType_prs, # U, cz, + "NameType_giv": NameType_giv, # U, cz, + "NameType_sur": NameType_sur, # U, cz, + "NameType_nat": NameType_nat, # U, cz, + "NameType_com": NameType_com, # U, cz, + "NameType_pro": NameType_pro, # U, cz, + "NameType_oth": NameType_oth, # U, cz, + "NounType_com": NounType_com, # U, + "NounType_prop": NounType_prop, # U, + "NounType_class": NounType_class, # U, + "Number_abs_sing": Number_abs_sing, # bq, U, + "Number_abs_plur": Number_abs_plur, # bq, U, + "Number_dat_sing": Number_dat_sing, # bq, U, + "Number_dat_plur": Number_dat_plur, # bq, U, + "Number_erg_sing": Number_erg_sing, # bq, U, + "Number_erg_plur": Number_erg_plur, # bq, U, + "Number_psee_sing": Number_psee_sing, # U, + "Number_psee_plur": Number_psee_plur, # U, + "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U, + "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U, "Number_pauc": Number_pauc, # U20 "Number_grpa": Number_grpa, # U20 "Number_grpl": Number_grpl, # U20 @@ -352,7 +355,7 @@ IDS = { "Polite_infm": Polite_infm, # U20 "Polite_form": 
Polite_form, # U20 "Polite_form_elev": Polite_form_elev, # U20 - "Polite_form_humb ": Polite_form_humb, # U20 + "Polite_form_humb": Polite_form_humb, # U20 "Prefix_yes": Prefix_yes, # U, "PrepCase_npr": PrepCase_npr, # cz, "PrepCase_pre": PrepCase_pre, # U, @@ -455,7 +458,19 @@ IDS = { "quantmod": quantmod, "rcmod": rcmod, "root": root, - "xcomp": xcomp + "xcomp": xcomp, + + "acl": acl, + "LAW": LAW } -NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] + +def sort_nums(x): + return x[1] + + +NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] +# Unfortunate hack here, to work around problem with long cpdef enum +# (which is generating an enormous amount of C++ in Cython 0.24+) +# We keep the enum cdef, and just make sure the names are available to Python +locals().update(IDS) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 4d90fe23b..54e72a0e8 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -2,7 +2,7 @@ # cython: profile=True cimport numpy as np import numpy -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF +from cpython.ref cimport PyObject, Py_XDECREF from thinc.extra.search cimport Beam from thinc.extra.search import MaxViolation from thinc.typedefs cimport hash_t, class_t @@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation from .transition_system cimport TransitionSystem, Transition from .stateclass cimport StateClass from ..gold cimport GoldParse -from ..tokens.doc cimport Doc # These are passed as callbacks to thinc.search.Beam @@ -21,6 +20,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) moves = _moves dest.clone(src) moves[clas].do(dest.c, moves[clas].label) + dest.c.push_hist(clas) cdef int _check_final_state(void* _state, void* extra_args) except -1: @@ -49,7 +49,7 @@ cdef class ParserBeam(object): cdef public object dones def __init__(self, TransitionSystem moves, states, golds, - int width, float density): + int width, float density): self.moves = moves self.states = states self.golds = golds @@ -58,7 +58,8 @@ cdef class ParserBeam(object): cdef StateClass state, st for state in states: beam = Beam(self.moves.n_moves, width, density) - beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) + beam.initialize(self.moves.init_beam_state, state.c.length, + state.c._sent) for i in range(beam.width): st = beam.at(i) st.c.offset = state.c.offset @@ -73,7 +74,8 @@ cdef class ParserBeam(object): @property def is_done(self): - return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams)) + return all(b.is_done or self.dones[i] + for i, b in enumerate(self.beams)) def __getitem__(self, i): return self.beams[i] @@ -125,7 +127,8 @@ cdef class ParserBeam(object): for i in range(beam.size): state = beam.at(i) if not state.c.is_final(): - self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) + self.moves.set_costs(beam.is_valid[i], beam.costs[i], + state, gold) if follow_gold: for j in range(beam.nr_class): if beam.costs[i][j] >= 1: @@ -145,12 +148,15 @@ def get_token_ids(states, int n_tokens): c_ids += ids.shape[1] return ids + nr_update = 0 + + def update_beam(TransitionSystem moves, int nr_feature, int max_steps, - states, tokvecs, golds, - state2vec, vec2scores, - int width, float density, - sgd=None, losses=None, drop=0.): + states, golds, + state2vec, vec2scores, + int width, float density, int hist_feats, + losses=None, drop=0.): global nr_update cdef MaxViolation violn nr_update += 1 @@ -166,29 
+172,39 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, if pbeam.is_done and gbeam.is_done: break # The beam maps let us find the right row in the flattened scores - # arrays for each state. States are identified by (example id, history). - # We keep a different beam map for each step (since we'll have a flat - # scores array for each step). The beam map will let us take the per-state - # losses, and compute the gradient for each (step, state, class). + # arrays for each state. States are identified by (example id, + # history). We keep a different beam map for each step (since we'll + # have a flat scores array for each step). The beam map will let us + # take the per-state losses, and compute the gradient for each (step, + # state, class). beam_maps.append({}) # Gather all states from the two beams in a list. Some stats may occur # in both beams. To figure out which beam each state belonged to, # we keep two lists of indices, p_indices and g_indices - states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) + states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], + nr_update) if not states: break # Now that we have our flat list of states, feed them through the model token_ids = get_token_ids(states, nr_feature) vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) - scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) + if hist_feats: + hists = numpy.asarray([st.history[:hist_feats] for st in states], + dtype='i') + scores, bp_scores = vec2scores.begin_update((vectors, hists), + drop=drop) + else: + scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) # Store the callbacks for the backward pass backprops.append((token_ids, bp_vectors, bp_scores)) # Unpack the flat scores into lists for the two beams. The indices arrays # tell us which example and state the scores-row refers to. - p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] - g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices] + p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') + for indices in p_indices] + g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') + for indices in g_indices] # Now advance the states in the beams. The gold beam is contrained to # to follow only gold analyses. pbeam.advance(p_scores) @@ -244,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update): def get_gradient(nr_class, beam_maps, histories, losses): - """ - The global model assigns a loss to each parse. The beam scores + """The global model assigns a loss to each parse. The beam scores are additive, so the same gradient is applied to each action in the history. 
This gives the gradient of a single *action* for a beam state -- so we have "the gradient of loss for taking @@ -265,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses): if loss != 0.0 and not numpy.isnan(loss): nr_step = max(nr_step, len(hist)) for i in range(nr_step): - grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f')) + grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), + dtype='f')) assert len(histories) == len(losses) for eg_id, hists in enumerate(histories): for loss, hist in zip(losses[eg_id], hists): @@ -282,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses): grads[j][i, clas] += loss key = key + tuple([clas]) return grads - - diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd deleted file mode 100644 index 0842e3504..000000000 --- a/spacy/syntax/_parse_features.pxd +++ /dev/null @@ -1,259 +0,0 @@ -from thinc.typedefs cimport atom_t - -from .stateclass cimport StateClass -from ._state cimport StateC - - -cdef int fill_context(atom_t* context, const StateC* state) nogil -# Context elements - -# Ensure each token's attributes are listed: w, p, c, c6, c4. The order -# is referenced by incrementing the enum... - -# Tokens are listed in left-to-right order. -#cdef size_t* SLOTS = [ -# S2w, S1w, -# S0l0w, S0l2w, S0lw, -# S0w, -# S0r0w, S0r2w, S0rw, -# N0l0w, N0l2w, N0lw, -# P2w, P1w, -# N0w, N1w, N2w, N3w, 0 -#] - -# NB: The order of the enum is _NOT_ arbitrary!! -cpdef enum: - S2w - S2W - S2p - S2c - S2c4 - S2c6 - S2L - S2_prefix - S2_suffix - S2_shape - S2_ne_iob - S2_ne_type - - S1w - S1W - S1p - S1c - S1c4 - S1c6 - S1L - S1_prefix - S1_suffix - S1_shape - S1_ne_iob - S1_ne_type - - S1rw - S1rW - S1rp - S1rc - S1rc4 - S1rc6 - S1rL - S1r_prefix - S1r_suffix - S1r_shape - S1r_ne_iob - S1r_ne_type - - S0lw - S0lW - S0lp - S0lc - S0lc4 - S0lc6 - S0lL - S0l_prefix - S0l_suffix - S0l_shape - S0l_ne_iob - S0l_ne_type - - S0l2w - S0l2W - S0l2p - S0l2c - S0l2c4 - S0l2c6 - S0l2L - S0l2_prefix - S0l2_suffix - S0l2_shape - S0l2_ne_iob - S0l2_ne_type - - S0w - S0W - S0p - S0c - S0c4 - S0c6 - S0L - S0_prefix - S0_suffix - S0_shape - S0_ne_iob - S0_ne_type - - S0r2w - S0r2W - S0r2p - S0r2c - S0r2c4 - S0r2c6 - S0r2L - S0r2_prefix - S0r2_suffix - S0r2_shape - S0r2_ne_iob - S0r2_ne_type - - S0rw - S0rW - S0rp - S0rc - S0rc4 - S0rc6 - S0rL - S0r_prefix - S0r_suffix - S0r_shape - S0r_ne_iob - S0r_ne_type - - N0l2w - N0l2W - N0l2p - N0l2c - N0l2c4 - N0l2c6 - N0l2L - N0l2_prefix - N0l2_suffix - N0l2_shape - N0l2_ne_iob - N0l2_ne_type - - N0lw - N0lW - N0lp - N0lc - N0lc4 - N0lc6 - N0lL - N0l_prefix - N0l_suffix - N0l_shape - N0l_ne_iob - N0l_ne_type - - N0w - N0W - N0p - N0c - N0c4 - N0c6 - N0L - N0_prefix - N0_suffix - N0_shape - N0_ne_iob - N0_ne_type - - N1w - N1W - N1p - N1c - N1c4 - N1c6 - N1L - N1_prefix - N1_suffix - N1_shape - N1_ne_iob - N1_ne_type - - N2w - N2W - N2p - N2c - N2c4 - N2c6 - N2L - N2_prefix - N2_suffix - N2_shape - N2_ne_iob - N2_ne_type - - P1w - P1W - P1p - P1c - P1c4 - P1c6 - P1L - P1_prefix - P1_suffix - P1_shape - P1_ne_iob - P1_ne_type - - P2w - P2W - P2p - P2c - P2c4 - P2c6 - P2L - P2_prefix - P2_suffix - P2_shape - P2_ne_iob - P2_ne_type - - E0w - E0W - E0p - E0c - E0c4 - E0c6 - E0L - E0_prefix - E0_suffix - E0_shape - E0_ne_iob - E0_ne_type - - E1w - E1W - E1p - E1c - E1c4 - E1c6 - E1L - E1_prefix - E1_suffix - E1_shape - E1_ne_iob - E1_ne_type - - # Misc features at the end - dist - N0lv - S0lv - S0rv - S1lv - S1rv - - S0_has_head - S1_has_head - S2_has_head - - 
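The fill_token comment in this removed _parse_features module explains the masking trick used to take Brown-cluster prefixes: because the cluster bit string is read little-endian, AND-ing with a number whose low n bits are all set keeps the first n bits of the cluster. A tiny illustration with a made-up cluster value:

```python
cluster = 0b1010100111        # hypothetical cluster ID, bit string read little-endian

first_4_bits = cluster & 15   # 15 == 0b1111:   keep the lowest 4 bits -> 0b0111
first_6_bits = cluster & 63   # 63 == 0b111111: keep the lowest 6 bits -> 0b100111

print(bin(first_4_bits), bin(first_6_bits))
```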
CONTEXT_SIZE diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx deleted file mode 100644 index 2e0db4877..000000000 --- a/spacy/syntax/_parse_features.pyx +++ /dev/null @@ -1,419 +0,0 @@ -""" -Fill an array, context, with every _atomic_ value our features reference. -We then write the _actual features_ as tuples of the atoms. The machinery -that translates from the tuples to feature-extractors (which pick the values -out of "context") is in features/extractor.pyx - -The atomic feature names are listed in a big enum, so that the feature tuples -can refer to them. -""" -# coding: utf-8 -from __future__ import unicode_literals - -from libc.string cimport memset -from itertools import combinations -from cymem.cymem cimport Pool - -from ..structs cimport TokenC -from .stateclass cimport StateClass -from ._state cimport StateC - - -cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: - if token is NULL: - context[0] = 0 - context[1] = 0 - context[2] = 0 - context[3] = 0 - context[4] = 0 - context[5] = 0 - context[6] = 0 - context[7] = 0 - context[8] = 0 - context[9] = 0 - context[10] = 0 - context[11] = 0 - else: - context[0] = token.lex.orth - context[1] = token.lemma - context[2] = token.tag - context[3] = token.lex.cluster - # We've read in the string little-endian, so now we can take & (2**n)-1 - # to get the first n bits of the cluster. - # e.g. s = "1110010101" - # s = ''.join(reversed(s)) - # first_4_bits = int(s, 2) - # print first_4_bits - # 5 - # print "{0:b}".format(prefix).ljust(4, '0') - # 1110 - # What we're doing here is picking a number where all bits are 1, e.g. - # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in - # the source that are set to 1. - context[4] = token.lex.cluster & 15 - context[5] = token.lex.cluster & 63 - context[6] = token.dep if token.head != 0 else 0 - context[7] = token.lex.prefix - context[8] = token.lex.suffix - context[9] = token.lex.shape - context[10] = token.ent_iob - context[11] = token.ent_type - -cdef int fill_context(atom_t* ctxt, const StateC* st) nogil: - # Take care to fill every element of context! - # We could memset, but this makes it very easy to have broken features that - # make almost no impact on accuracy. If instead they're unset, the impact - # tends to be dramatic, so we get an obvious regression to fix... 
- fill_token(&ctxt[S2w], st.S_(2)) - fill_token(&ctxt[S1w], st.S_(1)) - fill_token(&ctxt[S1rw], st.R_(st.S(1), 1)) - fill_token(&ctxt[S0lw], st.L_(st.S(0), 1)) - fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2)) - fill_token(&ctxt[S0w], st.S_(0)) - fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2)) - fill_token(&ctxt[S0rw], st.R_(st.S(0), 1)) - fill_token(&ctxt[N0lw], st.L_(st.B(0), 1)) - fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2)) - fill_token(&ctxt[N0w], st.B_(0)) - fill_token(&ctxt[N1w], st.B_(1)) - fill_token(&ctxt[N2w], st.B_(2)) - fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1)) - fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2)) - - fill_token(&ctxt[E0w], st.E_(0)) - fill_token(&ctxt[E1w], st.E_(1)) - - if st.stack_depth() >= 1 and not st.eol(): - ctxt[dist] = min_(st.B(0) - st.E(0), 5) - else: - ctxt[dist] = 0 - ctxt[N0lv] = min_(st.n_L(st.B(0)), 5) - ctxt[S0lv] = min_(st.n_L(st.S(0)), 5) - ctxt[S0rv] = min_(st.n_R(st.S(0)), 5) - ctxt[S1lv] = min_(st.n_L(st.S(1)), 5) - ctxt[S1rv] = min_(st.n_R(st.S(1)), 5) - - ctxt[S0_has_head] = 0 - ctxt[S1_has_head] = 0 - ctxt[S2_has_head] = 0 - if st.stack_depth() >= 1: - ctxt[S0_has_head] = st.has_head(st.S(0)) + 1 - if st.stack_depth() >= 2: - ctxt[S1_has_head] = st.has_head(st.S(1)) + 1 - if st.stack_depth() >= 3: - ctxt[S2_has_head] = st.has_head(st.S(2)) + 1 - - -cdef inline int min_(int a, int b) nogil: - return a if a > b else b - - -ner = ( - (N0W,), - (P1W,), - (N1W,), - (P2W,), - (N2W,), - - (P1W, N0W,), - (N0W, N1W), - - (N0_prefix,), - (N0_suffix,), - - (P1_shape,), - (N0_shape,), - (N1_shape,), - (P1_shape, N0_shape,), - (N0_shape, P1_shape,), - (P1_shape, N0_shape, N1_shape), - (N2_shape,), - (P2_shape,), - - #(P2_norm, P1_norm, W_norm), - #(P1_norm, W_norm, N1_norm), - #(W_norm, N1_norm, N2_norm) - - (P2p,), - (P1p,), - (N0p,), - (N1p,), - (N2p,), - - (P1p, N0p), - (N0p, N1p), - (P2p, P1p, N0p), - (P1p, N0p, N1p), - (N0p, N1p, N2p), - - (P2c,), - (P1c,), - (N0c,), - (N1c,), - (N2c,), - - (P1c, N0c), - (N0c, N1c), - - (E0W,), - (E0c,), - (E0p,), - - (E0W, N0W), - (E0c, N0W), - (E0p, N0W), - - (E0p, P1p, N0p), - (E0c, P1c, N0c), - - (E0w, P1c), - (E0p, P1p), - (E0c, P1c), - (E0p, E1p), - (E0c, P1p), - - (E1W,), - (E1c,), - (E1p,), - - (E0W, E1W), - (E0W, E1p,), - (E0p, E1W,), - (E0p, E1W), - - (P1_ne_iob,), - (P1_ne_iob, P1_ne_type), - (N0w, P1_ne_iob, P1_ne_type), - - (N0_shape,), - (N1_shape,), - (N2_shape,), - (P1_shape,), - (P2_shape,), - - (N0_prefix,), - (N0_suffix,), - - (P1_ne_iob,), - (P2_ne_iob,), - (P1_ne_iob, P2_ne_iob), - (P1_ne_iob, P1_ne_type), - (P2_ne_iob, P2_ne_type), - (N0w, P1_ne_iob, P1_ne_type), - - (N0w, N1w), -) - - -unigrams = ( - (S2W, S2p), - (S2c6, S2p), - - (S1W, S1p), - (S1c6, S1p), - - (S0W, S0p), - (S0c6, S0p), - - (N0W, N0p), - (N0p,), - (N0c,), - (N0c6, N0p), - (N0L,), - - (N1W, N1p), - (N1c6, N1p), - - (N2W, N2p), - (N2c6, N2p), - - (S0r2W, S0r2p), - (S0r2c6, S0r2p), - (S0r2L,), - - (S0rW, S0rp), - (S0rc6, S0rp), - (S0rL,), - - (S0l2W, S0l2p), - (S0l2c6, S0l2p), - (S0l2L,), - - (S0lW, S0lp), - (S0lc6, S0lp), - (S0lL,), - - (N0l2W, N0l2p), - (N0l2c6, N0l2p), - (N0l2L,), - - (N0lW, N0lp), - (N0lc6, N0lp), - (N0lL,), -) - - -s0_n0 = ( - (S0W, S0p, N0W, N0p), - (S0c, S0p, N0c, N0p), - (S0c6, S0p, N0c6, N0p), - (S0c4, S0p, N0c4, N0p), - (S0p, N0p), - (S0W, N0p), - (S0p, N0W), - (S0W, N0c), - (S0c, N0W), - (S0p, N0c), - (S0c, N0p), - (S0W, S0rp, N0p), - (S0p, S0rp, N0p), - (S0p, N0lp, N0W), - (S0p, N0lp, N0p), - (S0L, N0p), - (S0p, S0rL, N0p), - (S0p, N0lL, N0p), - (S0p, S0rv, N0p), - (S0p, N0lv, N0p), - (S0c6, S0rL, 
S0r2L, N0p), - (S0p, N0lL, N0l2L, N0p), -) - - -s1_s0 = ( - (S1p, S0p), - (S1p, S0p, S0_has_head), - (S1W, S0p), - (S1W, S0p, S0_has_head), - (S1c, S0p), - (S1c, S0p, S0_has_head), - (S1p, S1rL, S0p), - (S1p, S1rL, S0p, S0_has_head), - (S1p, S0lL, S0p), - (S1p, S0lL, S0p, S0_has_head), - (S1p, S0lL, S0l2L, S0p), - (S1p, S0lL, S0l2L, S0p, S0_has_head), - (S1L, S0L, S0W), - (S1L, S0L, S0p), - (S1p, S1L, S0L, S0p), - (S1p, S0p), -) - - -s1_n0 = ( - (S1p, N0p), - (S1c, N0c), - (S1c, N0p), - (S1p, N0c), - (S1W, S1p, N0p), - (S1p, N0W, N0p), - (S1c6, S1p, N0c6, N0p), - (S1L, N0p), - (S1p, S1rL, N0p), - (S1p, S1rp, N0p), -) - - -s0_n1 = ( - (S0p, N1p), - (S0c, N1c), - (S0c, N1p), - (S0p, N1c), - (S0W, S0p, N1p), - (S0p, N1W, N1p), - (S0c6, S0p, N1c6, N1p), - (S0L, N1p), - (S0p, S0rL, N1p), -) - - -n0_n1 = ( - (N0W, N0p, N1W, N1p), - (N0W, N0p, N1p), - (N0p, N1W, N1p), - (N0c, N0p, N1c, N1p), - (N0c6, N0p, N1c6, N1p), - (N0c, N1c), - (N0p, N1c), -) - -tree_shape = ( - (dist,), - (S0p, S0_has_head, S1_has_head, S2_has_head), - (S0p, S0lv, S0rv), - (N0p, N0lv), -) - -trigrams = ( - (N0p, N1p, N2p), - (S0p, S0lp, S0l2p), - (S0p, S0rp, S0r2p), - (S0p, S1p, S2p), - (S1p, S0p, N0p), - (S0p, S0lp, N0p), - (S0p, N0p, N0lp), - (N0p, N0lp, N0l2p), - - (S0W, S0p, S0rL, S0r2L), - (S0p, S0rL, S0r2L), - - (S0W, S0p, S0lL, S0l2L), - (S0p, S0lL, S0l2L), - - (N0W, N0p, N0lL, N0l2L), - (N0p, N0lL, N0l2L), -) - - -words = ( - S2w, - S1w, - S1rw, - S0lw, - S0l2w, - S0w, - S0r2w, - S0rw, - N0lw, - N0l2w, - N0w, - N1w, - N2w, - P1w, - P2w -) - -tags = ( - S2p, - S1p, - S1rp, - S0lp, - S0l2p, - S0p, - S0r2p, - S0rp, - N0lp, - N0l2p, - N0p, - N1p, - N2p, - P1p, - P2p -) - -labels = ( - S2L, - S1L, - S1rL, - S0lL, - S0l2L, - S0L, - S0r2L, - S0rL, - N0lL, - N0l2L, - N0L, - N1L, - N2L, - P1L, - P2L -) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 3da9e5d4c..5470df470 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -1,7 +1,9 @@ -from libc.string cimport memcpy, memset +from libc.string cimport memcpy, memset, memmove from libc.stdlib cimport malloc, calloc, free from libc.stdint cimport uint32_t, uint64_t +from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno + from murmurhash.mrmr cimport hash64 from ..vocab cimport EMPTY_LEXEME @@ -15,6 +17,23 @@ from ..typedefs cimport attr_t cdef inline bint is_space_token(const TokenC* token) nogil: return Lexeme.c_check_flag(token.lex, IS_SPACE) +cdef struct RingBufferC: + int[8] data + int i + int default + +cdef inline int ring_push(RingBufferC* ring, int value) nogil: + ring.data[ring.i] = value + ring.i += 1 + if ring.i >= 8: + ring.i = 0 + +cdef inline int ring_get(RingBufferC* ring, int i) nogil: + if i >= ring.i: + return ring.default + else: + return ring.data[ring.i-i] + cdef cppclass StateC: int* _stack @@ -23,6 +42,7 @@ cdef cppclass StateC: TokenC* _sent Entity* _ents TokenC _empty_token + RingBufferC _hist int length int offset int _s_i @@ -37,6 +57,12 @@ cdef cppclass StateC: this.shifted = calloc(length + (PADDING * 2), sizeof(bint)) this._sent = calloc(length + (PADDING * 2), sizeof(TokenC)) this._ents = calloc(length + (PADDING * 2), sizeof(Entity)) + if not (this._buffer and this._stack and this.shifted + and this._sent and this._ents): + with gil: + PyErr_SetFromErrno(MemoryError) + PyErr_CheckSignals() + memset(&this._hist, 0, sizeof(this._hist)) this.offset = 0 cdef int i for i in range(length + (PADDING * 2)): @@ -74,6 +100,9 @@ cdef cppclass StateC: free(this.shifted - PADDING) void set_context_tokens(int* 
ids, int n) nogil: + if n == 2: + ids[0] = this.B(0) + ids[1] = this.S(0) if n == 8: ids[0] = this.B(0) ids[1] = this.B(1) @@ -81,7 +110,7 @@ cdef cppclass StateC: ids[3] = this.S(1) ids[4] = this.H(this.S(0)) ids[5] = this.L(this.B(0), 1) - ids[6] = this.L(this.S(0), 2) + ids[6] = this.L(this.S(0), 1) ids[7] = this.R(this.S(0), 1) elif n == 13: ids[0] = this.B(0) @@ -101,9 +130,10 @@ cdef cppclass StateC: elif n == 6: if this.B(0) >= 0: ids[0] = this.B(0) + ids[1] = this.B(0)-1 else: ids[0] = -1 - ids[1] = this.B(0) + ids[1] = -1 ids[2] = this.B(1) ids[3] = this.E(0) if ids[3] >= 1: @@ -120,6 +150,8 @@ cdef cppclass StateC: for i in range(n): if ids[i] >= 0: ids[i] += this.offset + else: + ids[i] = -1 int S(int i) nogil const: if i >= this._s_i: @@ -162,9 +194,9 @@ cdef cppclass StateC: int E(int i) nogil const: if this._e_i <= 0 or this._e_i >= this.length: - return 0 + return -1 if i < 0 or i >= this._e_i: - return 0 + return -1 return this._ents[this._e_i - (i+1)].start int L(int i, int idx) nogil const: @@ -268,13 +300,22 @@ cdef cppclass StateC: sig[8] = this.B_(0)[0] sig[9] = this.E_(0)[0] sig[10] = this.E_(1)[0] - return hash64(sig, sizeof(sig), this._s_i) + return hash64(sig, sizeof(sig), this._s_i) \ + + hash64(&this._hist, sizeof(RingBufferC), 1) + + void push_hist(int act) nogil: + ring_push(&this._hist, act+1) + + int get_hist(int i) nogil: + return ring_get(&this._hist, i) void push() nogil: if this.B(0) != -1: this._stack[this._s_i] = this.B(0) this._s_i += 1 this._b_i += 1 + if this.B_(0).sent_start == 1: + this.set_break(this.B(0)) if this._b_i > this._break: this._break = -1 @@ -351,7 +392,7 @@ cdef cppclass StateC: void set_break(int i) nogil: if 0 <= i < this.length: - this._sent[i].sent_start = True + this._sent[i].sent_start = 1 this._break = this._b_i void clone(const StateC* src) nogil: diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx index 83c831f0b..e69de29bb 100644 --- a/spacy/syntax/_state.pyx +++ b/spacy/syntax/_state.pyx @@ -1 +0,0 @@ -# test diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index d1e1987d7..b3c9b5563 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -4,24 +4,16 @@ # coding: utf-8 from __future__ import unicode_literals -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -import ctypes -from libc.stdint cimport uint32_t -from libc.string cimport memcpy +from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from collections import OrderedDict from thinc.extra.search cimport Beam -import numpy from .stateclass cimport StateClass -from ._state cimport StateC, is_space_token +from ._state cimport StateC from .nonproj import is_nonproj_tree -from .transition_system cimport do_func_t, get_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t -from ..gold cimport GoldParse -from ..gold cimport GoldParseC -from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT -from ..lexeme cimport Lexeme +from ..gold cimport GoldParse, GoldParseC from ..structs cimport TokenC @@ -118,7 +110,7 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: cdef class Shift: @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: - return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start + return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1 @staticmethod cdef int transition(StateC* st, attr_t label) nogil: @@ -178,7 +170,7 @@ cdef class Reduce: cdef class LeftArc: 
@staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: - return not st.B_(0).sent_start + return st.B_(0).sent_start != 1 @staticmethod cdef int transition(StateC* st, attr_t label) nogil: @@ -212,7 +204,8 @@ cdef class LeftArc: cdef class RightArc: @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: - return not st.B_(0).sent_start + # If there's (perhaps partial) parse pre-set, don't allow cycle. + return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0) @staticmethod cdef int transition(StateC* st, attr_t label) nogil: @@ -248,6 +241,10 @@ cdef class Break: return False elif st.stack_depth() < 1: return False + elif st.B_(0).l_edge < 0: + return False + elif st._sent[st.B_(0).l_edge].sent_start < 0: + return False else: return True @@ -311,14 +308,13 @@ cdef class ArcEager(TransitionSystem): @classmethod def get_actions(cls, **kwargs): - actions = kwargs.get('actions', - OrderedDict(( - (SHIFT, ['']), - (REDUCE, ['']), - (RIGHT, []), - (LEFT, []), - (BREAK, ['ROOT']) - ))) + actions = kwargs.get('actions', OrderedDict(( + (SHIFT, ['']), + (REDUCE, ['']), + (RIGHT, []), + (LEFT, []), + (BREAK, ['ROOT'])) + )) seen_actions = set() for label in kwargs.get('left_labels', []): if label.upper() != 'ROOT': @@ -358,7 +354,8 @@ cdef class ArcEager(TransitionSystem): if gold.cand_to_gold[i] is None: continue if state.safe_get(i).dep: - predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep])) + predicted.add((i, state.H(i), + self.strings[state.safe_get(i).dep])) else: predicted.add((i, state.H(i), 'ROOT')) id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] @@ -376,7 +373,8 @@ cdef class ArcEager(TransitionSystem): if not self.has_gold(gold): return None for i in range(gold.length): - if gold.heads[i] is None or gold.labels[i] is None: # Missing values + # Missing values + if gold.heads[i] is None or gold.labels[i] is None: gold.c.heads[i] = i gold.c.has_dep[i] = False else: @@ -442,14 +440,19 @@ cdef class ArcEager(TransitionSystem): cdef int initialize_state(self, StateC* st) nogil: for i in range(st.length): - st._sent[i].l_edge = i - st._sent[i].r_edge = i + if st._sent[i].dep == 0: + st._sent[i].l_edge = i + st._sent[i].r_edge = i + st._sent[i].head = 0 + st._sent[i].dep = 0 + st._sent[i].l_kids = 0 + st._sent[i].r_kids = 0 st.fast_forward() cdef int finalize_state(self, StateC* st) nogil: cdef int i for i in range(st.length): - if st._sent[i].head == 0 and st._sent[i].dep == 0: + if st._sent[i].head == 0: st._sent[i].dep = self.root_label def finalize_doc(self, doc): @@ -507,14 +510,15 @@ cdef class ArcEager(TransitionSystem): # Check projectivity --- leading cause if is_nonproj_tree(gold.heads): raise ValueError( - "Could not find a gold-standard action to supervise the dependency " - "parser.\n" - "Likely cause: the tree is non-projective (i.e. it has crossing " - "arcs -- see spacy/syntax/nonproj.pyx for definitions)\n" - "The ArcEager transition system only supports projective trees.\n" - "To learn non-projective representations, transform the data " - "before training and after parsing. Either pass make_projective=True " - "to the GoldParse class, or use PseudoProjectivity.preprocess_training_data") + "Could not find a gold-standard action to supervise the " + "dependency parser. Likely cause: the tree is " + "non-projective (i.e. it has crossing arcs -- see " + "spacy/syntax/nonproj.pyx for definitions). The ArcEager " + "transition system only supports projective trees. 
To " + "learn non-projective representations, transform the data " + "before training and after parsing. Either pass " + "make_projective=True to the GoldParse class, or use " + "spacy.syntax.nonproj.preprocess_training_data.") else: print(gold.orig_annot) print(gold.words) @@ -522,12 +526,10 @@ cdef class ArcEager(TransitionSystem): print(gold.labels) print(gold.sent_starts) raise ValueError( - "Could not find a gold-standard action to supervise the dependency " - "parser.\n" - "The GoldParse was projective.\n" - "The transition system has %d actions.\n" - "State at failure:\n" - "%s" % (self.n_moves, stcls.print_state(gold.words))) + "Could not find a gold-standard action to supervise the" + "dependency parser. The GoldParse was projective. The " + "transition system has %d actions. State at failure: %s" + % (self.n_moves, stcls.print_state(gold.words))) assert n_gold >= 1 def get_beam_annot(self, Beam beam): @@ -548,4 +550,3 @@ cdef class ArcEager(TransitionSystem): deps[j].setdefault(dep, 0.0) deps[j][dep] += prob return heads, deps - diff --git a/spacy/syntax/beam_parser.pxd b/spacy/syntax/beam_parser.pxd deleted file mode 100644 index 35a60cbf3..000000000 --- a/spacy/syntax/beam_parser.pxd +++ /dev/null @@ -1,10 +0,0 @@ -from .parser cimport Parser -from ..structs cimport TokenC -from thinc.typedefs cimport weight_t - - -cdef class BeamParser(Parser): - cdef public int beam_width - cdef public weight_t beam_density - - cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1 diff --git a/spacy/syntax/beam_parser.pyx b/spacy/syntax/beam_parser.pyx deleted file mode 100644 index 68e9f27af..000000000 --- a/spacy/syntax/beam_parser.pyx +++ /dev/null @@ -1,239 +0,0 @@ -""" -MALT-style dependency parser -""" -# cython: profile=True -# cython: experimental_cpp_class_def=True -# cython: cdivision=True -# cython: infer_types=True -# coding: utf-8 - -from __future__ import unicode_literals, print_function -cimport cython - -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memset, memcpy -from libc.stdlib cimport rand -from libc.math cimport log, exp, isnan, isinf -from cymem.cymem cimport Pool, Address -from murmurhash.mrmr cimport real_hash64 as hash64 -from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t -from thinc.linear.features cimport ConjunctionExtracter -from thinc.structs cimport FeatureC, ExampleC -from thinc.extra.search cimport Beam, MaxViolation -from thinc.extra.eg cimport Example -from thinc.extra.mb cimport Minibatch - -from ..structs cimport TokenC -from ..tokens.doc cimport Doc -from ..strings cimport StringStore -from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParse -from . 
import _parse_features -from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport fill_context -from .stateclass cimport StateClass -from .parser cimport Parser - - -DEBUG = False -def set_debug(val): - global DEBUG - DEBUG = val - - -def get_templates(name): - pf = _parse_features - if name == 'ner': - return pf.ner - elif name == 'debug': - return pf.unigrams - else: - return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ - pf.tree_shape + pf.trigrams) - - -cdef int BEAM_WIDTH = 16 -cdef weight_t BEAM_DENSITY = 0.001 - -cdef class BeamParser(Parser): - def __init__(self, *args, **kwargs): - self.beam_width = kwargs.get('beam_width', BEAM_WIDTH) - self.beam_density = kwargs.get('beam_density', BEAM_DENSITY) - Parser.__init__(self, *args, **kwargs) - - cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil: - with gil: - self._parseC(tokens, length, nr_feat, self.moves.n_moves) - - cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1: - cdef Beam beam = Beam(self.moves.n_moves, self.beam_width, min_density=self.beam_density) - # TODO: How do we handle new labels here? This increases nr_class - beam.initialize(self.moves.init_beam_state, length, tokens) - beam.check_done(_check_final_state, NULL) - if beam.is_done: - _cleanup(beam) - return 0 - while not beam.is_done: - self._advance_beam(beam, None, False) - state = beam.at(0) - self.moves.finalize_state(state.c) - for i in range(length): - tokens[i] = state.c._sent[i] - _cleanup(beam) - - def update(self, Doc tokens, GoldParse gold_parse, itn=0): - self.moves.preprocess_gold(gold_parse) - cdef Beam pred = Beam(self.moves.n_moves, self.beam_width) - pred.initialize(self.moves.init_beam_state, tokens.length, tokens.c) - pred.check_done(_check_final_state, NULL) - # Hack for NER - for i in range(pred.size): - stcls = pred.at(i) - self.moves.initialize_state(stcls.c) - - cdef Beam gold = Beam(self.moves.n_moves, self.beam_width, min_density=0.0) - gold.initialize(self.moves.init_beam_state, tokens.length, tokens.c) - gold.check_done(_check_final_state, NULL) - violn = MaxViolation() - while not pred.is_done and not gold.is_done: - # We search separately here, to allow for ambiguity in the gold parse. - self._advance_beam(pred, gold_parse, False) - self._advance_beam(gold, gold_parse, True) - violn.check_crf(pred, gold) - if pred.loss > 0 and pred.min_score > (gold.score + self.model.time): - break - else: - # The non-monotonic oracle makes it difficult to ensure final costs are - # correct. 
Therefore do final correction - for i in range(pred.size): - if self.moves.is_gold_parse(pred.at(i), gold_parse): - pred._states[i].loss = 0.0 - elif pred._states[i].loss == 0.0: - pred._states[i].loss = 1.0 - violn.check_crf(pred, gold) - if pred.size < 1: - raise Exception("No candidates", tokens.length) - if gold.size < 1: - raise Exception("No gold", tokens.length) - if pred.loss == 0: - self.model.update_from_histories(self.moves, tokens, [(0.0, [])]) - elif True: - #_check_train_integrity(pred, gold, gold_parse, self.moves) - histories = list(zip(violn.p_probs, violn.p_hist)) + \ - list(zip(violn.g_probs, violn.g_hist)) - self.model.update_from_histories(self.moves, tokens, histories, min_grad=0.001**(itn+1)) - else: - self.model.update_from_histories(self.moves, tokens, - [(1.0, violn.p_hist[0]), (-1.0, violn.g_hist[0])]) - _cleanup(pred) - _cleanup(gold) - return pred.loss - - def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): - cdef atom_t[CONTEXT_SIZE] context - cdef Pool mem = Pool() - features = mem.alloc(self.model.nr_feat, sizeof(FeatureC)) - if False: - mb = Minibatch(self.model.widths, beam.size) - for i in range(beam.size): - stcls = beam.at(i) - if stcls.c.is_final(): - nr_feat = 0 - else: - nr_feat = self.model.set_featuresC(context, features, stcls.c) - self.moves.set_valid(beam.is_valid[i], stcls.c) - mb.c.push_back(features, nr_feat, beam.costs[i], beam.is_valid[i], 0) - self.model(mb) - for i in range(beam.size): - memcpy(beam.scores[i], mb.c.scores(i), mb.c.nr_out() * sizeof(beam.scores[i][0])) - else: - for i in range(beam.size): - stcls = beam.at(i) - if not stcls.is_final(): - nr_feat = self.model.set_featuresC(context, features, stcls.c) - self.moves.set_valid(beam.is_valid[i], stcls.c) - self.model.set_scoresC(beam.scores[i], features, nr_feat) - if gold is not None: - n_gold = 0 - lines = [] - for i in range(beam.size): - stcls = beam.at(i) - if not stcls.c.is_final(): - self.moves.set_costs(beam.is_valid[i], beam.costs[i], stcls, gold) - if follow_gold: - for j in range(self.moves.n_moves): - if beam.costs[i][j] >= 1: - beam.is_valid[i][j] = 0 - lines.append((stcls.B(0), stcls.B(1), - stcls.B_(0).ent_iob, stcls.B_(1).ent_iob, - stcls.B_(1).sent_start, - j, - beam.is_valid[i][j], 'set invalid', - beam.costs[i][j], self.moves.c[j].move, self.moves.c[j].label)) - n_gold += 1 if beam.is_valid[i][j] else 0 - if follow_gold and n_gold == 0: - raise Exception("No gold") - if follow_gold: - beam.advance(_transition_state, NULL, self.moves.c) - else: - beam.advance(_transition_state, _hash_state, self.moves.c) - beam.check_done(_check_final_state, NULL) - - -# These are passed as callbacks to thinc.search.Beam -cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = _dest - src = _src - moves = _moves - dest.clone(src) - moves[clas].do(dest.c, moves[clas].label) - - -cdef int _check_final_state(void* _state, void* extra_args) except -1: - return (_state).is_final() - - -def _cleanup(Beam beam): - for i in range(beam.width): - Py_XDECREF(beam._states[i].content) - Py_XDECREF(beam._parents[i].content) - - -cdef hash_t _hash_state(void* _state, void* _) except 0: - state = _state - if state.c.is_final(): - return 1 - else: - return state.c.hash() - - -def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, TransitionSystem moves): - for i in range(pred.size): - if not pred._states[i].is_done or pred._states[i].loss == 0: - continue - state = pred.at(i) - if moves.is_gold_parse(state, 
gold_parse) == True: - for dep in gold_parse.orig_annot: - print(dep[1], dep[3], dep[4]) - print("Cost", pred._states[i].loss) - for j in range(gold_parse.length): - print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep]) - acts = [moves.c[clas].move for clas in pred.histories[i]] - labels = [moves.c[clas].label for clas in pred.histories[i]] - print([moves.move_name(move, label) for move, label in zip(acts, labels)]) - raise Exception("Predicted state is gold-standard") - for i in range(gold.size): - if not gold._states[i].is_done: - continue - state = gold.at(i) - if moves.is_gold(state, gold_parse) == False: - print("Truth") - for dep in gold_parse.orig_annot: - print(dep[1], dep[3], dep[4]) - print("Predicted good") - for j in range(gold_parse.length): - print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep]) - raise Exception("Gold parse is not gold-standard") - - diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx deleted file mode 100644 index 557616d18..000000000 --- a/spacy/syntax/iterators.pyx +++ /dev/null @@ -1,144 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX - - -def english_noun_chunks(obj): - """ - Detect base noun phrases from a dependency parse. - Works on both Doc and Span. - """ - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', - 'attr', 'ROOT'] - doc = obj.doc # Ensure works on both Doc and Span. - np_deps = [doc.vocab.strings.add(label) for label in labels] - conj = doc.vocab.strings.add('conj') - np_label = doc.vocab.strings.add('NP') - seen = set() - for i, word in enumerate(obj): - if word.pos not in (NOUN, PROPN, PRON): - continue - # Prevent nested chunks from being produced - if word.i in seen: - continue - if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.i+1)) - yield word.left_edge.i, word.i+1, np_label - elif word.dep == conj: - head = word.head - while head.dep == conj and head.head.i < head.i: - head = head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.i+1)) - yield word.left_edge.i, word.i+1, np_label - - -# this iterator extracts spans headed by NOUNs starting from the left-most -# syntactic dependent until the NOUN itself -# for close apposition and measurement construction, the span is sometimes -# extended to the right of the NOUN -# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not -# just "eine Tasse", same for "das Thema Familie" -def german_noun_chunks(obj): - labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app'] - doc = obj.doc # Ensure works on both Doc and Span. 
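Each chunker in this removed iterators module yields (start, end, label) token offsets rather than Span objects, as the English and German variants above show. A small sketch of how a caller would turn those triples into text; doc and chunker are stand-ins for any parsed document and any of the iterator functions:

```python
def print_noun_chunks(doc, chunker):
    # `chunker` is any of the *_noun_chunks functions above; it yields
    # (start, end) token offsets plus the hash of the span label.
    for start, end, np_label in chunker(doc):
        span = doc[start:end]
        print(span.text, doc.vocab.strings[np_label])   # e.g. "the lazy dog NP"
```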
- np_label = doc.vocab.strings.add('NP') - np_deps = set(doc.vocab.strings.add(label) for label in labels) - close_app = doc.vocab.strings.add('nk') - - rbracket = 0 - for i, word in enumerate(obj): - if i < rbracket: - continue - if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: - rbracket = word.i+1 - # try to extend the span to the right - # to capture close apposition/measurement constructions - for rdep in doc[word.i].rights: - if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app: - rbracket = rdep.i+1 - yield word.left_edge.i, rbracket, np_label - - -def es_noun_chunks(obj): - doc = obj.doc - np_label = doc.vocab.strings['NP'] - left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed'] - right_labels = ['flat', 'fixed', 'compound', 'neg'] - stop_labels = ['punct'] - np_left_deps = [doc.vocab.strings[label] for label in left_labels] - np_right_deps = [doc.vocab.strings[label] for label in right_labels] - stop_deps = [doc.vocab.strings[label] for label in stop_labels] - - def next_token(token): - try: - return token.nbor() - except: - return None - - def noun_bounds(root): - def is_verb_token(token): - return token.pos in [VERB, AUX] - - left_bound = root - for token in reversed(list(root.lefts)): - if token.dep in np_left_deps: - left_bound = token - right_bound = root - for token in root.rights: - if (token.dep in np_right_deps): - left, right = noun_bounds(token) - if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps, - doc[left_bound.i: right.i])): - break - else: - right_bound = right - return left_bound, right_bound - - token = doc[0] - while token and token.i < len(doc): - if token.pos in [PROPN, NOUN, PRON]: - left, right = noun_bounds(token) - yield left.i, right.i+1, np_label - token = right - token = next_token(token) - - -def french_noun_chunks(obj): - labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss'] - doc = obj.doc # Ensure works on both Doc and Span. 
- np_deps = [doc.vocab.strings[label] for label in labels] - conj = doc.vocab.strings.add('conj') - np_label = doc.vocab.strings.add('NP') - seen = set() - for i, word in enumerate(obj): - if word.pos not in (NOUN, PROPN, PRON): - continue - # Prevent nested chunks from being produced - if word.i in seen: - continue - if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1)) - yield word.left_edge.i, word.right_edge.i+1, np_label - elif word.dep == conj: - head = word.head - while head.dep == conj and head.head.i < head.i: - head = head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1)) - yield word.left_edge.i, word.right_edge.i+1, np_label - - -CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, - 'es': es_noun_chunks, 'fr': french_noun_chunks} diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 2f5cd4e48..e2e242aea 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -4,17 +4,12 @@ from __future__ import unicode_literals from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam from collections import OrderedDict -import numpy -from thinc.neural.ops import NumpyOps from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition from .transition_system cimport do_func_t -from ..structs cimport TokenC, Entity -from ..gold cimport GoldParseC -from ..gold cimport GoldParse -from ..attrs cimport ENT_TYPE, ENT_IOB +from ..gold cimport GoldParseC, GoldParse cdef enum: @@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem): @classmethod def get_actions(cls, **kwargs): - actions = kwargs.get('actions', - OrderedDict(( - (MISSING, ['']), - (BEGIN, []), - (IN, []), - (LAST, []), - (UNIT, []), - (OUT, ['']) - ))) + actions = kwargs.get('actions', OrderedDict(( + (MISSING, ['']), + (BEGIN, []), + (IN, []), + (LAST, []), + (UNIT, []), + (OUT, ['']) + ))) seen_entities = set() for entity_type in kwargs.get('entity_types', []): if entity_type in seen_entities: @@ -160,9 +154,8 @@ cdef class BiluoPushDown(TransitionSystem): cdef Transition lookup_transition(self, object name) except *: cdef attr_t label - if name == '-' or name == None: - move_str = 'M' - label = 0 + if name == '-' or name is None: + return Transition(clas=0, move=MISSING, label=0, score=0) elif name == '!O': return Transition(clas=0, move=ISNT, label=0, score=0) elif '-' in name: @@ -220,6 +213,29 @@ cdef class BiluoPushDown(TransitionSystem): raise Exception(move) return t + def add_action(self, int action, label_name): + cdef attr_t label_id + if not isinstance(label_name, (int, long)): + label_id = self.strings.add(label_name) + else: + label_id = label_name + if action == OUT and label_id != 0: + return + if action == MISSING or action == ISNT: + return + # Check we're not creating a move we already have, so that this is + # idempotent + for trans in self.c[:self.n_moves]: + if trans.move == action and trans.label == label_id: + return 0 + if self.n_moves >= self._size: + self._size *= 2 + self.c = self.mem.realloc(self.c, self._size * sizeof(self.c[0])) + self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) + assert self.c[self.n_moves].label == label_id + self.n_moves += 1 + return 1 + cdef int initialize_state(self, StateC* st) nogil: # This is 
especially necessary when we use limited training data. for i in range(st.length): @@ -306,8 +322,8 @@ cdef class In: return False elif preset_ent_iob == 3: return False - # TODO: Is this quite right? - # I think it's supposed to be ensuring the gazetteer matches are maintained + # TODO: Is this quite right? I think it's supposed to be ensuring the + # gazetteer matches are maintained elif st.B_(1).ent_iob != preset_ent_iob: return False # Don't allow entities to extend across sentence boundaries @@ -332,10 +348,12 @@ cdef class In: if g_act == MISSING: return 0 elif g_act == BEGIN: - # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk) + # I, Gold B --> True + # (P of bad open entity sunk, R of this entity sunk) return 0 elif g_act == IN: - # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk) + # I, Gold I --> True + # (label forced by prev, if mismatch, P and R both sunk) return 0 elif g_act == LAST: # I, Gold L --> True iff this entity sunk and next tag == O @@ -483,11 +501,3 @@ cdef class Out: return 1 else: return 1 - - -class OracleError(Exception): - pass - - -class UnknownMove(Exception): - pass diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 524718965..56615c6f1 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/syntax/nn_parser.pxd @@ -13,9 +13,9 @@ cdef class Parser: cdef public object model cdef readonly TransitionSystem moves cdef readonly object cfg + cdef public object _multitasks - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil - - #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil + cdef void _parseC(self, StateC* state, + const float* feat_weights, const float* bias, + const float* hW, const float* hb, + int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 552ea4f8f..e480bd1dc 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -1,85 +1,59 @@ # cython: infer_types=True -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # coding: utf-8 from __future__ import unicode_literals, print_function -from collections import Counter, OrderedDict +from collections import OrderedDict import ujson -import contextlib - -from libc.math cimport exp -cimport cython +import json +import numpy cimport cython.parallel import cytoolz -import dill - import numpy.random cimport numpy as np - +from cpython.ref cimport PyObject, Py_XDECREF +from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno +from libc.math cimport exp from libcpp.vector cimport vector -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -from cpython.exc cimport PyErr_CheckSignals -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memset, memcpy -from libc.stdlib cimport malloc, calloc, free -from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.linalg cimport VecVec -from thinc.structs cimport SparseArrayC, FeatureC, ExampleC -from thinc.extra.eg cimport Example +from libc.string cimport memset +from libc.stdlib cimport calloc, free +from cymem.cymem cimport Pool +from thinc.typedefs cimport weight_t, class_t, hash_t from thinc.extra.search cimport Beam - -from cymem.cymem cimport Pool, Address -from murmurhash.mrmr cimport hash64 -from preshed.maps cimport MapStruct -from preshed.maps cimport map_get - -from thinc.api 
import layerize, chain, noop, clone, with_flatten -from thinc.neural import Model, Affine, ReLu, Maxout -from thinc.neural._classes.batchnorm import BatchNorm as BN -from thinc.neural._classes.selu import SELU -from thinc.neural._classes.layernorm import LayerNorm -from thinc.neural.ops import NumpyOps, CupyOps +from thinc.api import chain, clone +from thinc.v2v import Model, Maxout, Affine +from thinc.misc import LayerNorm +from thinc.neural.ops import CupyOps from thinc.neural.util import get_array_module +from thinc.linalg cimport Vec, VecVec +from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten +from .._ml import link_vectors_to_models +from ..compat import json_dumps, copy_array +from ..tokens.doc cimport Doc +from ..gold cimport GoldParse from .. import util -from ..util import get_async, get_cuda_stream -from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts -from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune -from .._ml import Residual, drop_layer -from ..compat import json_dumps - -from . import _parse_features -from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport fill_context from .stateclass cimport StateClass from ._state cimport StateC -from . import nonproj -from .transition_system import OracleError -from .transition_system cimport TransitionSystem, Transition -from ..structs cimport TokenC -from ..tokens.doc cimport Doc -from ..strings cimport StringStore -from ..gold cimport GoldParse -from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG -from . import _beam_utils +from .transition_system cimport Transition +from . import _beam_utils, nonproj -USE_FINE_TUNE = True def get_templates(*args, **kwargs): return [] -USE_FTRL = True + DEBUG = False + + def set_debug(val): global DEBUG DEBUG = val cdef class precompute_hiddens: - '''Allow a model to be "primed" by pre-computing input features in bulk. + """Allow a model to be "primed" by pre-computing input features in bulk. This is used for the parser, where we want to take a batch of documents, and compute vectors for each (token, position) pair. These vectors can then @@ -94,16 +68,18 @@ cdef class precompute_hiddens: so we can save the factor k. This also gives a nice CPU/GPU division: we can do all our hard maths up front, packed into large multiplications, and do the hard-to-program parsing on the CPU. 
- ''' + """ cdef int nF, nO, nP cdef bint _is_synchronized cdef public object ops cdef np.ndarray _features cdef np.ndarray _cached + cdef np.ndarray bias cdef object _cuda_stream cdef object _bp_hiddens - def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.): + def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, + drop=0.): gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop) cdef np.ndarray cached if not isinstance(gpu_cached, numpy.ndarray): @@ -114,17 +90,17 @@ cdef class precompute_hiddens: else: cached = gpu_cached self.nF = cached.shape[1] - self.nO = cached.shape[2] self.nP = getattr(lower_model, 'nP', 1) + self.nO = cached.shape[2] self.ops = lower_model.ops + self.bias = lower_model.b self._is_synchronized = False self._cuda_stream = cuda_stream self._cached = cached self._bp_hiddens = bp_features cdef const float* get_feat_weights(self) except NULL: - if not self._is_synchronized \ - and self._cuda_stream is not None: + if not self._is_synchronized and self._cuda_stream is not None: self._cuda_stream.synchronize() self._is_synchronized = True return self._cached.data @@ -133,7 +109,8 @@ cdef class precompute_hiddens: return self.begin_update(X)[0] def begin_update(self, token_ids, drop=0.): - cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f') + cdef np.ndarray state_vector = numpy.zeros( + (token_ids.shape[0], self.nO, self.nP), dtype='f') # This is tricky, but (assuming GPU available); # - Input to forward on CPU # - Output from forward on CPU @@ -146,13 +123,13 @@ cdef class precompute_hiddens: sum_state_features(state_vector.data, feat_weights, &ids[0,0], token_ids.shape[0], self.nF, self.nO*self.nP) + state_vector += self.bias state_vector, bp_nonlinearity = self._nonlinearity(state_vector) def backward(d_state_vector, sgd=None): - if bp_nonlinearity is not None: - d_state_vector = bp_nonlinearity(d_state_vector, sgd) + d_state_vector = bp_nonlinearity(d_state_vector, sgd) # This will usually be on GPU - if isinstance(d_state_vector, numpy.ndarray): + if not isinstance(d_state_vector, self.ops.xp.ndarray): d_state_vector = self.ops.xp.array(d_state_vector) d_tokens = bp_hiddens((d_state_vector, token_ids), sgd) return d_tokens @@ -160,26 +137,34 @@ cdef class precompute_hiddens: def _nonlinearity(self, state_vector): if self.nP == 1: - return state_vector, None - state_vector = state_vector.reshape( - (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP)) - best, which = self.ops.maxout(state_vector) - def backprop(d_best, sgd=None): - return self.ops.backprop_maxout(d_best, which, self.nP) - return best, backprop + state_vector = state_vector.reshape(state_vector.shape[:-1]) + mask = state_vector >= 0. 
+ state_vector *= mask + else: + state_vector, mask = self.ops.maxout(state_vector) + def backprop_nonlinearity(d_best, sgd=None): + if self.nP == 1: + d_best *= mask + d_best = d_best.reshape((d_best.shape + (1,))) + return d_best + else: + return self.ops.backprop_maxout(d_best, mask, self.nP) + return state_vector, backprop_nonlinearity cdef void sum_state_features(float* output, const float* cached, const int* token_ids, int B, int F, int O) nogil: cdef int idx, b, f, i cdef const float* feature + padding = cached - (F * O) for b in range(B): for f in range(F): if token_ids[f] < 0: - continue - idx = token_ids[f] * F * O + f*O - feature = &cached[idx] + feature = &padding[f*O] + else: + idx = token_ids[f] * F * O + f*O + feature = &cached[idx] for i in range(O): output[i] += feature[i] output += O @@ -238,65 +223,64 @@ cdef class Parser: Base class of the DependencyParser and EntityRecognizer. """ @classmethod - def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): - depth = util.env_opt('parser_hidden_depth', depth) - token_vector_width = util.env_opt('token_vector_width', token_vector_width) - hidden_width = util.env_opt('hidden_width', hidden_width) - parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) - embed_size = util.env_opt('embed_size', 4000) - tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, - preprocess=doc2feats())) - if parser_maxout_pieces == 1: - lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, - nF=cls.nr_feature, - nI=token_vector_width) - else: - lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class, - nF=cls.nr_feature, - nP=parser_maxout_pieces, - nI=token_vector_width) + def Model(cls, nr_class, **cfg): + depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) + if depth != 1: + raise ValueError("Currently parser depth is hard-coded to 1.") + parser_maxout_pieces = util.env_opt('parser_maxout_pieces', + cfg.get('maxout_pieces', 2)) + token_vector_width = util.env_opt('token_vector_width', + cfg.get('token_vector_width', 128)) + hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200)) + embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) + hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) + hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) + if hist_size != 0: + raise ValueError("Currently history size is hard-coded to 0") + if hist_width != 0: + raise ValueError("Currently history width is hard-coded to 0") + tok2vec = Tok2Vec(token_vector_width, embed_size, + pretrained_dims=cfg.get('pretrained_dims', 0)) + tok2vec = chain(tok2vec, flatten) + lower = PrecomputableAffine(hidden_width, + nF=cls.nr_feature, nI=token_vector_width, + nP=parser_maxout_pieces) + lower.nP = parser_maxout_pieces with Model.use_device('cpu'): - if depth == 0: - upper = chain() - upper.is_noop = True - else: - upper = chain( - clone(Maxout(hidden_width), (depth-1)), - zero_init(Affine(nr_class, drop_factor=0.0)) - ) - upper.is_noop = False + upper = chain( + clone(LayerNorm(Maxout(hidden_width, hidden_width)), depth-1), + zero_init(Affine(nr_class, hidden_width, drop_factor=0.0)) + ) + # TODO: This is an unfortunate hack atm! # Used to set input dimensions in network. 
lower.begin_training(lower.ops.allocate((500, token_vector_width))) - upper.begin_training(upper.ops.allocate((500, hidden_width))) cfg = { 'nr_class': nr_class, - 'depth': depth, + 'hidden_depth': depth, 'token_vector_width': token_vector_width, 'hidden_width': hidden_width, - 'maxout_pieces': parser_maxout_pieces + 'maxout_pieces': parser_maxout_pieces, + 'hist_size': hist_size, + 'hist_width': hist_width } - return (tensors, lower, upper), cfg + return (tok2vec, lower, upper), cfg def __init__(self, Vocab vocab, moves=True, model=True, **cfg): - """ - Create a Parser. + """Create a Parser. - Arguments: - vocab (Vocab): - The vocabulary object. Must be shared with documents to be processed. - The value is set to the .vocab attribute. - moves (TransitionSystem): - Defines how the parse-state is created, updated and evaluated. - The value is set to the .moves attribute unless True (default), - in which case a new instance is created with Parser.Moves(). - model (object): - Defines how the parse-state is created, updated and evaluated. - The value is set to the .model attribute unless True (default), - in which case a new instance is created with Parser.Model(). - **cfg: - Arbitrary configuration parameters. Set to the .cfg attribute + vocab (Vocab): The vocabulary object. Must be shared with documents + to be processed. The value is set to the `.vocab` attribute. + moves (TransitionSystem): Defines how the parse-state is created, + updated and evaluated. The value is set to the .moves attribute + unless True (default), in which case a new instance is created with + `Parser.Moves()`. + model (object): Defines how the parse-state is created, updated and + evaluated. The value is set to the .model attribute unless True + (default), in which case a new instance is created with + `Parser.Model()`. + **cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute """ self.vocab = vocab if moves is True: @@ -307,24 +291,25 @@ cdef class Parser: cfg['beam_width'] = util.env_opt('beam_width', 1) if 'beam_density' not in cfg: cfg['beam_density'] = util.env_opt('beam_density', 0.0) + if 'pretrained_dims' not in cfg: + cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] + cfg.setdefault('cnn_maxout_pieces', 3) self.cfg = cfg if 'actions' in self.cfg: for action, labels in self.cfg.get('actions', {}).items(): for label in labels: self.moves.add_action(action, label) self.model = model + self._multitasks = [] def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) def __call__(self, Doc doc, beam_width=None, beam_density=None): - """ - Apply the parser or entity recognizer, setting the annotations onto the Doc object. + """Apply the parser or entity recognizer, setting the annotations onto + the `Doc` object. - Arguments: - doc (Doc): The document to be processed. - Returns: - None + doc (Doc): The document to be processed. 
""" if beam_width is None: beam_width = self.cfg.get('beam_width', 1) @@ -332,11 +317,11 @@ cdef class Parser: beam_density = self.cfg.get('beam_density', 0.0) cdef Beam beam if beam_width == 1: - states = self.parse_batch([doc], [doc.tensor]) + states = self.parse_batch([doc]) self.set_annotations([doc], states) return doc else: - beam = self.beam_parse([doc], [doc.tensor], + beam = self.beam_parse([doc], beam_width=beam_width, beam_density=beam_density)[0] output = self.moves.get_beam_annot(beam) state = beam.at(0) @@ -344,18 +329,15 @@ cdef class Parser: _cleanup(beam) return output - def pipe(self, docs, int batch_size=1000, int n_threads=2, + def pipe(self, docs, int batch_size=256, int n_threads=2, beam_width=None, beam_density=None): - """ - Process a stream of documents. + """Process a stream of documents. - Arguments: - stream: The sequence of documents to process. - batch_size (int): - The number of documents to accumulate into a working set. - n_threads (int): - The number of threads with which to work on the buffer in parallel. - Yields (Doc): Documents, in order. + stream: The sequence of documents to process. + batch_size (int): Number of documents to accumulate into a working set. + n_threads (int): The number of threads with which to work on the buffer + in parallel. + YIELDS (Doc): Documents, in order. """ if beam_width is None: beam_width = self.cfg.get('beam_width', 1) @@ -363,100 +345,124 @@ cdef class Parser: beam_density = self.cfg.get('beam_density', 0.0) cdef Doc doc cdef Beam beam - for docs in cytoolz.partition_all(batch_size, docs): - docs = list(docs) - tokvecs = [doc.tensor for doc in docs] - if beam_width == 1: - parse_states = self.parse_batch(docs, tokvecs) - else: - beams = self.beam_parse(docs, tokvecs, - beam_width=beam_width, beam_density=beam_density) - parse_states = [] - for beam in beams: - parse_states.append(beam.at(0)) - self.set_annotations(docs, parse_states) - yield from docs + for batch in cytoolz.partition_all(batch_size, docs): + batch = list(batch) + by_length = sorted(list(batch), key=lambda doc: len(doc)) + for subbatch in cytoolz.partition_all(8, by_length): + subbatch = list(subbatch) + if beam_width == 1: + parse_states = self.parse_batch(subbatch) + beams = [] + else: + beams = self.beam_parse(subbatch, beam_width=beam_width, + beam_density=beam_density) + parse_states = [] + for beam in beams: + parse_states.append(beam.at(0)) + self.set_annotations(subbatch, parse_states) + yield from batch - def parse_batch(self, docs, tokvecses): + def parse_batch(self, docs): cdef: precompute_hiddens state2vec - StateClass state + StateClass stcls Pool mem const float* feat_weights StateC* st - vector[StateC*] next_step, this_step - int nr_class, nr_feat, nr_piece, nr_dim, nr_state + vector[StateC*] states + int guess, nr_class, nr_feat, nr_piece, nr_dim, nr_state, nr_step + int j if isinstance(docs, Doc): docs = [docs] - if isinstance(tokvecses, np.ndarray): - tokvecses = [tokvecses] - - tokvecs = self.model[0].ops.flatten(tokvecses) - if USE_FINE_TUNE: - tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) + cuda_stream = util.get_cuda_stream() + (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model( + docs, cuda_stream, 0.0) nr_state = len(docs) nr_class = self.moves.n_moves nr_dim = tokvecs.shape[1] nr_feat = self.nr_feature - - cuda_stream = get_cuda_stream() - state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs, - cuda_stream, 0.0) nr_piece = state2vec.nP - states = self.moves.init_batch(docs) - for 
state in states: - if not state.c.is_final(): - next_step.push_back(state.c) + state_objs = self.moves.init_batch(docs) + for stcls in state_objs: + if not stcls.c.is_final(): + states.push_back(stcls.c) feat_weights = state2vec.get_feat_weights() cdef int i - cdef np.ndarray token_ids = numpy.zeros((nr_state, nr_feat), dtype='i') - cdef np.ndarray is_valid = numpy.zeros((nr_state, nr_class), dtype='i') - cdef np.ndarray scores - c_token_ids = token_ids.data - c_is_valid = is_valid.data - cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) - while not next_step.empty(): - if not has_hidden: - for i in cython.parallel.prange( - next_step.size(), num_threads=6, nogil=True): - self._parse_step(next_step[i], - feat_weights, nr_class, nr_feat, nr_piece) - else: - for i in range(next_step.size()): - st = next_step[i] - st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) - self.moves.set_valid(&c_is_valid[i*nr_class], st) - vectors = state2vec(token_ids[:next_step.size()]) - scores = vec2scores(vectors) - c_scores = scores.data - for i in range(next_step.size()): - st = next_step[i] - guess = arg_max_if_valid( - &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) - action = self.moves.c[guess] - action.do(st, action.label) - this_step, next_step = next_step, this_step - next_step.clear() - for st in this_step: - if not st.is_final(): - next_step.push_back(st) - return states + cdef np.ndarray hidden_weights = numpy.ascontiguousarray( + vec2scores._layers[-1].W.T) + cdef np.ndarray hidden_bias = vec2scores._layers[-1].b - def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001): + hW = hidden_weights.data + hb = hidden_bias.data + bias = state2vec.bias.data + cdef int nr_hidden = hidden_weights.shape[0] + cdef int nr_task = states.size() + with nogil: + for i in range(nr_task): + self._parseC(states[i], + feat_weights, bias, hW, hb, + nr_class, nr_hidden, nr_feat, nr_piece) + PyErr_CheckSignals() + return state_objs + + cdef void _parseC(self, StateC* state, + const float* feat_weights, const float* bias, + const float* hW, const float* hb, + int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil: + token_ids = calloc(nr_feat, sizeof(int)) + is_valid = calloc(nr_class, sizeof(int)) + vectors = calloc(nr_hidden * nr_piece, sizeof(float)) + scores = calloc(nr_class, sizeof(float)) + if not (token_ids and is_valid and vectors and scores): + with gil: + PyErr_SetFromErrno(MemoryError) + PyErr_CheckSignals() + cdef float feature + while not state.is_final(): + state.set_context_tokens(token_ids, nr_feat) + memset(vectors, 0, nr_hidden * nr_piece * sizeof(float)) + memset(scores, 0, nr_class * sizeof(float)) + sum_state_features(vectors, + feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece) + for i in range(nr_hidden * nr_piece): + vectors[i] += bias[i] + V = vectors + W = hW + for i in range(nr_hidden): + if nr_piece == 1: + feature = V[0] if V[0] >= 0. else 0. 
+ elif nr_piece == 2: + feature = V[0] if V[0] >= V[1] else V[1] + else: + feature = Vec.max(V, nr_piece) + for j in range(nr_class): + scores[j] += feature * W[j] + W += nr_class + V += nr_piece + for i in range(nr_class): + scores[i] += hb[i] + self.moves.set_valid(is_valid, state) + guess = arg_max_if_valid(scores, is_valid, nr_class) + action = self.moves.c[guess] + action.do(state, action.label) + state.push_hist(guess) + free(token_ids) + free(is_valid) + free(vectors) + free(scores) + + def beam_parse(self, docs, int beam_width=3, float beam_density=0.001): cdef Beam beam cdef np.ndarray scores cdef Doc doc cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output - tokvecs = self.model[0].ops.flatten(tokvecses) - if USE_FINE_TUNE: - tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) - cuda_stream = get_cuda_stream() - state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, - cuda_stream, 0.0) + cuda_stream = util.get_cuda_stream() + (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model( + docs, cuda_stream, 0.0) beams = [] cdef int offset = 0 cdef int j = 0 @@ -479,7 +485,12 @@ cdef class Parser: states.append(stcls) token_ids = self.get_token_ids(states) vectors = state2vec(token_ids) - scores = vec2scores(vectors) + if self.cfg.get('hist_size', 0): + hists = numpy.asarray([st.history[:self.cfg['hist_size']] + for st in states], dtype='i') + scores = vec2scores((vectors, hists)) + else: + scores = vec2scores(vectors) j = 0 c_scores = scores.data for i in range(beam.size): @@ -494,51 +505,22 @@ cdef class Parser: beams.append(beam) return beams - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil: - '''This only works with no hidden layers -- fast but inaccurate''' - #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True): - # self._parse_step(next_step[i], feat_weights, nr_class, nr_feat) - token_ids = calloc(nr_feat, sizeof(int)) - scores = calloc(nr_class * nr_piece, sizeof(float)) - is_valid = calloc(nr_class, sizeof(int)) - - state.set_context_tokens(token_ids, nr_feat) - sum_state_features(scores, - feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece) - self.moves.set_valid(is_valid, state) - guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece) - action = self.moves.c[guess] - action.do(state, action.label) - - free(is_valid) - free(scores) - free(token_ids) - - def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None): if not any(self.moves.has_gold(gold) for gold in golds): return None if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: - return self.update_beam(docs_tokvecs, golds, + return self.update_beam(docs, golds, self.cfg['beam_width'], self.cfg['beam_density'], drop=drop, sgd=sgd, losses=losses) if losses is not None and self.name not in losses: losses[self.name] = 0. 
- docs, tokvec_lists = docs_tokvecs - tokvecs = self.model[0].ops.flatten(tokvec_lists) if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] - if USE_FINE_TUNE: - tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) - tokvecs = self.model[0].ops.flatten(tokvecs) - - cuda_stream = get_cuda_stream() - + cuda_stream = util.get_cuda_stream() states, golds, max_steps = self._init_gold_batch(docs, golds) - state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, - 0.0) + (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, + drop) todo = [(s, g) for (s, g) in zip(states, golds) if not s.is_final() and g is not None] if not todo: @@ -550,13 +532,16 @@ cdef class Parser: n_steps = 0 while todo: states, golds = zip(*todo) - token_ids = self.get_token_ids(states) vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0) if drop != 0: mask = vec2scores.ops.get_dropout_mask(vector.shape, drop) vector *= mask - scores, bp_scores = vec2scores.begin_update(vector, drop=drop) + hists = numpy.asarray([st.history for st in states], dtype='i') + if self.cfg.get('hist_size', 0): + scores, bp_scores = vec2scores.begin_update((vector, hists), drop=drop) + else: + scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) d_scores /= len(docs) @@ -568,27 +553,24 @@ cdef class Parser: and not isinstance(token_ids, state2vec.ops.xp.ndarray): # Move token_ids and d_vector to GPU, asynchronously backprops.append(( - get_async(cuda_stream, token_ids), - get_async(cuda_stream, d_vector), + util.get_async(cuda_stream, token_ids), + util.get_async(cuda_stream, d_vector), bp_vector )) else: backprops.append((token_ids, d_vector, bp_vector)) self.transition_batch(states, scores) - todo = [st for st in todo if not st[0].is_final()] + todo = [(st, gold) for (st, gold) in todo + if not st.is_final()] if losses is not None: losses[self.name] += (d_scores**2).sum() n_steps += 1 if n_steps >= max_steps: break self._make_updates(d_tokvecs, - backprops, sgd, cuda_stream) - d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) - if USE_FINE_TUNE: - d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) - return d_tokvecs + bp_tokvecs, backprops, sgd, cuda_stream) - def update_beam(self, docs_tokvecs, golds, width=None, density=None, + def update_beam(self, docs, golds, width=None, density=None, drop=0., sgd=None, losses=None): if not any(self.moves.has_gold(gold) for gold in golds): return None @@ -600,26 +582,18 @@ cdef class Parser: density = self.cfg.get('beam_density', 0.0) if losses is not None and self.name not in losses: losses[self.name] = 0. 
- docs, tokvecs = docs_tokvecs lengths = [len(d) for d in docs] assert min(lengths) >= 1 - tokvecs = self.model[0].ops.flatten(tokvecs) - if USE_FINE_TUNE: - tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) - tokvecs = self.model[0].ops.flatten(tokvecs) - states = self.moves.init_batch(docs) for gold in golds: self.moves.preprocess_gold(gold) - - cuda_stream = get_cuda_stream() - state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) - - states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, - states, tokvecs, golds, - state2vec, vec2scores, - width, density, - sgd=sgd, drop=drop, losses=losses) + cuda_stream = util.get_cuda_stream() + (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model( + docs, cuda_stream, drop) + states_d_scores, backprops = _beam_utils.update_beam( + self.moves, self.nr_feature, 500, states, golds, state2vec, + vec2scores, width, density, self.cfg.get('hist_size', 0), + drop=drop, losses=losses) backprop_lower = [] cdef float batch_size = len(docs) for i, d_scores in enumerate(states_d_scores): @@ -631,17 +605,14 @@ cdef class Parser: if isinstance(self.model[0].ops, CupyOps) \ and not isinstance(ids, state2vec.ops.xp.ndarray): backprop_lower.append(( - get_async(cuda_stream, ids), - get_async(cuda_stream, d_vector), + util.get_async(cuda_stream, ids), + util.get_async(cuda_stream, d_vector), bp_vectors)) else: backprop_lower.append((ids, d_vector, bp_vectors)) d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) - self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) - d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) - if USE_FINE_TUNE: - d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) - return d_tokvecs + self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, + cuda_stream) def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long @@ -668,6 +639,7 @@ cdef class Parser: while state.B(0) < start and not state.is_final(): action = self.moves.c[oracle_actions.pop(0)] action.do(state.c, action.label) + state.c.push_hist(action.clas) n_moves += 1 has_gold = self.moves.has_gold(gold, start=start, end=start+max_length) @@ -679,17 +651,19 @@ cdef class Parser: max_moves = max(max_moves, len(oracle_actions)) return states, golds, max_moves - def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): + def _make_updates(self, d_tokvecs, bp_tokvecs, backprops, sgd, cuda_stream=None): # Tells CUDA to block, so our async copies complete. 
if cuda_stream is not None: cuda_stream.synchronize() xp = get_array_module(d_tokvecs) for ids, d_vector, bp_vector in backprops: d_state_features = bp_vector(d_vector, sgd=sgd) - mask = ids >= 0 - d_state_features *= mask.reshape(ids.shape + (1,)) - self.model[0].ops.scatter_add(d_tokvecs, ids * mask, + ids = ids.flatten() + d_state_features = d_state_features.reshape( + (ids.size, d_state_features.shape[2])) + self.model[0].ops.scatter_add(d_tokvecs, ids, d_state_features) + bp_tokvecs(d_tokvecs, sgd=sgd) @property def move_names(self): @@ -699,13 +673,14 @@ cdef class Parser: names.append(name) return names - def get_batch_model(self, batch_size, tokvecs, stream, dropout): - _, lower, upper = self.model - state2vec = precompute_hiddens(batch_size, tokvecs, - lower, stream, drop=dropout) - return state2vec, upper + def get_batch_model(self, docs, stream, dropout): + tok2vec, lower, upper = self.model + tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout) + state2vec = precompute_hiddens(len(docs), tokvecs, + lower, stream, drop=0.0) + return (tokvecs, bp_tokvecs), state2vec, upper - nr_feature = 8 + nr_feature = 13 def get_token_ids(self, states): cdef StateClass state @@ -729,6 +704,7 @@ cdef class Parser: action = self.moves.c[guess] action.do(state.c, action.label) c_scores += scores.shape[1] + state.c.push_hist(guess) def get_batch_loss(self, states, golds, float[:, ::1] scores): cdef StateClass state @@ -757,27 +733,59 @@ cdef class Parser: for i in range(doc.length): doc.c[i] = state.c._sent[i] self.moves.finalize_doc(doc) + for hook in self.postprocesses: + for doc in docs: + hook(doc) + + @property + def postprocesses(self): + # Available for subclasses, e.g. to deprojectivize + return [] def add_label(self, label): + resized = False for action in self.moves.action_types: added = self.moves.add_action(action, label) if added: # Important that the labels be stored as a list! We need the # order, or the model goes out of synch self.cfg.setdefault('extra_labels', []).append(label) + resized = True + if self.model not in (True, False, None) and resized: + # Weights are stored in (nr_out, nr_in) format, so we're basically + # just adding rows here. + smaller = self.model[-1]._layers[-1] + larger = Affine(self.moves.n_moves, smaller.nI) + copy_array(larger.W[:smaller.nO], smaller.W) + copy_array(larger.b[:smaller.nO], smaller.b) + self.model[-1]._layers[-1] = larger - def begin_training(self, gold_tuples, **cfg): + def begin_training(self, gold_tuples, pipeline=None, **cfg): if 'model' in cfg: self.model = cfg['model'] - gold_tuples = nonproj.preprocess_training_data(gold_tuples) + gold_tuples = nonproj.preprocess_training_data(gold_tuples, + label_freq_cutoff=100) actions = self.moves.get_actions(gold_parses=gold_tuples) for action, labels in actions.items(): for label in labels: self.moves.add_action(action, label) if self.model is True: + cfg['pretrained_dims'] = self.vocab.vectors_length self.model, cfg = self.Model(self.moves.n_moves, **cfg) + self.init_multitask_objectives(gold_tuples, pipeline, **cfg) + link_vectors_to_models(self.vocab) self.cfg.update(cfg) + def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): + '''Setup models for secondary objectives, to benefit from multi-task + learning. This method is intended to be overridden by subclasses. + + For instance, the dependency parser can benefit from sharing + an input representation with a label prediction model. These auxiliary + models are discarded after training. 
+ ''' + pass + def preprocess_gold(self, docs_golds): for doc, gold in docs_golds: yield doc, gold @@ -813,6 +821,7 @@ cdef class Parser: if 'model' not in exclude: path = util.ensure_path(path) if self.model is True: + self.cfg['pretrained_dims'] = self.vocab.vectors_length self.model, cfg = self.Model(**self.cfg) else: cfg = {} @@ -835,7 +844,7 @@ cdef class Parser: ('upper_model', lambda: self.model[2].to_bytes()), ('vocab', lambda: self.vocab.to_bytes()), ('moves', lambda: self.moves.to_bytes(strings=False)), - ('cfg', lambda: ujson.dumps(self.cfg)) + ('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True)) )) if 'model' in exclude: exclude['tok2vec_model'] = True @@ -848,7 +857,7 @@ cdef class Parser: deserializers = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)), - ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('cfg', lambda b: self.cfg.update(json.loads(b))), ('tok2vec_model', lambda b: None), ('lower_model', lambda b: None), ('upper_model', lambda b: None) @@ -856,9 +865,11 @@ cdef class Parser: msg = util.from_bytes(bytes_data, deserializers, exclude) if 'model' not in exclude: if self.model is True: - self.model, cfg = self.Model(self.moves.n_moves) + self.model, cfg = self.Model(**self.cfg) + cfg['pretrained_dims'] = self.vocab.vectors_length else: cfg = {} + cfg['pretrained_dims'] = self.vocab.vectors_length if 'tok2vec_model' in msg: self.model[0].from_bytes(msg['tok2vec_model']) if 'lower_model' in msg: @@ -904,31 +915,6 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no return best -cdef int arg_maxout_if_valid(const weight_t* scores, const int* is_valid, - int n, int nP) nogil: - cdef int best = -1 - cdef float best_score = 0 - for i in range(n): - if is_valid[i] >= 1: - for j in range(nP): - if best == -1 or scores[i*nP+j] > best_score: - best = i - best_score = scores[i*nP+j] - return best - - -cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, - int nr_class) except -1: - cdef weight_t score = 0 - cdef int mode = -1 - cdef int i - for i in range(nr_class): - if actions[i].move == move and (mode == -1 or scores[i] >= score): - mode = i - score = scores[i] - return mode - - # These are passed as callbacks to thinc.search.Beam cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: dest = _dest @@ -936,6 +922,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) moves = _moves dest.clone(src) moves[clas].do(dest.c, moves[clas].label) + dest.c.push_hist(clas) cdef int _check_final_state(void* _state, void* extra_args) except -1: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 499effcda..404f1bc90 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,39 +1,37 @@ # coding: utf-8 -""" -Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 +"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. 
""" from __future__ import unicode_literals + from copy import copy -from ..tokens.doc cimport Doc -from ..attrs import DEP, HEAD DELIMITER = '||' def ancestors(tokenid, heads): - # returns all words going from the word up the path to the root - # the path to root cannot be longer than the number of words in the sentence - # this function ends after at most len(heads) steps - # because it would otherwise loop indefinitely on cycles + # Returns all words going from the word up the path to the root. The path + # to root cannot be longer than the number of words in the sentence. This + # function ends after at most len(heads) steps, because it would otherwise + # loop indefinitely on cycles. head = tokenid cnt = 0 while heads[head] != head and cnt < len(heads): head = heads[head] cnt += 1 yield head - if head == None: + if head is None: break def contains_cycle(heads): - # in an acyclic tree, the path from each word following - # the head relation upwards always ends at the root node + # in an acyclic tree, the path from each word following the head relation + # upwards always ends at the root node for tokenid in range(len(heads)): seen = set([tokenid]) - for ancestor in ancestors(tokenid,heads): + for ancestor in ancestors(tokenid, heads): if ancestor in seen: return seen seen.add(ancestor) @@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads): # if there is a token k, h < k < d such that h is not # an ancestor of k. Same for h -> d, h > d head = heads[tokenid] - if head == tokenid: # root arcs cannot be non-projective + if head == tokenid: # root arcs cannot be non-projective return False - elif head == None: # unattached tokens cannot be non-projective + elif head is None: # unattached tokens cannot be non-projective return False start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head) - for k in range(start,end): - for ancestor in ancestors(k,heads): - if ancestor == None: # for unattached tokens/subtrees + for k in range(start, end): + for ancestor in ancestors(k, heads): + if ancestor is None: # for unattached tokens/subtrees break - elif ancestor == head: # normal case: k dominated by h + elif ancestor == head: # normal case: k dominated by h break - else: # head not in ancestors: d -> h is non-projective + else: # head not in ancestors: d -> h is non-projective return True return False def is_nonproj_tree(heads): # a tree is non-projective if at least one arc is non-projective - return any( is_nonproj_arc(word,heads) for word in range(len(heads)) ) + return any(is_nonproj_arc(word, heads) for word in range(len(heads))) def decompose(label): @@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30): for raw_text, sents in gold_tuples: prepro_sents = [] for (ids, words, tags, heads, labels, iob), ctnts in sents: - proj_heads,deco_labels = projectivize(heads,labels) + proj_heads, deco_labels = projectivize(heads, labels) # set the label to ROOT for each root dependent - deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] + deco_labels = ['ROOT' if head == i else deco_labels[i] + for i, head in enumerate(proj_heads)] # count label frequencies if label_freq_cutoff > 0: for label in deco_labels: if is_decorated(label): - freqs[label] = freqs.get(label,0) + 1 - prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts)) + freqs[label] = freqs.get(label, 0) + 1 + prepro_sents.append( + ((ids, words, tags, proj_heads, deco_labels, iob), ctnts)) preprocessed.append((raw_text, prepro_sents)) - if 
label_freq_cutoff > 0: - return _filter_labels(preprocessed,label_freq_cutoff,freqs) + return _filter_labels(preprocessed, label_freq_cutoff, freqs) return preprocessed def projectivize(heads, labels): - # use the algorithm by Nivre & Nilsson 2005 - # assumes heads to be a proper tree, i.e. connected and cycle-free - # returns a new pair (heads,labels) which encode - # a projective and decorated tree + # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper + # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels) + # which encode a projective and decorated tree. proj_heads = copy(heads) smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) - if smallest_np_arc == None: # this sentence is already projective + if smallest_np_arc is None: # this sentence is already projective return proj_heads, copy(labels) - while smallest_np_arc != None: + while smallest_np_arc is not None: _lift(smallest_np_arc, proj_heads) smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) deco_labels = _decorate(heads, proj_heads, labels) @@ -114,24 +112,26 @@ def projectivize(heads, labels): def deprojectivize(tokens): - # reattach arcs with decorated labels (following HEAD scheme) - # for each decorated arc X||Y, search top-down, left-to-right, - # breadth-first until hitting a Y then make this the new head + # Reattach arcs with decorated labels (following HEAD scheme). For each + # decorated arc X||Y, search top-down, left-to-right, breadth-first until + # hitting a Y then make this the new head. for token in tokens: if is_decorated(token.dep_): - newlabel,headlabel = decompose(token.dep_) - newhead = _find_new_head(token,headlabel) + newlabel, headlabel = decompose(token.dep_) + newhead = _find_new_head(token, headlabel) token.head = newhead token.dep_ = newlabel return tokens + def _decorate(heads, proj_heads, labels): # uses decoration scheme HEAD from Nivre & Nilsson 2005 assert(len(heads) == len(proj_heads) == len(labels)) deco_labels = [] - for tokenid,head in enumerate(heads): + for tokenid, head in enumerate(heads): if head != proj_heads[tokenid]: - deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head])) + deco_labels.append( + '%s%s%s' % (labels[tokenid], DELIMITER, labels[head])) else: deco_labels.append(labels[tokenid]) return deco_labels @@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads): # and ties are broken left to right smallest_size = float('inf') smallest_np_arc = None - for tokenid,head in enumerate(heads): + for tokenid, head in enumerate(heads): size = abs(tokenid-head) - if size < smallest_size and is_nonproj_arc(tokenid,heads): + if size < smallest_size and is_nonproj_arc(tokenid, heads): smallest_size = size smallest_np_arc = tokenid return smallest_np_arc @@ -168,8 +168,10 @@ def _find_new_head(token, headlabel): next_queue = [] for qtoken in queue: for child in qtoken.children: - if child.is_space: continue - if child == token: continue + if child.is_space: + continue + if child == token: + continue if child.dep_ == headlabel: return child next_queue.append(child) @@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs): for raw_text, sents in gold_tuples: filtered_sents = [] for (ids, words, tags, heads, labels, iob), ctnts in sents: - filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ] - filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) + filtered_labels = [decompose(label)[0] + if freqs.get(label, cutoff) < cutoff + else label for 
label in labels] + filtered_sents.append( + ((ids, words, tags, heads, filtered_labels, iob), ctnts)) filtered.append((raw_text, filtered_sents)) return filtered diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd deleted file mode 100644 index 95b6c3d3f..000000000 --- a/spacy/syntax/parser.pxd +++ /dev/null @@ -1,24 +0,0 @@ -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.typedefs cimport atom_t -from thinc.structs cimport FeatureC - -from .stateclass cimport StateClass -from .arc_eager cimport TransitionSystem -from ..vocab cimport Vocab -from ..tokens.doc cimport Doc -from ..structs cimport TokenC -from ._state cimport StateC - - -cdef class ParserModel(AveragedPerceptron): - cdef int set_featuresC(self, atom_t* context, FeatureC* features, - const StateC* state) nogil - - -cdef class Parser: - cdef readonly Vocab vocab - cdef readonly ParserModel model - cdef readonly TransitionSystem moves - cdef readonly object cfg - - cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx deleted file mode 100644 index 78698db12..000000000 --- a/spacy/syntax/parser.pyx +++ /dev/null @@ -1,526 +0,0 @@ -""" -MALT-style dependency parser -""" -# coding: utf-8 -# cython: infer_types=True -from __future__ import unicode_literals - -from collections import Counter -import ujson - -cimport cython -cimport cython.parallel - -import numpy.random - -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -from cpython.exc cimport PyErr_CheckSignals -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memset, memcpy -from libc.stdlib cimport malloc, calloc, free -from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.linalg cimport VecVec -from thinc.structs cimport SparseArrayC, FeatureC, ExampleC -from thinc.extra.eg cimport Example -from cymem.cymem cimport Pool, Address -from murmurhash.mrmr cimport hash64 -from preshed.maps cimport MapStruct -from preshed.maps cimport map_get - -from . import _parse_features -from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport fill_context -from .stateclass cimport StateClass -from ._state cimport StateC -from .transition_system import OracleError -from .transition_system cimport TransitionSystem, Transition -from ..structs cimport TokenC -from ..tokens.doc cimport Doc -from ..strings cimport StringStore -from ..gold cimport GoldParse - - -USE_FTRL = True -DEBUG = False -def set_debug(val): - global DEBUG - DEBUG = val - - -def get_templates(name): - pf = _parse_features - if name == 'ner': - return pf.ner - elif name == 'debug': - return pf.unigrams - elif name.startswith('embed'): - return (pf.words, pf.tags, pf.labels) - else: - return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ - pf.tree_shape + pf.trigrams) - - -cdef class ParserModel(AveragedPerceptron): - cdef int set_featuresC(self, atom_t* context, FeatureC* features, - const StateC* state) nogil: - fill_context(context, state) - nr_feat = self.extracter.set_features(features, context) - return nr_feat - - def update(self, Example eg, itn=0): - """ - Does regression on negative cost. Sort of cute? 
- """ - self.time += 1 - cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class) - cdef int guess = eg.guess - if guess == best or best == -1: - return 0.0 - cdef FeatureC feat - cdef int clas - cdef weight_t gradient - if USE_FTRL: - for feat in eg.c.features[:eg.c.nr_feat]: - for clas in range(eg.c.nr_class): - if eg.c.is_valid[clas] and eg.c.scores[clas] >= eg.c.scores[best]: - gradient = eg.c.scores[clas] + eg.c.costs[clas] - self.update_weight_ftrl(feat.key, clas, feat.value * gradient) - else: - for feat in eg.c.features[:eg.c.nr_feat]: - self.update_weight(feat.key, guess, feat.value * eg.c.costs[guess]) - self.update_weight(feat.key, best, -feat.value * eg.c.costs[guess]) - return eg.c.costs[guess] - - def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0): - cdef Pool mem = Pool() - features = mem.alloc(self.nr_feat, sizeof(FeatureC)) - - cdef StateClass stcls - - cdef class_t clas - self.time += 1 - cdef atom_t[CONTEXT_SIZE] atoms - histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist] - if not histories: - return None - gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))] - for d_loss, history in histories: - stcls = StateClass.init(doc.c, doc.length) - moves.initialize_state(stcls.c) - for clas in history: - nr_feat = self.set_featuresC(atoms, features, stcls.c) - clas_grad = gradient[clas] - for feat in features[:nr_feat]: - clas_grad[feat.key] += d_loss * feat.value - moves.c[clas].do(stcls.c, moves.c[clas].label) - cdef feat_t key - cdef weight_t d_feat - for clas, clas_grad in enumerate(gradient): - for key, d_feat in clas_grad.items(): - if d_feat != 0: - self.update_weight_ftrl(key, clas, d_feat) - - -cdef class Parser: - """ - Base class of the DependencyParser and EntityRecognizer. - """ - @classmethod - def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg): - """ - Load the statistical model from the supplied path. - - Arguments: - path (Path): - The path to load from. - vocab (Vocab): - The vocabulary. Must be shared by the documents to be processed. - require (bool): - Whether to raise an error if the files are not found. - Returns (Parser): - The newly constructed object. - """ - with (path / 'config.json').open() as file_: - cfg = ujson.load(file_) - # TODO: remove this shim when we don't have to support older data - if 'labels' in cfg and 'actions' not in cfg: - cfg['actions'] = cfg.pop('labels') - # TODO: remove this shim when we don't have to support older data - for action_name, labels in dict(cfg.get('actions', {})).items(): - # We need this to be sorted - if isinstance(labels, dict): - labels = list(sorted(labels.keys())) - cfg['actions'][action_name] = labels - self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg) - if (path / 'model').exists(): - self.model.load(str(path / 'model')) - elif require: - raise IOError( - "Required file %s/model not found when loading" % str(path)) - return self - - def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg): - """ - Create a Parser. - - Arguments: - vocab (Vocab): - The vocabulary object. Must be shared with documents to be processed. - model (thinc.linear.AveragedPerceptron): - The statistical model. - Returns (Parser): - The newly constructed object. 
- """ - if TransitionSystem is None: - TransitionSystem = self.TransitionSystem - self.vocab = vocab - cfg['actions'] = TransitionSystem.get_actions(**cfg) - self.moves = TransitionSystem(vocab.strings, cfg['actions']) - # TODO: Remove this when we no longer need to support old-style models - if isinstance(cfg.get('features'), basestring): - cfg['features'] = get_templates(cfg['features']) - elif 'features' not in cfg: - cfg['features'] = self.feature_templates - - self.model = ParserModel(cfg['features']) - self.model.l1_penalty = cfg.get('L1', 0.0) - self.model.learn_rate = cfg.get('learn_rate', 0.001) - - self.cfg = cfg - # TODO: This is a pretty hacky fix to the problem of adding more - # labels. The issue is they come in out of order, if labels are - # added during training - for label in cfg.get('extra_labels', []): - self.add_label(label) - - def __reduce__(self): - return (Parser, (self.vocab, self.moves, self.model), None, None) - - def __call__(self, Doc tokens): - """ - Apply the entity recognizer, setting the annotations onto the Doc object. - - Arguments: - doc (Doc): The document to be processed. - Returns: - None - """ - cdef int nr_feat = self.model.nr_feat - with nogil: - status = self.parseC(tokens.c, tokens.length, nr_feat) - # Check for KeyboardInterrupt etc. Untested - PyErr_CheckSignals() - if status != 0: - raise ParserStateError(tokens) - self.moves.finalize_doc(tokens) - - def pipe(self, stream, int batch_size=1000, int n_threads=2): - """ - Process a stream of documents. - - Arguments: - stream: The sequence of documents to process. - batch_size (int): - The number of documents to accumulate into a working set. - n_threads (int): - The number of threads with which to work on the buffer in parallel. - Yields (Doc): Documents, in order. - """ - cdef Pool mem = Pool() - cdef TokenC** doc_ptr = mem.alloc(batch_size, sizeof(TokenC*)) - cdef int* lengths = mem.alloc(batch_size, sizeof(int)) - cdef Doc doc - cdef int i - cdef int nr_feat = self.model.nr_feat - cdef int status - queue = [] - for doc in stream: - doc_ptr[len(queue)] = doc.c - lengths[len(queue)] = doc.length - queue.append(doc) - if len(queue) == batch_size: - with nogil: - for i in cython.parallel.prange(batch_size, num_threads=n_threads): - status = self.parseC(doc_ptr[i], lengths[i], nr_feat) - if status != 0: - with gil: - raise ParserStateError(queue[i]) - PyErr_CheckSignals() - for doc in queue: - self.moves.finalize_doc(doc) - yield doc - queue = [] - batch_size = len(queue) - with nogil: - for i in cython.parallel.prange(batch_size, num_threads=n_threads): - status = self.parseC(doc_ptr[i], lengths[i], nr_feat) - if status != 0: - with gil: - raise ParserStateError(queue[i]) - PyErr_CheckSignals() - for doc in queue: - self.moves.finalize_doc(doc) - yield doc - - cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil: - state = new StateC(tokens, length) - # NB: This can change self.moves.n_moves! 
- # I think this causes memory errors if called by .pipe() - self.moves.initialize_state(state) - nr_class = self.moves.n_moves - - cdef ExampleC eg - eg.nr_feat = nr_feat - eg.nr_atom = CONTEXT_SIZE - eg.nr_class = nr_class - eg.features = calloc(sizeof(FeatureC), nr_feat) - eg.atoms = calloc(sizeof(atom_t), CONTEXT_SIZE) - eg.scores = calloc(sizeof(weight_t), nr_class) - eg.is_valid = calloc(sizeof(int), nr_class) - cdef int i - while not state.is_final(): - eg.nr_feat = self.model.set_featuresC(eg.atoms, eg.features, state) - self.moves.set_valid(eg.is_valid, state) - self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat) - - guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class) - if guess < 0: - return 1 - - action = self.moves.c[guess] - - action.do(state, action.label) - memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class) - for i in range(eg.nr_class): - eg.is_valid[i] = 1 - self.moves.finalize_state(state) - for i in range(length): - tokens[i] = state._sent[i] - del state - free(eg.features) - free(eg.atoms) - free(eg.scores) - free(eg.is_valid) - return 0 - - def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0): - """ - Update the statistical model. - - Arguments: - doc (Doc): - The example document for the update. - gold (GoldParse): - The gold-standard annotations, to calculate the loss. - Returns (float): - The loss on this example. - """ - self.moves.preprocess_gold(gold) - cdef StateClass stcls = StateClass.init(tokens.c, tokens.length) - self.moves.initialize_state(stcls.c) - cdef Pool mem = Pool() - cdef Example eg = Example( - nr_class=self.moves.n_moves, - nr_atom=CONTEXT_SIZE, - nr_feat=self.model.nr_feat) - cdef weight_t loss = 0 - cdef Transition action - cdef double dropout_rate = self.cfg.get('dropout', drop) - while not stcls.is_final(): - eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features, - stcls.c) - dropout(eg.c.features, eg.c.nr_feat, dropout_rate) - self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) - self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) - guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) - self.model.update(eg) - - action = self.moves.c[guess] - action.do(stcls.c, action.label) - loss += eg.costs[guess] - eg.fill_scores(0, eg.c.nr_class) - eg.fill_costs(0, eg.c.nr_class) - eg.fill_is_valid(1, eg.c.nr_class) - - self.moves.finalize_state(stcls.c) - return loss - - def step_through(self, Doc doc, GoldParse gold=None): - """ - Set up a stepwise state, to introspect and control the transition sequence. - - Arguments: - doc (Doc): The document to step through. - gold (GoldParse): Optional gold parse - Returns (StepwiseState): - A state object, to step through the annotation process. - """ - return StepwiseState(self, doc, gold=gold) - - def from_transition_sequence(self, Doc doc, sequence): - """Control the annotations on a document by specifying a transition sequence - to follow. - - Arguments: - doc (Doc): The document to annotate. - sequence: A sequence of action names, as unicode strings. - Returns: None - """ - with self.step_through(doc) as stepwise: - for transition in sequence: - stepwise.transition(transition) - - def add_label(self, label): - # Doesn't set label into serializer -- subclasses override it to do that. - for action in self.moves.action_types: - added = self.moves.add_action(action, label) - if added: - # Important that the labels be stored as a list! 
We need the - # order, or the model goes out of synch - self.cfg.setdefault('extra_labels', []).append(label) - - -cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1: - if prob <= 0 or prob >= 1.: - return 0 - cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat) - cdef double* probs = &py_probs[0] - for i in range(nr_feat): - if probs[i] >= prob: - feats[i].value /= prob - else: - feats[i].value = 0. - - -cdef class StepwiseState: - cdef readonly StateClass stcls - cdef readonly Example eg - cdef readonly Doc doc - cdef readonly GoldParse gold - cdef readonly Parser parser - - def __init__(self, Parser parser, Doc doc, GoldParse gold=None): - self.parser = parser - self.doc = doc - if gold is not None: - self.gold = gold - self.parser.moves.preprocess_gold(self.gold) - else: - self.gold = GoldParse(doc) - self.stcls = StateClass.init(doc.c, doc.length) - self.parser.moves.initialize_state(self.stcls.c) - self.eg = Example( - nr_class=self.parser.moves.n_moves, - nr_atom=CONTEXT_SIZE, - nr_feat=self.parser.model.nr_feat) - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.finish() - - @property - def is_final(self): - return self.stcls.is_final() - - @property - def stack(self): - return self.stcls.stack - - @property - def queue(self): - return self.stcls.queue - - @property - def heads(self): - return [self.stcls.H(i) for i in range(self.stcls.c.length)] - - @property - def deps(self): - return [self.doc.vocab.strings[self.stcls.c._sent[i].dep] - for i in range(self.stcls.c.length)] - - @property - def costs(self): - """ - Find the action-costs for the current state. - """ - if not self.gold: - raise ValueError("Can't set costs: No GoldParse provided") - self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs, - self.stcls, self.gold) - costs = {} - for i in range(self.parser.moves.n_moves): - if not self.eg.c.is_valid[i]: - continue - transition = self.parser.moves.c[i] - name = self.parser.moves.move_name(transition.move, transition.label) - costs[name] = self.eg.c.costs[i] - return costs - - def predict(self): - self.eg.reset() - self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features, - self.stcls.c) - self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c) - self.parser.model.set_scoresC(self.eg.c.scores, - self.eg.c.features, self.eg.c.nr_feat) - - cdef Transition action = self.parser.moves.c[self.eg.guess] - return self.parser.moves.move_name(action.move, action.label) - - def transition(self, action_name=None): - if action_name is None: - action_name = self.predict() - moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} - if action_name == '_': - action_name = self.predict() - action = self.parser.moves.lookup_transition(action_name) - elif action_name == 'L' or action_name == 'R': - self.predict() - move = moves[action_name] - clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c, - self.eg.c.nr_class) - action = self.parser.moves.c[clas] - else: - action = self.parser.moves.lookup_transition(action_name) - action.do(self.stcls.c, action.label) - - def finish(self): - if self.stcls.is_final(): - self.parser.moves.finalize_state(self.stcls.c) - self.doc.set_parse(self.stcls.c._sent) - self.parser.moves.finalize_doc(self.doc) - - -class ParserStateError(ValueError): - def __init__(self, doc): - ValueError.__init__(self, - "Error analysing doc -- no valid actions available. This should " - "never happen, so please report the error on the issue tracker. 
" - "Here's the thread to do so --- reopen it if it's closed:\n" - "https://github.com/spacy-io/spaCy/issues/429\n" - "Please include the text that the parser failed on, which is:\n" - "%s" % repr(doc.text)) - -cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil: - cdef int best = -1 - for i in range(n): - if costs[i] <= 0: - if best == -1 or scores[i] > scores[best]: - best = i - return best - - -cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, - int nr_class) except -1: - cdef weight_t score = 0 - cdef int mode = -1 - cdef int i - for i in range(nr_class): - if actions[i].move == move and (mode == -1 or scores[i] >= score): - mode = i - score = scores[i] - return mode diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 228a3ff91..ea0ec77e5 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -2,16 +2,8 @@ # cython: infer_types=True from __future__ import unicode_literals -from libc.string cimport memcpy, memset -from libc.stdint cimport uint32_t, uint64_t +import numpy -from ..vocab cimport EMPTY_LEXEME -from ..structs cimport Entity -from ..lexeme cimport Lexeme -from ..symbols cimport punct -from ..attrs cimport IS_SPACE -from ..attrs cimport attr_id_t -from ..tokens.token cimport Token from ..tokens.doc cimport Doc @@ -38,6 +30,13 @@ cdef class StateClass: def token_vector_lenth(self): return self.doc.tensor.shape[1] + @property + def history(self): + hist = numpy.ndarray((8,), dtype='i') + for i in range(8): + hist[i] = self.c.get_hist(i+1) + return hist + def is_final(self): return self.c.is_final() @@ -54,27 +53,3 @@ cdef class StateClass: n0 = words[self.B(0)] n1 = words[self.B(1)] return ' '.join((third, second, top, '|', n0, n1)) - - @classmethod - def nr_context_tokens(cls): - return 13 - - def set_context_tokens(self, int[::1] output): - output[0] = self.B(0) - output[1] = self.B(1) - output[2] = self.S(0) - output[3] = self.S(1) - output[4] = self.S(2) - output[5] = self.L(self.S(0), 1) - output[6] = self.L(self.S(0), 2) - output[6] = self.R(self.S(0), 1) - output[7] = self.L(self.B(0), 1) - output[8] = self.R(self.S(0), 2) - output[9] = self.L(self.S(1), 1) - output[10] = self.L(self.S(1), 2) - output[11] = self.R(self.S(1), 1) - output[12] = self.R(self.S(1), 2) - - for i in range(13): - if output[i] != -1: - output[i] += self.c.offset diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 055129c8b..c351636c4 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -2,17 +2,17 @@ # coding: utf-8 from __future__ import unicode_literals -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF +from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t -from collections import defaultdict, OrderedDict +from collections import OrderedDict import ujson -from .. import util from ..structs cimport TokenC from .stateclass cimport StateClass -from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB from ..typedefs cimport attr_t +from ..compat import json_dumps +from .. 
import util cdef weight_t MIN_SCORE = -90000 @@ -136,11 +136,12 @@ cdef class TransitionSystem: print([gold.c.ner[i].clas for i in range(gold.length)]) print([gold.c.ner[i].move for i in range(gold.length)]) print([gold.c.ner[i].label for i in range(gold.length)]) - print("Self labels", [self.c[i].label for i in range(self.n_moves)]) + print("Self labels", + [self.c[i].label for i in range(self.n_moves)]) raise ValueError( "Could not find a gold-standard action to supervise " - "the entity recognizer\n" - "The transition system has %d actions." % (self.n_moves)) + "the entity recognizer. The transition system has " + "%d actions." % (self.n_moves)) def get_class_name(self, int clas): act = self.c[clas] @@ -148,7 +149,8 @@ cdef class TransitionSystem: def add_action(self, int action, label_name): cdef attr_t label_id - if not isinstance(label_name, (int, long)): + if not isinstance(label_name, int) and \ + not isinstance(label_name, long): label_id = self.strings.add(label_name) else: label_id = label_name @@ -185,7 +187,7 @@ cdef class TransitionSystem: 'name': self.move_name(trans.move, trans.label) }) serializers = { - 'transitions': lambda: ujson.dumps(transitions), + 'transitions': lambda: json_dumps(transitions), 'strings': lambda: self.strings.to_bytes() } return util.to_bytes(serializers, exclude) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd deleted file mode 100644 index 6d2cef1f4..000000000 --- a/spacy/tagger.pxd +++ /dev/null @@ -1,17 +0,0 @@ -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.extra.eg cimport Example -from thinc.structs cimport ExampleC - -from .structs cimport TokenC -from .vocab cimport Vocab - - -cdef class TaggerModel(AveragedPerceptron): - cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except * - - -cdef class Tagger: - cdef readonly Vocab vocab - cdef readonly TaggerModel model - cdef public dict freqs - cdef public object cfg diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx deleted file mode 100644 index 0fadea15d..000000000 --- a/spacy/tagger.pyx +++ /dev/null @@ -1,253 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from collections import defaultdict - -from cymem.cymem cimport Pool -from thinc.typedefs cimport atom_t -from thinc.extra.eg cimport Example -from thinc.structs cimport ExampleC -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.linalg cimport VecVec - -from .tokens.doc cimport Doc -from .attrs cimport TAG -from .gold cimport GoldParse -from .attrs cimport * - - -cpdef enum: - P2_orth - P2_cluster - P2_shape - P2_prefix - P2_suffix - P2_pos - P2_lemma - P2_flags - - P1_orth - P1_cluster - P1_shape - P1_prefix - P1_suffix - P1_pos - P1_lemma - P1_flags - - W_orth - W_cluster - W_shape - W_prefix - W_suffix - W_pos - W_lemma - W_flags - - N1_orth - N1_cluster - N1_shape - N1_prefix - N1_suffix - N1_pos - N1_lemma - N1_flags - - N2_orth - N2_cluster - N2_shape - N2_prefix - N2_suffix - N2_pos - N2_lemma - N2_flags - - N_CONTEXT_FIELDS - - -cdef class TaggerModel(AveragedPerceptron): - def update(self, Example eg): - self.time += 1 - guess = eg.guess - best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class) - if guess != best: - for feat in eg.c.features[:eg.c.nr_feat]: - self.update_weight(feat.key, best, -feat.value) - self.update_weight(feat.key, guess, feat.value) - - cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *: - _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2]) - _fill_from_token(&eg.atoms[P1_orth], 
&tokens[i-1]) - _fill_from_token(&eg.atoms[W_orth], &tokens[i]) - _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1]) - _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2]) - - eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.lower - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.tag - context[6] = t.lemma - if t.lex.flags & (1 << IS_ALPHA): - context[7] = 1 - elif t.lex.flags & (1 << IS_PUNCT): - context[7] = 2 - elif t.lex.flags & (1 << LIKE_URL): - context[7] = 3 - elif t.lex.flags & (1 << LIKE_NUM): - context[7] = 4 - else: - context[7] = 0 - - -cdef class Tagger: - """Annotate part-of-speech tags on Doc objects.""" - - def __init__(self, Vocab vocab, TaggerModel model=None, **cfg): - """Create a Tagger. - - vocab (Vocab): The vocabulary object. Must be shared with documents to - be processed. - model (thinc.linear.AveragedPerceptron): The statistical model. - RETURNS (Tagger): The newly constructed object. - """ - if model is None: - model = TaggerModel(cfg.get('features', self.feature_templates), - L1=0.0) - self.vocab = vocab - self.model = model - self.model.l1_penalty = 0.0 - # TODO: Move this to tag map - self.freqs = {TAG: defaultdict(int)} - for tag in self.tag_names: - self.freqs[TAG][self.vocab.strings[tag]] = 1 - self.freqs[TAG][0] = 1 - self.cfg = cfg - - @property - def tag_names(self): - return self.vocab.morphology.tag_names - - def __reduce__(self): - return (self.__class__, (self.vocab, self.model), None, None) - - def tag_from_strings(self, Doc tokens, object tag_strs): - cdef int i - for i in range(tokens.length): - self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i]) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def __call__(self, Doc tokens): - """Apply the tagger, setting the POS tags onto the Doc object. - - doc (Doc): The tokens to be tagged. - """ - if tokens.length == 0: - return 0 - - cdef Pool mem = Pool() - - cdef int i, tag - cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS, - nr_class=self.vocab.morphology.n_tags, - nr_feat=self.model.nr_feat) - for i in range(tokens.length): - if tokens.c[i].pos == 0: - self.model.set_featuresC(&eg.c, tokens.c, i) - self.model.set_scoresC(eg.c.scores, - eg.c.features, eg.c.nr_feat) - guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) - self.vocab.morphology.assign_tag_id(&tokens.c[i], guess) - eg.fill_scores(0, eg.c.nr_class) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def pipe(self, stream, batch_size=1000, n_threads=2): - """Tag a stream of documents. - - Arguments: - stream: The sequence of documents to tag. - batch_size (int): The number of documents to accumulate into a working set. - n_threads (int): The number of threads with which to work on the buffer - in parallel, if the Matcher implementation supports multi-threading. - YIELDS (Doc): Documents, in order. - """ - for doc in stream: - self(doc) - yield doc - - def update(self, Doc tokens, GoldParse gold, itn=0): - """Update the statistical model, with tags supplied for the given document. - - doc (Doc): The document to update on. - gold (GoldParse): Manager for the gold-standard tags. - RETURNS (int): Number of tags predicted correctly. 
- """ - gold_tag_strs = gold.tags - assert len(tokens) == len(gold_tag_strs) - for tag in gold_tag_strs: - if tag != None and tag not in self.tag_names: - msg = ("Unrecognized gold tag: %s. tag_map.json must contain all " - "gold tags, to maintain coarse-grained mapping.") - raise ValueError(msg % tag) - golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] - cdef int correct = 0 - cdef Pool mem = Pool() - cdef Example eg = Example( - nr_atom=N_CONTEXT_FIELDS, - nr_class=self.vocab.morphology.n_tags, - nr_feat=self.model.nr_feat) - for i in range(tokens.length): - self.model.set_featuresC(&eg.c, tokens.c, i) - eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ] - self.model.set_scoresC(eg.c.scores, - eg.c.features, eg.c.nr_feat) - self.model.update(eg) - - self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess) - - correct += eg.cost == 0 - self.freqs[TAG][tokens.c[i].tag] += 1 - eg.fill_scores(0, eg.c.nr_class) - eg.fill_costs(0, eg.c.nr_class) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - return correct - - - feature_templates = ( - (W_orth,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_orth,), - (N2_orth,), - - (W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - (P1_pos, W_orth), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - (P1_cluster,), - (P2_cluster,), - - (W_flags,), - (N1_flags,), - (N2_flags,), - (P1_flags,), - (P2_flags,), - ) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 4da1ae301..5fa0c0cb7 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -11,8 +11,12 @@ from ..strings import StringStore from .. import util +# These languages are used for generic tokenizer tests – only add a language +# here if it's using spaCy's tokenizer (not a different library) +# TODO: re-implement generic tokenizer tests _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'ga', 'xx'] + _models = {'en': ['en_core_web_sm'], 'de': ['de_core_news_md'], 'fr': ['fr_depvec_web_lg'], @@ -42,6 +46,7 @@ def FR(request): #lang = util.get_lang_class(request.param) #return lang.Defaults.create_tokenizer() + @pytest.fixture def tokenizer(): return util.get_lang_class('xx').Defaults.create_tokenizer() @@ -58,8 +63,9 @@ def en_vocab(): @pytest.fixture -def en_parser(): - return util.get_lang_class('en').Defaults.create_parser() +def en_parser(en_vocab): + nlp = util.get_lang_class('en')(en_vocab) + return nlp.create_pipe('parser') @pytest.fixture @@ -86,10 +92,12 @@ def hu_tokenizer(): def fi_tokenizer(): return util.get_lang_class('fi').Defaults.create_tokenizer() + @pytest.fixture def id_tokenizer(): return util.get_lang_class('id').Defaults.create_tokenizer() + @pytest.fixture def sv_tokenizer(): return util.get_lang_class('sv').Defaults.create_tokenizer() @@ -108,10 +116,26 @@ def ga_tokenizer(): def he_tokenizer(): return util.get_lang_class('he').Defaults.create_tokenizer() + @pytest.fixture def nb_tokenizer(): return util.get_lang_class('nb').Defaults.create_tokenizer() +@pytest.fixture +def da_tokenizer(): + return util.get_lang_class('da').Defaults.create_tokenizer() + +@pytest.fixture +def ja_tokenizer(): + janome = pytest.importorskip("janome") + return util.get_lang_class('ja').Defaults.create_tokenizer() + + +@pytest.fixture +def th_tokenizer(): + pythainlp = pytest.importorskip("pythainlp") + return 
util.get_lang_class('th').Defaults.create_tokenizer() + @pytest.fixture def stringstore(): @@ -127,6 +151,7 @@ def en_entityrecognizer(): def text_file(): return StringIO() + @pytest.fixture def text_file_b(): return BytesIO() diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index cc74aa0ae..cd444ba81 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -10,7 +10,8 @@ import pytest def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - ner = EntityRecognizer(en_vocab, features=[(2,), (3,)]) + ner = EntityRecognizer(en_vocab) + ner.begin_training([]) ner(doc) assert len(list(doc.ents)) == 0 diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index dd87aa763..ff10394d1 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab): assert feats_array[0][0] != feats_array[0][1] +def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab): + text = "An example sentence" + tokens = en_tokenizer(text) + example = tokens.vocab["example"] + assert example.orth != example.shape + feats_array = tokens.to_array((ORTH, SHAPE)) + feats_array_stringy = tokens.to_array(("ORTH", "SHAPE")) + assert feats_array_stringy[0][0] == feats_array[0][0] + assert feats_array_stringy[0][1] == feats_array[0][1] + + +def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab): + text = "An example sentence" + tokens = en_tokenizer(text) + example = tokens.vocab["example"] + assert example.orth != example.shape + feats_array = tokens.to_array(ORTH) + assert feats_array.shape == (3,) + + def test_doc_array_tag(en_tokenizer): text = "A nice sentence." 
pos = ['DET', 'ADJ', 'NOUN', 'PUNCT'] diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py new file mode 100644 index 000000000..c14fdfbe9 --- /dev/null +++ b/spacy/tests/doc/test_creation.py @@ -0,0 +1,37 @@ +'''Test Doc sets up tokens correctly.''' +from __future__ import unicode_literals +import pytest + +from ...vocab import Vocab +from ...tokens.doc import Doc +from ...lemmatizer import Lemmatizer + + +@pytest.fixture +def lemmatizer(): + return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'}) + + +@pytest.fixture +def vocab(lemmatizer): + return Vocab(lemmatizer=lemmatizer) + + +def test_empty_doc(vocab): + doc = Doc(vocab) + assert len(doc) == 0 + + +def test_single_word(vocab): + doc = Doc(vocab, words=['a']) + assert doc.text == 'a ' + doc = Doc(vocab, words=['a'], spaces=[False]) + assert doc.text == 'a' + + +def test_lookup_lemmatization(vocab): + doc = Doc(vocab, words=['dogs', 'dogses']) + assert doc[0].text == 'dogs' + assert doc[0].lemma_ == 'dog' + assert doc[1].text == 'dogses' + assert doc[1].lemma_ == 'dogses' diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index cbe1bbc66..8f881e811 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals from ..util import get_doc +from ...tokens import Doc +from ...vocab import Vocab import pytest import numpy @@ -204,19 +206,20 @@ def test_doc_api_right_edge(en_tokenizer): assert doc[6].right_edge.text == ',' -@pytest.mark.xfail -@pytest.mark.parametrize('text,vectors', [ - ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"]) -]) -def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): - text_file.write('\n'.join(vectors)) - text_file.seek(0) - vector_length = en_tokenizer.vocab.load_vectors(text_file) - assert vector_length == 3 - - doc = en_tokenizer(text) +def test_doc_api_has_vector(): + vocab = Vocab() + vocab.clear_vectors(2) + vocab.vectors.add('kitten', vector=numpy.asarray([0., 2.], dtype='f')) + doc = Doc(vocab, words=['kitten']) assert doc.has_vector +def test_lowest_common_ancestor(en_tokenizer): + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc.get_lca_matrix() + assert(lca[1, 1] == 1) + assert(lca[0, 1] == 2) + assert(lca[1, 2] == 2) def test_parse_tree(en_tokenizer): """Tests doc.print_tree() method.""" diff --git a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py new file mode 100644 index 000000000..93f06f2c3 --- /dev/null +++ b/spacy/tests/doc/test_pickle_doc.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals + +from ...language import Language +from ...compat import pickle, unicode_ + + +def test_pickle_single_doc(): + nlp = Language() + doc = nlp(u'pickle roundtrip') + data = pickle.dumps(doc, 1) + doc2 = pickle.loads(data) + assert doc2.text == 'pickle roundtrip' + + +def test_list_of_docs_pickles_efficiently(): + nlp = Language() + for i in range(10000): + _ = nlp.vocab[unicode_(i)] + one_pickled = pickle.dumps(nlp(u'0'), -1) + docs = list(nlp.pipe(unicode_(i) for i in range(100))) + many_pickled = pickle.dumps(docs, -1) + assert len(many_pickled) < (len(one_pickled) * 2) + many_unpickled = pickle.loads(many_pickled) + assert many_unpickled[0].text == '0' + assert many_unpickled[-1].text == '99' + assert len(many_unpickled) == 100 + + +def test_user_data_from_disk(): + nlp = Language() + doc = 
nlp(u'Hello') + doc.user_data[(0, 1)] = False + b = doc.to_bytes() + doc2 = doc.__class__(doc.vocab).from_bytes(b) + assert doc2.user_data[(0, 1)] == False + +def test_user_data_unpickles(): + nlp = Language() + doc = nlp(u'Hello') + doc.user_data[(0, 1)] = False + b = pickle.dumps(doc) + doc2 = pickle.loads(b) + assert doc2.user_data[(0, 1)] == False + + +def test_hooks_unpickle(): + def inner_func(d1, d2): + return 'hello!' + nlp = Language() + doc = nlp(u'Hello') + doc.user_hooks['similarity'] = inner_func + b = pickle.dumps(doc) + doc2 = pickle.loads(b) + assert doc2.similarity(None) == 'hello!' diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 00caa1445..a52be9731 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from ...attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP from ..util import get_doc +from ...vocab import Vocab +from ...tokens import Doc import pytest import numpy @@ -68,26 +70,21 @@ def test_doc_token_api_is_properties(en_vocab): assert doc[5].like_email -@pytest.mark.xfail -@pytest.mark.parametrize('text,vectors', [ - ("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"]) -]) -def test_doc_token_api_vectors(en_tokenizer, text_file, text, vectors): - text_file.write('\n'.join(vectors)) - text_file.seek(0) - vector_length = en_tokenizer.vocab.load_vectors(text_file) - assert vector_length == 3 +def test_doc_token_api_vectors(): + vocab = Vocab() + vocab.clear_vectors(2) + vocab.vectors.add('apples', vector=numpy.asarray([0., 2.], dtype='f')) + vocab.vectors.add('oranges', vector=numpy.asarray([0., 1.], dtype='f')) + doc = Doc(vocab, words=['apples', 'oranges', 'oov']) + assert doc.has_vector - tokens = en_tokenizer(text) - assert tokens[0].has_vector - assert tokens[1].has_vector - assert not tokens[2].has_vector - assert tokens[0].similarity(tokens[1]) > tokens[0].similarity(tokens[2]) - assert tokens[0].similarity(tokens[1]) == tokens[1].similarity(tokens[0]) - assert sum(tokens[0].vector) != sum(tokens[1].vector) - assert numpy.isclose( - tokens[0].vector_norm, - numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector))) + assert doc[0].has_vector + assert doc[1].has_vector + assert not doc[2].has_vector + apples_norm = (0*0 + 2*2) ** 0.5 + oranges_norm = (0*0 + 1*1) ** 0.5 + cosine = ((0*0) + (2*1)) / (apples_norm * oranges_norm) + assert doc[0].similarity(doc[1]) == cosine def test_doc_token_api_ancestors(en_tokenizer): diff --git a/spacy/syntax/iterators.pxd b/spacy/tests/lang/da/__init__.py similarity index 100% rename from spacy/syntax/iterators.pxd rename to spacy/tests/lang/da/__init__.py diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py new file mode 100644 index 000000000..d89fafd2c --- /dev/null +++ b/spacy/tests/lang/da/test_exceptions.py @@ -0,0 +1,15 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +@pytest.mark.parametrize('text', ["ca.", "m.a.o.", "Jan.", "Dec."]) +def test_da_tokenizer_handles_abbr(da_tokenizer, text): + tokens = da_tokenizer(text) + assert len(tokens) == 1 + +def test_da_tokenizer_handles_exc_in_text(da_tokenizer): + text = "Det er bl.a. ikke meningen" + tokens = da_tokenizer(text) + assert len(tokens) == 5 + assert tokens[2].text == "bl.a." 
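The Danish exception tests above rely on abbreviations such as "bl.a." and "ca." already being registered as tokenizer special cases. A minimal sketch of adding such a case at runtime, using the same add_special_case API that test_issue1250 further down exercises (the 'cirka' lemma is illustrative, not taken from the shipped language data):

    from spacy.lang.da import Danish
    from spacy.symbols import ORTH, LEMMA

    nlp = Danish()
    # Register the abbreviation as a single-token special case; attributes
    # such as LEMMA are applied to the resulting token.
    nlp.tokenizer.add_special_case('ca.', [{ORTH: 'ca.', LEMMA: 'cirka'}])
    doc = nlp('Det tager ca. en time')
    assert [t.text for t in doc] == ['Det', 'tager', 'ca.', 'en', 'time']
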
diff --git a/spacy/tests/lang/da/test_text.py b/spacy/tests/lang/da/test_text.py new file mode 100644 index 000000000..fa6a935f6 --- /dev/null +++ b/spacy/tests/lang/da/test_text.py @@ -0,0 +1,27 @@ +# coding: utf-8 +"""Test that longer and mixed texts are tokenized correctly.""" + + +from __future__ import unicode_literals + +import pytest + +def test_da_tokenizer_handles_long_text(da_tokenizer): + text = """Der var så dejligt ude på landet. Det var sommer, kornet stod gult, havren grøn, +høet var rejst i stakke nede i de grønne enge, og der gik storken på sine lange, +røde ben og snakkede ægyptisk, for det sprog havde han lært af sin moder. + +Rundt om ager og eng var der store skove, og midt i skovene dybe søer; jo, der var rigtignok dejligt derude på landet!""" + tokens = da_tokenizer(text) + assert len(tokens) == 84 + +@pytest.mark.parametrize('text,match', [ + ('10', True), ('1', True), ('10.000', True), ('10.00', True), + ('999,0', True), ('en', True), ('treoghalvfemsindstyvende', True), ('hundrede', True), + ('hund', False), (',', False), ('1/2', True)]) +def test_lex_attrs_like_number(da_tokenizer, text, match): + tokens = da_tokenizer(text) + assert len(tokens) == 1 + print(tokens[0]) + assert tokens[0].like_num == match + diff --git a/spacy/tests/lang/de/test_lemma.py b/spacy/tests/lang/de/test_lemma.py new file mode 100644 index 000000000..39b3b0313 --- /dev/null +++ b/spacy/tests/lang/de/test_lemma.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'), + ('engagierte', 'engagieren'), + ('schließt', 'schließen'), + ('vorgebenden', 'vorgebend')]) +def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma): + tokens = de_tokenizer(string) + assert tokens[0].lemma_ == lemma diff --git a/spacy/tests/lang/de/test_prefix_suffix_infix.py b/spacy/tests/lang/de/test_prefix_suffix_infix.py index dcf4f4ef0..bdc68037e 100644 --- a/spacy/tests/lang/de/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/de/test_prefix_suffix_infix.py @@ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text): assert len(tokens) == 4 -@pytest.mark.parametrize('text', ["blau-rot"]) -def test_tokenizer_splits_hyphens(de_tokenizer, text): - tokens = de_tokenizer(text) - assert len(tokens) == 3 - - @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) def test_tokenizer_splits_numeric_range(de_tokenizer, text): tokens = de_tokenizer(text) @@ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text): assert len(tokens) == 3 +@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt']) +def test_tokenizer_keeps_hyphens(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 1 + + def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") - assert len(tokens) == 12 + assert len(tokens) == 10 assert tokens[0].text == "Viele" assert tokens[1].text == "Regeln" assert tokens[2].text == "--" assert tokens[3].text == "wie" assert tokens[4].text == "die" - assert tokens[5].text == "Bindestrich" - assert tokens[6].text == "-" - assert tokens[7].text == "Regeln" - assert tokens[8].text == "--" - assert tokens[9].text == "sind" - assert tokens[10].text == "kompliziert" + assert tokens[5].text == "Bindestrich-Regeln" + assert tokens[6].text == "--" + assert tokens[7].text == "sind" + assert tokens[8].text == "kompliziert" 
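The change to the German infix rules above means hyphenated compounds are no longer split, while a double hyphen still separates tokens. A short sketch of the behaviour the updated tests assert, assuming the stock German tokenizer:

    from spacy.lang.de import German

    nlp = German()
    # 'Bindestrich-Regeln' stays a single token; '--' is split off on its own.
    doc = nlp('Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.')
    texts = [t.text for t in doc]
    assert 'Bindestrich-Regeln' in texts
    assert '--' in texts
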
diff --git a/spacy/tests/lang/de/test_text.py b/spacy/tests/lang/de/test_text.py index 84fa6f2a5..34180b982 100644 --- a/spacy/tests/lang/de/test_text.py +++ b/spacy/tests/lang/de/test_text.py @@ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. assert len(tokens) == 109 -@pytest.mark.parametrize('text,length', [ - ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1), - ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1), - ("Kraftfahrzeug-Haftpflichtversicherung", 3), - ("Vakuum-Mittelfrequenz-Induktionsofen", 5) +@pytest.mark.parametrize('text', [ + "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", + "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", + "Kraftfahrzeug-Haftpflichtversicherung", + "Vakuum-Mittelfrequenz-Induktionsofen" ]) -def test_tokenizer_handles_long_words(de_tokenizer, text, length): +def test_tokenizer_handles_long_words(de_tokenizer, text): tokens = de_tokenizer(text) - assert len(tokens) == length + assert len(tokens) == 1 @pytest.mark.parametrize('text,length', [ diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py new file mode 100644 index 000000000..1d35fb128 --- /dev/null +++ b/spacy/tests/lang/en/test_customized_tokenizer.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ....lang.en import English +from ....tokenizer import Tokenizer +from .... import util + + +@pytest.fixture +def custom_en_tokenizer(en_vocab): + prefix_re = util.compile_prefix_regex(English.Defaults.prefixes) + suffix_re = util.compile_suffix_regex(English.Defaults.suffixes) + custom_infixes = ['\.\.\.+', + '(?<=[0-9])-(?=[0-9])', + # '(?<=[0-9]+),(?=[0-9]+)', + '[0-9]+(,[0-9]+)+', + '[\[\]!&:,()\*—–\/-]'] + + infix_re = util.compile_infix_regex(custom_infixes) + return Tokenizer(en_vocab, + English.Defaults.tokenizer_exceptions, + prefix_re.search, + suffix_re.search, + infix_re.finditer, + token_match=None) + + +def test_customized_tokenizer_handles_infixes(custom_en_tokenizer): + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion." + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == ['The', '8', 'and', '10', '-', 'county', 'definitions', + 'are', 'not', 'used', 'for', 'the', 'greater', + 'Southern', 'California', 'Megaregion', '.'] + + # the trailing '-' may cause Assertion Error + sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion." + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == ['The', '8', '-', 'and', '10', '-', 'county', + 'definitions', 'are', 'not', 'used', 'for', 'the', + 'greater', 'Southern', 'California', 'Megaregion', '.'] diff --git a/spacy/tests/lang/en/test_lemmatizer.py b/spacy/tests/lang/en/test_lemmatizer.py index 00f02ccb4..22c8f2499 100644 --- a/spacy/tests/lang/en/test_lemmatizer.py +++ b/spacy/tests/lang/en/test_lemmatizer.py @@ -57,7 +57,5 @@ def test_en_lemmatizer_punct(en_lemmatizer): def test_en_lemmatizer_lemma_assignment(EN): text = "Bananas in pyjamas are geese." 
doc = EN.make_doc(text) - EN.tensorizer(doc) - assert all(t.lemma_ == '' for t in doc) EN.tagger(doc) assert all(t.lemma_ != '' for t in doc) diff --git a/spacy/tests/lang/en/test_models.py b/spacy/tests/lang/en/test_models.py index 4b1cf1f91..ab318213c 100644 --- a/spacy/tests/lang/en/test_models.py +++ b/spacy/tests/lang/en/test_models.py @@ -52,12 +52,13 @@ def test_en_models_vectors(example): # this isn't a perfect test since this could in principle fail # in a sane model as well, # but that's very unlikely and a good indicator if something is wrong - vector0 = example[0].vector - vector1 = example[1].vector - vector2 = example[2].vector - assert not numpy.array_equal(vector0,vector1) - assert not numpy.array_equal(vector0,vector2) - assert not numpy.array_equal(vector1,vector2) + if example.vocab.vectors_length: + vector0 = example[0].vector + vector1 = example[1].vector + vector2 = example[2].vector + assert not numpy.array_equal(vector0,vector1) + assert not numpy.array_equal(vector0,vector2) + assert not numpy.array_equal(vector1,vector2) @pytest.mark.xfail diff --git a/spacy/tests/lang/en/test_parser.py b/spacy/tests/lang/en/test_parser.py index 39d0fce61..9468fe09d 100644 --- a/spacy/tests/lang/en/test_parser.py +++ b/spacy/tests/lang/en/test_parser.py @@ -45,3 +45,33 @@ def test_parser_noun_chunks_pp_chunks(en_tokenizer): assert len(chunks) == 2 assert chunks[0].text_with_ws == "A phrase " assert chunks[1].text_with_ws == "another phrase " + + +def test_parser_noun_chunks_appositional_modifiers(en_tokenizer): + text = "Sam, my brother, arrived to the house." + heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4] + tags = ['NNP', ',', 'PRP$', 'NN', ',', 'VBD', 'IN', 'DT', 'NN', '.'] + deps = ['nsubj', 'punct', 'poss', 'appos', 'punct', 'ROOT', 'prep', 'det', 'pobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "Sam " + assert chunks[1].text_with_ws == "my brother " + assert chunks[2].text_with_ws == "the house " + + +def test_parser_noun_chunks_dative(en_tokenizer): + text = "She gave Bob a raise." 
+ heads = [1, 0, -1, 1, -3, -4] + tags = ['PRP', 'VBD', 'NNP', 'DT', 'NN', '.'] + deps = ['nsubj', 'ROOT', 'dative', 'det', 'dobj', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "She " + assert chunks[1].text_with_ws == "Bob " + assert chunks[2].text_with_ws == "a raise " diff --git a/spacy/tests/lang/ja/__init__.py b/spacy/tests/lang/ja/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py new file mode 100644 index 000000000..1e30973a3 --- /dev/null +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -0,0 +1,19 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +TOKENIZER_TESTS = [ + ("日本語だよ", ['日本語', 'だ', 'よ']), + ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']), + ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']), + ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']), + ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']) +] + + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens): + tokens = [token.text for token in ja_tokenizer(text)] + assert tokens == expected_tokens diff --git a/spacy/tests/lang/th/__init__.py b/spacy/tests/lang/th/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/th/test_tokenizer.py b/spacy/tests/lang/th/test_tokenizer.py new file mode 100644 index 000000000..f5925da1e --- /dev/null +++ b/spacy/tests/lang/th/test_tokenizer.py @@ -0,0 +1,13 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +TOKENIZER_TESTS = [ + ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_thai_tokenizer(th_tokenizer, text, expected_tokens): + tokens = [token.text for token in th_tokenizer(text)] + assert tokens == expected_tokens diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py new file mode 100644 index 000000000..9493452a1 --- /dev/null +++ b/spacy/tests/parser/test_add_label.py @@ -0,0 +1,72 @@ +'''Test the ability to add a label to a (potentially trained) parsing model.''' +from __future__ import unicode_literals +import pytest +import numpy.random +from thinc.neural.optimizers import Adam +from thinc.neural.ops import NumpyOps + +from ...attrs import NORM +from ...gold import GoldParse +from ...vocab import Vocab +from ...tokens import Doc +from ...pipeline import DependencyParser + +numpy.random.seed(0) + + +@pytest.fixture +def vocab(): + return Vocab(lex_attr_getters={NORM: lambda s: s}) + + +@pytest.fixture +def parser(vocab): + parser = DependencyParser(vocab) + parser.cfg['token_vector_width'] = 8 + parser.cfg['hidden_width'] = 30 + parser.cfg['hist_size'] = 0 + parser.add_label('left') + parser.begin_training([], **parser.cfg) + sgd = Adam(NumpyOps(), 0.001) + + for i in range(10): + losses = {} + doc = Doc(vocab, words=['a', 'b', 'c', 'd']) + gold = GoldParse(doc, heads=[1, 1, 3, 3], + deps=['left', 'ROOT', 'left', 'ROOT']) + parser.update([doc], [gold], sgd=sgd, losses=losses) + return parser + +def test_init_parser(parser): + pass + +# TODO: This is flakey, because it depends on what the parser first learns. 
+@pytest.mark.xfail +def test_add_label(parser): + doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = parser(doc) + assert doc[0].head.i == 1 + assert doc[0].dep_ == 'left' + assert doc[1].head.i == 1 + assert doc[2].head.i == 3 + assert doc[2].head.i == 3 + parser.add_label('right') + doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = parser(doc) + assert doc[0].head.i == 1 + assert doc[0].dep_ == 'left' + assert doc[1].head.i == 1 + assert doc[2].head.i == 3 + assert doc[2].head.i == 3 + sgd = Adam(NumpyOps(), 0.001) + for i in range(10): + losses = {} + doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + gold = GoldParse(doc, heads=[1, 1, 3, 3], + deps=['right', 'ROOT', 'left', 'ROOT']) + parser.update([doc], [gold], sgd=sgd, losses=losses) + doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = parser(doc) + assert doc[0].dep_ == 'right' + assert doc[2].dep_ == 'left' + diff --git a/spacy/tests/parser/test_beam_parse.py b/spacy/tests/parser/test_beam_parse.py index da5f43d5e..dd77c6805 100644 --- a/spacy/tests/parser/test_beam_parse.py +++ b/spacy/tests/parser/test_beam_parse.py @@ -1,10 +1,11 @@ -import spacy +# coding: utf8 +from __future__ import unicode_literals + import pytest -@pytest.mark.models -def test_beam_parse(): - nlp = spacy.load('en_core_web_sm') - doc = nlp(u'Australia is a country', disable=['ner']) - ents = nlp.entity(doc, beam_width=2) - print(ents) +@pytest.mark.models('en') +def test_beam_parse(EN): + doc = EN(u'Australia is a country', disable=['ner']) + ents = EN.entity(doc, beam_width=2) + print(ents) diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 30a6367c8..e85c61276 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -6,7 +6,7 @@ import numpy from ..._ml import chain, Tok2Vec, doc2feats from ...vocab import Vocab -from ...pipeline import TokenVectorEncoder +from ...pipeline import Tensorizer from ...syntax.arc_eager import ArcEager from ...syntax.nn_parser import Parser from ...tokens.doc import Doc @@ -26,7 +26,7 @@ def arc_eager(vocab): @pytest.fixture def tok2vec(): - return Tok2Vec(8, 100, preprocess=doc2feats()) + return Tok2Vec(8, 100) @pytest.fixture @@ -35,7 +35,8 @@ def parser(vocab, arc_eager): @pytest.fixture def model(arc_eager, tok2vec): - return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0] + return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO, + hist_size=0)[0] @pytest.fixture def doc(vocab): @@ -51,7 +52,7 @@ def test_can_init_nn_parser(parser): def test_build_model(parser): - parser.model = Parser.Model(parser.moves.n_moves)[0] + parser.model = Parser.Model(parser.moves.n_moves, hist_size=0)[0] assert parser.model is not None @@ -61,33 +62,22 @@ def test_predict_doc(parser, tok2vec, model, doc): parser(doc) -def test_update_doc(parser, tok2vec, model, doc, gold): +def test_update_doc(parser, model, doc, gold): parser.model = model - tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) - d_tokvecs = parser.update(([doc], tokvecs), [gold]) - assert d_tokvecs[0].shape == tokvecs[0].shape def optimize(weights, gradient, key=None): weights -= 0.001 * gradient - bp_tokvecs(d_tokvecs, sgd=optimize) - assert d_tokvecs[0].sum() == 0. 
+ parser.update([doc], [gold], sgd=optimize) -def test_predict_doc_beam(parser, tok2vec, model, doc): - doc.tensor = tok2vec([doc])[0] +def test_predict_doc_beam(parser, model, doc): parser.model = model parser(doc, beam_width=32, beam_density=0.001) - for word in doc: - print(word.text, word.head, word.dep_) -def test_update_doc_beam(parser, tok2vec, model, doc, gold): +def test_update_doc_beam(parser, model, doc, gold): parser.model = model - tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) - d_tokvecs = parser.update_beam(([doc], tokvecs), [gold]) - assert d_tokvecs[0].shape == tokvecs[0].shape def optimize(weights, gradient, key=None): weights -= 0.001 * gradient - bp_tokvecs(d_tokvecs, sgd=optimize) - assert d_tokvecs[0].sum() == 0. + parser.update_beam([doc], [gold], sgd=optimize) diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index 4d909f0d6..da59b0b59 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -57,9 +57,9 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) for head in doc: for child in head.lefts: - assert child.head is head + assert child.head == head for child in head.rights: - assert child.head is head + assert child.head == head def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py new file mode 100644 index 000000000..9b8c98735 --- /dev/null +++ b/spacy/tests/parser/test_preset_sbd.py @@ -0,0 +1,73 @@ +'''Test that the parser respects preset sentence boundaries.''' +from __future__ import unicode_literals +import pytest +from thinc.neural.optimizers import Adam +from thinc.neural.ops import NumpyOps + +from ...attrs import NORM +from ...gold import GoldParse +from ...vocab import Vocab +from ...tokens import Doc +from ...pipeline import DependencyParser + +@pytest.fixture +def vocab(): + return Vocab(lex_attr_getters={NORM: lambda s: s}) + +@pytest.fixture +def parser(vocab): + parser = DependencyParser(vocab) + parser.cfg['token_vector_width'] = 4 + parser.cfg['hidden_width'] = 32 + #parser.add_label('right') + parser.add_label('left') + parser.begin_training([], **parser.cfg) + sgd = Adam(NumpyOps(), 0.001) + + for i in range(10): + losses = {} + doc = Doc(vocab, words=['a', 'b', 'c', 'd']) + gold = GoldParse(doc, heads=[1, 1, 3, 3], + deps=['left', 'ROOT', 'left', 'ROOT']) + parser.update([doc], [gold], sgd=sgd, losses=losses) + return parser + +def test_no_sentences(parser): + doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = parser(doc) + assert len(list(doc.sents)) >= 1 + + +def test_sents_1(parser): + doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc[2].sent_start = True + doc = parser(doc) + assert len(list(doc.sents)) >= 2 + doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc[1].sent_start = False + doc[2].sent_start = True + doc[3].sent_start = False + doc = parser(doc) + assert len(list(doc.sents)) == 2 + + +def test_sents_1_2(parser): + doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc[1].sent_start = True + doc[2].sent_start = True + doc = parser(doc) + assert len(list(doc.sents)) == 3 + + +def test_sents_1_3(parser): + doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc[1].sent_start = True + doc[3].sent_start = True + doc = parser(doc) + assert len(list(doc.sents)) >= 3 + doc = Doc(parser.vocab, words=['a', 'b', 
'c', 'd']) + doc[1].sent_start = True + doc[2].sent_start = False + doc[3].sent_start = True + doc = parser(doc) + assert len(list(doc.sents)) == 3 diff --git a/spacy/tests/parser/test_to_from_bytes_disk.py b/spacy/tests/parser/test_to_from_bytes_disk.py index b0a10fa8e..48c412b7a 100644 --- a/spacy/tests/parser/test_to_from_bytes_disk.py +++ b/spacy/tests/parser/test_to_from_bytes_disk.py @@ -1,11 +1,11 @@ import pytest -from ...pipeline import NeuralDependencyParser +from ...pipeline import DependencyParser @pytest.fixture def parser(en_vocab): - parser = NeuralDependencyParser(en_vocab) + parser = DependencyParser(en_vocab) parser.add_label('nsubj') parser.model, cfg = parser.Model(parser.moves.n_moves) parser.cfg.update(cfg) @@ -14,7 +14,7 @@ def parser(en_vocab): @pytest.fixture def blank_parser(en_vocab): - parser = NeuralDependencyParser(en_vocab) + parser = DependencyParser(en_vocab) return parser diff --git a/spacy/tests/pipeline/__init__.py b/spacy/tests/pipeline/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py new file mode 100644 index 000000000..dbcde3e5e --- /dev/null +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -0,0 +1,102 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from ...language import Language + + +@pytest.fixture +def nlp(): + return Language() + + +def new_pipe(doc): + return doc + + +def test_add_pipe_no_name(nlp): + nlp.add_pipe(new_pipe) + assert 'new_pipe' in nlp.pipe_names + + +def test_add_pipe_duplicate_name(nlp): + nlp.add_pipe(new_pipe, name='duplicate_name') + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, name='duplicate_name') + + +@pytest.mark.parametrize('name', ['parser']) +def test_add_pipe_first(nlp, name): + nlp.add_pipe(new_pipe, name=name, first=True) + assert nlp.pipeline[0][0] == name + + +@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')]) +def test_add_pipe_last(nlp, name1, name2): + nlp.add_pipe(lambda doc: doc, name=name2) + nlp.add_pipe(new_pipe, name=name1, last=True) + assert nlp.pipeline[0][0] != name1 + assert nlp.pipeline[-1][0] == name1 + + +def test_cant_add_pipe_first_and_last(nlp): + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, first=True, last=True) + + +@pytest.mark.parametrize('name', ['my_component']) +def test_get_pipe(nlp, name): + with pytest.raises(KeyError): + nlp.get_pipe(name) + nlp.add_pipe(new_pipe, name=name) + assert nlp.get_pipe(name) == new_pipe + + +@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)]) +def test_replace_pipe(nlp, name, replacement): + with pytest.raises(ValueError): + nlp.replace_pipe(name, new_pipe) + nlp.add_pipe(new_pipe, name=name) + nlp.replace_pipe(name, replacement) + assert nlp.get_pipe(name) != new_pipe + assert nlp.get_pipe(name) == replacement + + +@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')]) +def test_rename_pipe(nlp, old_name, new_name): + with pytest.raises(ValueError): + nlp.rename_pipe(old_name, new_name) + nlp.add_pipe(new_pipe, name=old_name) + nlp.rename_pipe(old_name, new_name) + assert nlp.pipeline[0][0] == new_name + + +@pytest.mark.parametrize('name', ['my_component']) +def test_remove_pipe(nlp, name): + with pytest.raises(ValueError): + nlp.remove_pipe(name) + nlp.add_pipe(new_pipe, name=name) + assert len(nlp.pipeline) == 1 + removed_name, removed_component = nlp.remove_pipe(name) + assert not len(nlp.pipeline) + assert 
removed_name == name + assert removed_component == new_pipe + + +@pytest.mark.parametrize('name', ['my_component']) +def test_disable_pipes_method(nlp, name): + nlp.add_pipe(new_pipe, name=name) + assert nlp.has_pipe(name) + disabled = nlp.disable_pipes(name) + assert not nlp.has_pipe(name) + disabled.restore() + + +@pytest.mark.parametrize('name', ['my_component']) +def test_disable_pipes_context(nlp, name): + nlp.add_pipe(new_pipe, name=name) + assert nlp.has_pipe(name) + with nlp.disable_pipes(name): + assert not nlp.has_pipe(name) + assert nlp.has_pipe(name) diff --git a/spacy/tests/regression/test_issue1242.py b/spacy/tests/regression/test_issue1242.py new file mode 100644 index 000000000..50dc8c37e --- /dev/null +++ b/spacy/tests/regression/test_issue1242.py @@ -0,0 +1,23 @@ +from __future__ import unicode_literals +import pytest +from ...lang.en import English +from ...util import load_model + + +def test_issue1242_empty_strings(): + nlp = English() + doc = nlp('') + assert len(doc) == 0 + docs = list(nlp.pipe(['', 'hello'])) + assert len(docs[0]) == 0 + assert len(docs[1]) == 1 + + +@pytest.mark.models('en') +def test_issue1242_empty_strings_en_core_web_sm(): + nlp = load_model('en_core_web_sm') + doc = nlp('') + assert len(doc) == 0 + docs = list(nlp.pipe(['', 'hello'])) + assert len(docs[0]) == 0 + assert len(docs[1]) == 1 diff --git a/spacy/tests/regression/test_issue1250.py b/spacy/tests/regression/test_issue1250.py new file mode 100644 index 000000000..3b6e0bbf2 --- /dev/null +++ b/spacy/tests/regression/test_issue1250.py @@ -0,0 +1,13 @@ +from __future__ import unicode_literals +from ...tokenizer import Tokenizer +from ...symbols import ORTH, LEMMA, POS +from ...lang.en import English + +def test_issue1250_cached_special_cases(): + nlp = English() + nlp.tokenizer.add_special_case(u'reimbur', [{ORTH: u'reimbur', LEMMA: u'reimburse', POS: u'VERB'}]) + + lemmas = [w.lemma_ for w in nlp(u'reimbur, reimbur...')] + assert lemmas == ['reimburse', ',', 'reimburse', '...'] + lemmas = [w.lemma_ for w in nlp(u'reimbur, reimbur...')] + assert lemmas == ['reimburse', ',', 'reimburse', '...'] diff --git a/spacy/tests/regression/test_issue1253.py b/spacy/tests/regression/test_issue1253.py new file mode 100644 index 000000000..2fe77d6d8 --- /dev/null +++ b/spacy/tests/regression/test_issue1253.py @@ -0,0 +1,20 @@ +from __future__ import unicode_literals +import pytest +import spacy + + +def ss(tt): + for i in range(len(tt)-1): + for j in range(i+1, len(tt)): + tt[i:j].root + + +@pytest.mark.models('en') +def test_access_parse_for_merged(): + nlp = spacy.load('en_core_web_sm') + t_t = nlp.tokenizer("Highly rated - I'll definitely") + nlp.tagger(t_t) + nlp.parser(t_t) + nlp.parser(t_t) + ss(t_t) + diff --git a/spacy/tests/regression/test_issue1305.py b/spacy/tests/regression/test_issue1305.py index e123ce0ba..342cdd081 100644 --- a/spacy/tests/regression/test_issue1305.py +++ b/spacy/tests/regression/test_issue1305.py @@ -1,8 +1,10 @@ import pytest +import spacy @pytest.mark.models('en') -def test_issue1305(EN): +def test_issue1305(): '''Test lemmatization of English VBZ''' - assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work']) - doc = EN(u'This app works well') + nlp = spacy.load('en_core_web_sm') + assert nlp.vocab.morphology.lemmatizer('works', 'verb') == ['work'] + doc = nlp(u'This app works well') assert doc[2].lemma_ == 'work' diff --git a/spacy/tests/regression/test_issue1375.py b/spacy/tests/regression/test_issue1375.py new file mode 100644 index 000000000..6f74d9a6d 
--- /dev/null +++ b/spacy/tests/regression/test_issue1375.py @@ -0,0 +1,16 @@ +from __future__ import unicode_literals +import pytest +from ...vocab import Vocab +from ...tokens.doc import Doc + + +def test_issue1375(): + '''Test that token.nbor() raises IndexError for out-of-bounds access.''' + doc = Doc(Vocab(), words=['0', '1', '2']) + with pytest.raises(IndexError): + assert doc[0].nbor(-1) + assert doc[1].nbor(-1).text == '0' + with pytest.raises(IndexError): + assert doc[2].nbor(1) + assert doc[1].nbor(1).text == '2' + diff --git a/spacy/tests/regression/test_issue1380.py b/spacy/tests/regression/test_issue1380.py new file mode 100644 index 000000000..b2d610954 --- /dev/null +++ b/spacy/tests/regression/test_issue1380.py @@ -0,0 +1,14 @@ +from __future__ import unicode_literals +import pytest + +from ...language import Language + +def test_issue1380_empty_string(): + nlp = Language() + doc = nlp('') + assert len(doc) == 0 + +@pytest.mark.models('en') +def test_issue1380_en(EN): + doc = EN('') + assert len(doc) == 0 diff --git a/spacy/tests/regression/test_issue1387.py b/spacy/tests/regression/test_issue1387.py new file mode 100644 index 000000000..4bd0092d0 --- /dev/null +++ b/spacy/tests/regression/test_issue1387.py @@ -0,0 +1,22 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...symbols import POS, VERB, VerbForm_part +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer +from ..util import get_doc + +import pytest + + +def test_issue1387(): + tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} + index = {"verb": ("cope","cop")} + exc = {"verb": {"coping": ("cope",)}} + rules = {"verb": [["ing", ""]]} + lemmatizer = Lemmatizer(index, exc, rules) + vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + doc = get_doc(vocab, ["coping"]) + doc[0].tag_ = 'VBG' + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" diff --git a/spacy/tests/regression/test_issue1434.py b/spacy/tests/regression/test_issue1434.py new file mode 100644 index 000000000..fc88cc3e6 --- /dev/null +++ b/spacy/tests/regression/test_issue1434.py @@ -0,0 +1,22 @@ +from __future__ import unicode_literals + +from ...vocab import Vocab +from ...lang.lex_attrs import LEX_ATTRS +from ...tokens import Doc +from ...matcher import Matcher + + +def test_issue1434(): + '''Test matches occur when optional element at end of short doc''' + vocab = Vocab(lex_attr_getters=LEX_ATTRS) + hello_world = Doc(vocab, words=['Hello', 'World']) + hello = Doc(vocab, words=['Hello']) + + matcher = Matcher(vocab) + matcher.add('MyMatcher', None, + [ {'ORTH': 'Hello' }, {'IS_ALPHA': True, 'OP': '?'} ]) + + matches = matcher(hello_world) + assert matches + matches = matcher(hello) + assert matches diff --git a/spacy/tests/regression/test_issue1450.py b/spacy/tests/regression/test_issue1450.py new file mode 100644 index 000000000..6f1d4f568 --- /dev/null +++ b/spacy/tests/regression/test_issue1450.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals +import pytest + +from ...matcher import Matcher +from ...tokens import Doc +from ...vocab import Vocab + + +@pytest.mark.parametrize( + 'string,start,end', + [ + ('a', 0, 1), + ('a b', 0, 2), + ('a c', 0, 1), + ('a b c', 0, 2), + ('a b b c', 0, 2), + ('a b b', 0, 2), + ] +) +def test_issue1450_matcher_end_zero_plus(string, start, end): + '''Test matcher works when patterns end with * operator. 
+ + Original example (rewritten to avoid model usage) + + nlp = spacy.load('en_core_web_sm') + matcher = Matcher(nlp.vocab) + matcher.add( + "TSTEND", + on_match_1, + [ + {TAG: "JJ", LOWER: "new"}, + {TAG: "NN", 'OP': "*"} + ] + ) + doc = nlp(u'Could you create a new ticket for me?') + print([(w.tag_, w.text, w.lower_) for w in doc]) + matches = matcher(doc) + print(matches) + assert len(matches) == 1 + assert matches[0][1] == 4 + assert matches[0][2] == 5 + ''' + matcher = Matcher(Vocab()) + matcher.add( + "TSTEND", + None, + [ + {'ORTH': "a"}, + {'ORTH': "b", 'OP': "*"} + ] + ) + doc = Doc(Vocab(), words=string.split()) + matches = matcher(doc) + if start is None or end is None: + assert matches == [] + + assert matches[0][1] == start + assert matches[0][2] == end diff --git a/spacy/tests/regression/test_issue429.py b/spacy/tests/regression/test_issue429.py index 74f12bd9f..4804225ac 100644 --- a/spacy/tests/regression/test_issue429.py +++ b/spacy/tests/regression/test_issue429.py @@ -22,7 +22,6 @@ def test_issue429(EN): matcher = Matcher(EN.vocab) matcher.add('TEST', merge_phrases, [{'ORTH': 'a'}]) doc = EN.make_doc('a b c') - EN.tensorizer(doc) EN.tagger(doc) matcher(doc) EN.entity(doc) diff --git a/spacy/tests/regression/test_issue589.py b/spacy/tests/regression/test_issue589.py index 27363739d..96ea4be61 100644 --- a/spacy/tests/regression/test_issue589.py +++ b/spacy/tests/regression/test_issue589.py @@ -7,6 +7,7 @@ from ..util import get_doc import pytest +@pytest.mark.xfail def test_issue589(): vocab = Vocab() vocab.strings.set_frozen(True) diff --git a/spacy/tests/regression/test_issue781.py b/spacy/tests/regression/test_issue781.py index e3f391a37..2c77e68cd 100644 --- a/spacy/tests/regression/test_issue781.py +++ b/spacy/tests/regression/test_issue781.py @@ -9,4 +9,4 @@ import pytest @pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])]) def test_issue781(EN, word, lemmas): lemmatizer = EN.Defaults.create_lemmatizer() - assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas) + assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == lemmas diff --git a/spacy/tests/serialize/test_serialize_empty_model.py b/spacy/tests/serialize/test_serialize_empty_model.py new file mode 100644 index 000000000..b614a3648 --- /dev/null +++ b/spacy/tests/serialize/test_serialize_empty_model.py @@ -0,0 +1,9 @@ +import spacy +import spacy.lang.en +from spacy.pipeline import TextCategorizer + +def test_bytes_serialize_issue_1105(): + nlp = spacy.lang.en.English() + tokenizer = nlp.tokenizer + textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER']) + textcat_bytes = textcat.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_extension_attrs.py b/spacy/tests/serialize/test_serialize_extension_attrs.py new file mode 100644 index 000000000..8919ebe1e --- /dev/null +++ b/spacy/tests/serialize/test_serialize_extension_attrs.py @@ -0,0 +1,27 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...tokens.doc import Doc +from ...vocab import Vocab + + +@pytest.fixture +def doc_w_attrs(en_tokenizer): + Doc.set_extension('_test_attr', default=False) + Doc.set_extension('_test_prop', getter=lambda doc: len(doc.text)) + Doc.set_extension('_test_method', method=lambda doc, arg: "{}{}".format(len(doc.text), arg)) + doc = en_tokenizer("This is a test.") + doc._._test_attr = 'test' + return doc + + + +def 
test_serialize_ext_attrs_from_bytes(doc_w_attrs): + doc_b = doc_w_attrs.to_bytes() + doc = Doc(Vocab()).from_bytes(doc_b) + assert doc._.has('_test_attr') + assert doc._._test_attr == 'test' + assert doc._._test_prop == len(doc.text) + assert doc._._test_method('test') == '{}{}'.format(len(doc.text), 'test') diff --git a/spacy/tests/serialize/test_serialize_parser_ner.py b/spacy/tests/serialize/test_serialize_parser_ner.py index ae9e23e9a..cbe97b716 100644 --- a/spacy/tests/serialize/test_serialize_parser_ner.py +++ b/spacy/tests/serialize/test_serialize_parser_ner.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals from ..util import make_tempdir -from ...pipeline import NeuralDependencyParser as DependencyParser -from ...pipeline import NeuralEntityRecognizer as EntityRecognizer +from ...pipeline import DependencyParser +from ...pipeline import EntityRecognizer import pytest diff --git a/spacy/tests/serialize/test_serialize_tagger.py b/spacy/tests/serialize/test_serialize_tagger.py index 3154687c3..7b7dedae0 100644 --- a/spacy/tests/serialize/test_serialize_tagger.py +++ b/spacy/tests/serialize/test_serialize_tagger.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ..util import make_tempdir -from ...pipeline import NeuralTagger as Tagger +from ...pipeline import Tagger import pytest @@ -11,7 +11,7 @@ import pytest def taggers(en_vocab): tagger1 = Tagger(en_vocab) tagger2 = Tagger(en_vocab) - tagger1.model = tagger1.Model(8, 8) + tagger1.model = tagger1.Model(8) tagger2.model = tagger1.model return (tagger1, tagger2) diff --git a/spacy/tests/serialize/test_serialize_tensorizer.py b/spacy/tests/serialize/test_serialize_tensorizer.py index ba01a2fa6..bc751a686 100644 --- a/spacy/tests/serialize/test_serialize_tensorizer.py +++ b/spacy/tests/serialize/test_serialize_tensorizer.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ..util import make_tempdir -from ...pipeline import TokenVectorEncoder as Tensorizer +from ...pipeline import Tensorizer import pytest diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index 7ed9333b8..4050809b5 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -55,6 +55,17 @@ def test_spans_span_sent(doc): assert doc[6:7].sent.root.left_edge.text == 'This' +def test_spans_lca_matrix(en_tokenizer): + """Test span's lca matrix generation""" + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc[:2].get_lca_matrix() + assert(lca[0, 0] == 0) + assert(lca[0, 1] == -1) + assert(lca[1, 0] == -1) + assert(lca[1, 1] == 1) + + def test_spans_default_sentiment(en_tokenizer): """Test span.sentiment property's default averaging behaviour""" text = "good stuff bad stuff" @@ -89,7 +100,7 @@ def test_spans_are_hashable(en_tokenizer): assert hash(span1) != hash(span2) span3 = tokens[0:2] assert hash(span3) == hash(span1) - + def test_spans_by_character(doc): span1 = doc[1:-2] @@ -106,3 +117,9 @@ def test_span_to_array(doc): assert arr[0, 0] == span[0].orth assert arr[0, 1] == len(span[0]) + +@pytest.mark.xfail +def test_span_as_doc(doc): + span = doc[4:10] + span_doc = span.as_doc() + assert span.text == span_doc.text diff --git a/spacy/tests/stringstore/test_stringstore.py b/spacy/tests/stringstore/test_stringstore.py index 65b994606..3f2992a6f 100644 --- a/spacy/tests/stringstore/test_stringstore.py +++ b/spacy/tests/stringstore/test_stringstore.py @@ -6,6 +6,16 @@ from ...strings import StringStore import 
pytest +def test_string_hash(stringstore): + '''Test that string hashing is stable across platforms''' + ss = stringstore + assert ss.add('apple') == 8566208034543834098 + heart = '\U0001f499' + print(heart) + h = ss.add(heart) + assert h == 11841826740069053588 + + def test_stringstore_from_api_docs(stringstore): apple_hash = stringstore.add('apple') assert apple_hash == 8566208034543834098 diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 388aab03e..8210467ea 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from ..matcher import Matcher, PhraseMatcher from .util import get_doc +from ..tokens import Doc import pytest @@ -34,7 +35,6 @@ def test_matcher_from_api_docs(en_vocab): assert len(patterns[0]) -@pytest.mark.xfail def test_matcher_from_usage_docs(en_vocab): text = "Wow 😀 This is really cool! 😂 😂" doc = get_doc(en_vocab, words=text.split(' ')) @@ -46,7 +46,8 @@ def test_matcher_from_usage_docs(en_vocab): if doc.vocab.strings[match_id] == 'HAPPY': doc.sentiment += 0.1 span = doc[start : end] - token = span.merge(norm='happy emoji') + token = span.merge() + token.vocab[token.text].norm_ = 'happy emoji' matcher = Matcher(en_vocab) matcher.add('HAPPY', label_sentiment, *pos_patterns) @@ -63,6 +64,12 @@ def test_matcher_init(en_vocab, words): assert matcher(doc) == [] +def test_matcher_contains(matcher): + matcher.add('TEST', None, [{'ORTH': 'test'}]) + assert 'TEST' in matcher + assert 'TEST2' not in matcher + + def test_matcher_no_match(matcher): words = ["I", "like", "cheese", "."] doc = get_doc(matcher.vocab, words) @@ -98,16 +105,59 @@ def test_matcher_match_multi(matcher): (doc.vocab.strings['Java'], 5, 6)] -@pytest.mark.xfail +def test_matcher_empty_dict(en_vocab): + '''Test matcher allows empty token specs, meaning match on any token.''' + matcher = Matcher(en_vocab) + abc = ["a", "b", "c"] + doc = get_doc(matcher.vocab, abc) + matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}]) + matches = matcher(doc) + assert len(matches) == 1 + assert matches[0][1:] == (0, 3) + matcher = Matcher(en_vocab) + matcher.add('A.', None, [{'ORTH': 'a'}, {}]) + matches = matcher(doc) + assert matches[0][1:] == (0, 2) + + +def test_matcher_operator_shadow(en_vocab): + matcher = Matcher(en_vocab) + abc = ["a", "b", "c"] + doc = get_doc(matcher.vocab, abc) + matcher.add('A.C', None, [{'ORTH': 'a'}, + {"IS_ALPHA": True, "OP": "+"}, + {'ORTH': 'c'}]) + matches = matcher(doc) + assert len(matches) == 1 + assert matches[0][1:] == (0, 3) + + def test_matcher_phrase_matcher(en_vocab): words = ["Google", "Now"] doc = get_doc(en_vocab, words) - matcher = PhraseMatcher(en_vocab, [doc]) + matcher = PhraseMatcher(en_vocab) + matcher.add('COMPANY', None, doc) words = ["I", "like", "Google", "Now", "best"] doc = get_doc(en_vocab, words) assert len(matcher(doc)) == 1 +def test_phrase_matcher_length(en_vocab): + matcher = PhraseMatcher(en_vocab) + assert len(matcher) == 0 + matcher.add('TEST', None, get_doc(en_vocab, ['test'])) + assert len(matcher) == 1 + matcher.add('TEST2', None, get_doc(en_vocab, ['test2'])) + assert len(matcher) == 2 + + +def test_phrase_matcher_contains(en_vocab): + matcher = PhraseMatcher(en_vocab) + matcher.add('TEST', None, get_doc(en_vocab, ['test'])) + assert 'TEST' in matcher + assert 'TEST2' not in matcher + + def test_matcher_match_zero(matcher): words1 = 'He said , " some words " ...'.split() words2 = 'He said , " some three words " ...'.split() @@ -151,3 +201,60 @@ def 
test_matcher_match_one_plus(matcher): {'ORTH': 'Philippe', 'OP': '+'}]) m = matcher(doc) assert len(m) == 1 + + +def test_operator_combos(matcher): + cases = [ + ('aaab', 'a a a b', True), + ('aaab', 'a+ b', True), + ('aaab', 'a+ a+ b', True), + ('aaab', 'a+ a+ a b', True), + ('aaab', 'a+ a+ a+ b', True), + ('aaab', 'a+ a a b', True), + ('aaab', 'a+ a a', True), + ('aaab', 'a+', True), + ('aaa', 'a+ b', False), + ('aaa', 'a+ a+ b', False), + ('aaa', 'a+ a+ a+ b', False), + ('aaa', 'a+ a b', False), + ('aaa', 'a+ a a b', False), + ('aaab', 'a+ a a', True), + ('aaab', 'a+', True), + ('aaab', 'a+ a b', True), + ] + for string, pattern_str, result in cases: + matcher = Matcher(matcher.vocab) + doc = get_doc(matcher.vocab, words=list(string)) + pattern = [] + for part in pattern_str.split(): + if part.endswith('+'): + pattern.append({'ORTH': part[0], 'op': '+'}) + else: + pattern.append({'ORTH': part}) + matcher.add('PATTERN', None, pattern) + matches = matcher(doc) + if result: + assert matches, (string, pattern_str) + else: + assert not matches, (string, pattern_str) + + +def test_matcher_end_zero_plus(matcher): + '''Test matcher works when patterns end with * operator. (issue 1450)''' + matcher = Matcher(matcher.vocab) + matcher.add( + "TSTEND", + None, + [ + {'ORTH': "a"}, + {'ORTH': "b", 'OP': "*"} + ] + ) + nlp = lambda string: Doc(matcher.vocab, words=string.split()) + assert len(matcher(nlp(u'a'))) == 1 + assert len(matcher(nlp(u'a b'))) == 1 + assert len(matcher(nlp(u'a b'))) == 1 + assert len(matcher(nlp(u'a c'))) == 1 + assert len(matcher(nlp(u'a b c'))) == 1 + assert len(matcher(nlp(u'a b b c'))) == 1 + assert len(matcher(nlp(u'a b b'))) == 1 diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 80b859c70..762ea4c08 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -9,7 +9,8 @@ from .util import get_doc from pathlib import Path import pytest -from thinc.neural import Maxout, Softmax +from thinc.neural._classes.maxout import Maxout +from thinc.neural._classes.softmax import Softmax from thinc.api import chain diff --git a/spacy/tests/test_underscore.py b/spacy/tests/test_underscore.py new file mode 100644 index 000000000..c7df57b62 --- /dev/null +++ b/spacy/tests/test_underscore.py @@ -0,0 +1,53 @@ +from mock import Mock +from ..tokens.underscore import Underscore + + +def test_create_doc_underscore(): + doc = Mock() + doc.doc = doc + uscore = Underscore(Underscore.doc_extensions, doc) + assert uscore._doc is doc + assert uscore._start is None + assert uscore._end is None + + +def test_doc_underscore_getattr_setattr(): + doc = Mock() + doc.doc = doc + doc.user_data = {} + Underscore.doc_extensions['hello'] = (False, None, None, None) + doc._ = Underscore(Underscore.doc_extensions, doc) + assert doc._.hello == False + doc._.hello = True + assert doc._.hello == True + + +def test_create_span_underscore(): + span = Mock(doc=Mock(), start=0, end=2) + uscore = Underscore(Underscore.span_extensions, span, + start=span.start, end=span.end) + assert uscore._doc is span.doc + assert uscore._start is span.start + assert uscore._end is span.end + + +def test_span_underscore_getter_setter(): + span = Mock(doc=Mock(), start=0, end=2) + Underscore.span_extensions['hello'] = (None, None, + lambda s: (s.start, 'hi'), + lambda s, value: setattr(s, 'start', + value)) + span._ = Underscore(Underscore.span_extensions, span, + start=span.start, end=span.end) + + assert span._.hello == (0, 'hi') + span._.hello = 1 + assert span._.hello == (1, 'hi') + + +def 
test_token_underscore_method(): + token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese') + Underscore.token_extensions['hello'] = (None, token.say_cheese, + None, None) + token._ = Underscore(Underscore.token_extensions, token, start=token.idx) + assert token._.hello() == 'cheese' diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index 57281b998..132f27433 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import sys import pytest @@ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length - @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), ('i💙you', 3), ('🤘🤘yay!', 4)]) def test_tokenizer_handles_emoji(tokenizer, text, length): - tokens = tokenizer(text) - assert len(tokens) == length + # These break on narrow unicode builds, e.g. Windows + if sys.maxunicode >= 1114111: + tokens = tokenizer(text) + assert len(tokens) == length diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 959067110..3bb6521f1 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -33,13 +33,10 @@ URLS_SHOULD_MATCH = [ "http://userid:password@example.com/", "http://142.42.1.1/", "http://142.42.1.1:8080/", - "http://⌘.ws", - "http://⌘.ws/", "http://foo.com/blah_(wikipedia)#cite-1", "http://foo.com/blah_(wikipedia)_blah#cite-1", "http://foo.com/unicode_(✪)_in_parens", "http://foo.com/(something)?after=parens", - "http://☺.damowmow.com/", "http://code.google.com/events/#&product=browser", "http://j.mp", "ftp://foo.bar/baz", @@ -49,14 +46,17 @@ URLS_SHOULD_MATCH = [ "http://a.b-c.de", "http://223.255.255.254", "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 - "http://✪df.ws/123", - "http://➡.ws/䨹", - "http://مثال.إختبار", - "http://例子.测试", - "http://उदाहरण.परीक्षा", pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"), pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"), + pytest.mark.xfail("http://⌘.ws"), + pytest.mark.xfail("http://⌘.ws/"), + pytest.mark.xfail("http://☺.damowmow.com/"), + pytest.mark.xfail("http://✪df.ws/123"), + pytest.mark.xfail("http://➡.ws/䨹"), + pytest.mark.xfail("http://مثال.إختبار"), + pytest.mark.xfail("http://例子.测试"), + pytest.mark.xfail("http://उदाहरण.परीक्षा"), ] URLS_SHOULD_NOT_MATCH = [ @@ -83,7 +83,6 @@ URLS_SHOULD_NOT_MATCH = [ "http://foo.bar/foo(bar)baz quux", "ftps://foo.bar/", "http://-error-.invalid/", - "http://-a.b.co", "http://a.b-.co", "http://0.0.0.0", "http://10.1.1.0", @@ -99,6 +98,7 @@ URLS_SHOULD_NOT_MATCH = [ pytest.mark.xfail("foo.com"), pytest.mark.xfail("http://1.1.1.1.1"), pytest.mark.xfail("http://www.foo.bar./"), + pytest.mark.xfail("http://-a.b.co"), ] diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py index 798871edd..74ac26a10 100644 --- a/spacy/tests/vectors/test_vectors.py +++ b/spacy/tests/vectors/test_vectors.py @@ -35,18 +35,18 @@ def vocab(en_vocab, vectors): def test_init_vectors_with_data(strings, data): - v = Vectors(strings, data) + v = Vectors(strings, data=data) assert v.shape == data.shape def test_init_vectors_with_width(strings): - v = Vectors(strings, 3) + v = Vectors(strings, width=3) for string in strings: v.add(string) assert v.shape == (len(strings), 3) def 
test_get_vector(strings, data): - v = Vectors(strings, data) + v = Vectors(strings, data=data) for string in strings: v.add(string) assert list(v[strings[0]]) == list(data[0]) @@ -56,7 +56,7 @@ def test_get_vector(strings, data): def test_set_vector(strings, data): orig = data.copy() - v = Vectors(strings, data) + v = Vectors(strings, data=data) for string in strings: v.add(string) assert list(v[strings[0]]) == list(orig[0]) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 1a3e86b49..919b0928b 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -27,8 +27,9 @@ cdef class Tokenizer: cdef int _try_cache(self, hash_t key, Doc tokens) except -1 cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, - vector[LexemeC*] *suffixes) + vector[LexemeC*] *suffixes, int* has_special) cdef int _attach_tokens(self, Doc tokens, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1 - cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1 + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special, + int n) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index de184baba..ef31a5d5c 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -8,19 +8,19 @@ from cython.operator cimport preincrement as preinc from cymem.cymem cimport Pool from preshed.maps cimport PreshMap import regex as re - -from .strings cimport hash_string -from . import util cimport cython from .tokens.doc cimport Doc +from .strings cimport hash_string +from . import util cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries. """ - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): + def __init__(self, Vocab vocab, rules=None, prefix_search=None, + suffix_search=None, infix_finditer=None, token_match=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -48,8 +48,9 @@ cdef class Tokenizer: self.infix_finditer = infix_finditer self.vocab = vocab self._rules = {} - for chunk, substrings in sorted(rules.items()): - self.add_special_case(chunk, substrings) + if rules is not None: + for chunk, substrings in sorted(rules.items()): + self.add_special_case(chunk, substrings) def __reduce__(self): args = (self.vocab, @@ -61,11 +62,8 @@ cdef class Tokenizer: return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): + # TODO: deprecation warning return Doc(self.vocab, words=strings) - #raise NotImplementedError( - # "Method deprecated in 1.0.\n" - # "Old: tokenizer.tokens_from_list(strings)\n" - # "New: Doc(tokenizer.vocab, words=strings)") @cython.boundscheck(False) def __call__(self, unicode string): @@ -75,13 +73,12 @@ cdef class Tokenizer: RETURNS (Doc): A container for linguistic annotations. """ if len(string) >= (2 ** 30): - raise ValueError( - "String is too long: %d characters. Max is 2**30." % len(string) - ) + msg = "String is too long: %d characters. Max is 2**30." + raise ValueError(msg % len(string)) cdef int length = len(string) - cdef Doc tokens = Doc(self.vocab) + cdef Doc doc = Doc(self.vocab) if length == 0: - return tokens + return doc cdef int i = 0 cdef int start = 0 cdef bint cache_hit @@ -100,11 +97,11 @@ cdef class Tokenizer: # we don't have to create the slice when we hit the cache. 
span = string[start:i] key = hash_string(span) - cache_hit = self._try_cache(key, tokens) + cache_hit = self._try_cache(key, doc) if not cache_hit: - self._tokenize(tokens, span, key) + self._tokenize(doc, span, key) if uc == ' ': - tokens.c[tokens.length - 1].spacy = True + doc.c[doc.length - 1].spacy = True start = i + 1 else: start = i @@ -113,18 +110,18 @@ cdef class Tokenizer: if start < i: span = string[start:] key = hash_string(span) - cache_hit = self._try_cache(key, tokens) + cache_hit = self._try_cache(key, doc) if not cache_hit: - self._tokenize(tokens, span, key) - tokens.c[tokens.length - 1].spacy = string[-1] == ' ' and not in_ws - return tokens + self._tokenize(doc, span, key) + doc.c[doc.length - 1].spacy = string[-1] == ' ' and not in_ws + return doc def pipe(self, texts, batch_size=1000, n_threads=2): """Tokenize a stream of texts. texts: A sequence of unicode texts. - batch_size (int): The number of texts to accumulate in an internal buffer. - n_threads (int): The number of threads to use, if the implementation + batch_size (int): Number of texts to accumulate in an internal buffer. + n_threads (int): Number of threads to use, if the implementation supports multi-threading. The default tokenizer is single-threaded. YIELDS (Doc): A sequence of Doc objects, in order. """ @@ -148,14 +145,18 @@ cdef class Tokenizer: cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] suffixes cdef int orig_size + cdef int has_special orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes, + &has_special) self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + self._save_cached(&tokens.c[orig_size], orig_key, has_special, + tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, - vector[const LexemeC*] *suffixes): + vector[const LexemeC*] *suffixes, + int* has_special): cdef size_t i cdef unicode prefix cdef unicode suffix @@ -174,6 +175,7 @@ cdef class Tokenizer: if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL: string = minus_pre prefixes.push_back(self.vocab.get(mem, prefix)) + has_special[0] = 1 break if self.token_match and self.token_match(string): break @@ -185,6 +187,7 @@ cdef class Tokenizer: if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL): string = minus_suf suffixes.push_back(self.vocab.get(mem, suffix)) + has_special[0] = 1 break if pre_len and suf_len and (pre_len + suf_len) <= len(string): string = string[pre_len:-suf_len] @@ -197,6 +200,7 @@ cdef class Tokenizer: string = minus_suf suffixes.push_back(self.vocab.get(mem, suffix)) if string and (self._specials.get(hash_string(string)) != NULL): + has_special[0] = 1 break return string @@ -226,8 +230,8 @@ cdef class Tokenizer: if not matches: tokens.push_back(self.vocab.get(tokens.mem, string), False) else: - # let's say we have dyn-o-mite-dave - # the regex finds the start and end positions of the hyphens + # let's say we have dyn-o-mite-dave - the regex finds the + # start and end positions of the hyphens start = 0 for match in matches: infix_start = match.start() @@ -248,18 +252,23 @@ cdef class Tokenizer: start = infix_end span = string[start:] - tokens.push_back(self.vocab.get(tokens.mem, span), False) + if span: + tokens.push_back(self.vocab.get(tokens.mem, span), False) cdef vector[const LexemeC*].reverse_iterator it = 
suffixes.rbegin() while it != suffixes.rend(): lexeme = deref(it) preinc(it) tokens.push_back(lexeme, False) - cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1: + cdef int _save_cached(self, const TokenC* tokens, hash_t key, + int has_special, int n) except -1: cdef int i for i in range(n): if tokens[i].lex.id == 0: return 0 + # See https://github.com/explosion/spaCy/issues/1250 + if has_special: + return 0 cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = n cached.is_lex = True @@ -282,8 +291,8 @@ cdef class Tokenizer: return list(self.infix_finditer(string)) def find_prefix(self, unicode string): - """Find the length of a prefix that should be segmented from the string, - or None if no prefix rules match. + """Find the length of a prefix that should be segmented from the + string, or None if no prefix rules match. string (unicode): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`. @@ -294,8 +303,8 @@ cdef class Tokenizer: return (match.end() - match.start()) if match is not None else 0 def find_suffix(self, unicode string): - """Find the length of a suffix that should be segmented from the string, - or None if no suffix rules match. + """Find the length of a suffix that should be segmented from the + string, or None if no suffix rules match. string (unicode): The string to segment. Returns (int): The length of the suffix if present, otherwise `None`. @@ -315,8 +324,8 @@ cdef class Tokenizer: string (unicode): The string to specially tokenize. token_attrs (iterable): A sequence of dicts, where each dict describes - a token and its attributes. The `ORTH` fields of the attributes must - exactly match the string when they are concatenated. + a token and its attributes. The `ORTH` fields of the attributes + must exactly match the string when they are concatenated. """ substrings = list(substrings) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) @@ -332,7 +341,7 @@ cdef class Tokenizer: """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be either strings or Path-like objects. 
""" with path.open('wb') as file_: file_.write(self.to_bytes(**exclude)) diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index bc3794126..b4815abd2 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -2,4 +2,4 @@ from .doc import Doc from .token import Token from .span import Span -__all__ = [Doc, Token, Span] +__all__ = ['Doc', 'Token', 'Span'] diff --git a/spacy/tokens/binder.pyx b/spacy/tokens/binder.pyx deleted file mode 100644 index 0ee168579..000000000 --- a/spacy/tokens/binder.pyx +++ /dev/null @@ -1,21 +0,0 @@ -cdef class Binder: - def __init__(self, *docs): - pass - - def __iter__(self): - pass - - def __reduce__(self): - pass - - def to_bytes(self): - pass - - def from_bytes(cls, data): - pass - - def to_disk(self): - pass - - def from_disk(self, path): - pass diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index d0c83e0f8..f34c455c6 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -54,7 +54,9 @@ cdef class Doc: cdef public object noun_chunks_iterator - cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1 + cdef object __weakref__ + + cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1 cpdef np.ndarray to_array(self, object features) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3b1f38b68..7a2e95e4b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -9,6 +9,7 @@ import numpy import numpy.linalg import struct import dill +import msgpack from libc.string cimport memcpy, memset from libc.math cimport sqrt @@ -20,17 +21,17 @@ from .token cimport Token from .printers import parse_tree from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t -from ..attrs import intify_attrs +from ..attrs import intify_attrs, IDS from ..attrs cimport attr_id_t -from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER -from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE -from ..attrs cimport SENT_START +from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER +from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB +from ..attrs cimport ENT_TYPE, SENT_START from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..util import normalize_slice -from ..compat import is_config +from ..compat import is_config, copy_reg, pickle from .. import about from .. import util - +from .underscore import Underscore DEF PADDING = 5 @@ -64,6 +65,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: else: return Lexeme.get_struct_attr(token.lex, feat_name) + def _get_chunker(lang): try: cls = util.get_lang_class(lang) @@ -73,30 +75,49 @@ def _get_chunker(lang): return None return cls.Defaults.syntax_iterators.get(u'noun_chunks') + cdef class Doc: """A sequence of Token objects. Access sentences and named entities, export - annotations to numpy arrays, losslessly serialize to compressed binary strings. - The `Doc` object holds an array of `TokenC` structs. The Python-level - `Token` and `Span` objects are views of this array, i.e. they don't own - the data themselves. + annotations to numpy arrays, losslessly serialize to compressed binary + strings. The `Doc` object holds an array of `TokenC` structs. The + Python-level `Token` and `Span` objects are views of this array, i.e. + they don't own the data themselves. 
EXAMPLE: Construction 1 >>> doc = nlp(u'Some text') Construction 2 >>> from spacy.tokens import Doc - >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) + >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], + spaces=[True, False, False]) """ - def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): + @classmethod + def set_extension(cls, name, default=None, method=None, + getter=None, setter=None): + nr_defined = sum(t is not None for t in (default, getter, setter, method)) + assert nr_defined == 1 + Underscore.doc_extensions[name] = (default, method, getter, setter) + + @classmethod + def get_extension(cls, name): + return Underscore.doc_extensions.get(name) + + @classmethod + def has_extension(cls, name): + return name in Underscore.doc_extensions + + def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None, + orths_and_spaces=None): """Create a Doc object. - vocab (Vocab): A vocabulary object, which must match any models you want - to use (e.g. tokenizer, parser, entity recognizer). + vocab (Vocab): A vocabulary object, which must match any models you + want to use (e.g. tokenizer, parser, entity recognizer). words (list or None): A list of unicode strings to add to the document as words. If `None`, defaults to empty list. spaces (list or None): A list of boolean values, of the same length as words. True means that the word is followed by a space, False means it is not. If `None`, defaults to `[True]*len(words)` + user_data (dict or None): Optional extra data to attach to the Doc. RETURNS (Doc): The newly constructed object. """ self.vocab = vocab @@ -122,8 +143,7 @@ cdef class Doc: self.user_token_hooks = {} self.user_span_hooks = {} self.tensor = numpy.zeros((0,), dtype='float32') - self.user_data = {} - self._py_tokens = [] + self.user_data = {} if user_data is None else user_data self._vector = None self.noun_chunks_iterator = _get_chunker(self.vocab.lang) cdef unicode orth @@ -133,10 +153,10 @@ cdef class Doc: spaces = [True] * len(words) elif len(spaces) != len(words): raise ValueError( - "Arguments 'words' and 'spaces' should be sequences of the " - "same length, or 'spaces' should be left default at None. " - "spaces should be a sequence of booleans, with True meaning " - "that the word owns a ' ' character following it.") + "Arguments 'words' and 'spaces' should be sequences of " + "the same length, or 'spaces' should be left default at " + "None. spaces should be a sequence of booleans, with True " + "meaning that the word owns a ' ' character following it.") orths_and_spaces = zip(words, spaces) if orths_and_spaces is not None: for orth_space in orths_and_spaces: @@ -146,7 +166,8 @@ cdef class Doc: elif isinstance(orth_space, bytes): raise ValueError( "orths_and_spaces expects either List(unicode) or " - "List((unicode, bool)). Got bytes instance: %s" % (str(orth_space))) + "List((unicode, bool)). " + "Got bytes instance: %s" % (str(orth_space))) else: orth, has_space = orth_space # Note that we pass self.mem here --- we have ownership, if LexemeC @@ -159,10 +180,15 @@ cdef class Doc: self.is_tagged = True self.is_parsed = True + @property + def _(self): + return Underscore(Underscore.doc_extensions, self) + def __getitem__(self, object i): """Get a `Token` or `Span` object. - i (int or tuple) The index of the token, or the slice of the document to get. + i (int or tuple) The index of the token, or the slice of the document + to get. 
RETURNS (Token or Span): The token at `doc[i]]`, or the span at `doc[start : end]`. @@ -175,11 +201,11 @@ cdef class Doc: >>> doc[start : end]] Get a `Span` object, starting at position `start` and ending at position `end`, where `start` and `end` are token indices. For - instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4. - Stepped slices (e.g. `doc[start : end : step]`) are not supported, - as `Span` objects must be contiguous (cannot have gaps). You can use - negative indices and open-ended ranges, which have their normal - Python semantics. + instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and + 4. Stepped slices (e.g. `doc[start : end : step]`) are not + supported, as `Span` objects must be contiguous (cannot have gaps). + You can use negative indices and open-ended ranges, which have + their normal Python semantics. """ if isinstance(i, slice): start, stop = normalize_slice(len(self), i.start, i.stop, i.step) @@ -188,10 +214,7 @@ cdef class Doc: if i < 0: i = self.length + i bounds_check(i, self.length, PADDING) - if self._py_tokens[i] is not None: - return self._py_tokens[i] - else: - return Token.cinit(self.vocab, &self.c[i], i, self) + return Token.cinit(self.vocab, &self.c[i], i, self) def __iter__(self): """Iterate over `Token` objects, from which the annotations can be @@ -205,10 +228,7 @@ cdef class Doc: """ cdef int i for i in range(self.length): - if self._py_tokens[i] is not None: - yield self._py_tokens[i] - else: - yield Token.cinit(self.vocab, &self.c[i], i, self) + yield Token.cinit(self.vocab, &self.c[i], i, self) def __len__(self): """The number of tokens in the document. @@ -244,8 +264,10 @@ cdef class Doc: doc (Doc): The parent document. start (int): The index of the first character of the span. end (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for named entities. - vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + label (uint64 or string): A label to attach to the Span, e.g. for + named entities. + vector (ndarray[ndim=1, dtype='float32']): A meaning representation of + the span. RETURNS (Span): The newly constructed object. """ if not isinstance(label, int): @@ -304,7 +326,8 @@ cdef class Doc: if self._vector is not None: return self._vector elif not len(self): - self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') + self._vector = numpy.zeros((self.vocab.vectors_length,), + dtype='f') return self._vector elif self.has_vector: vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') @@ -316,7 +339,8 @@ cdef class Doc: self._vector = self.tensor.mean(axis=0) return self._vector else: - return numpy.zeros((self.vocab.vectors_length,), dtype='float32') + return numpy.zeros((self.vocab.vectors_length,), + dtype='float32') def __set__(self, value): self._vector = value @@ -359,13 +383,14 @@ cdef class Doc: return self.text property ents: - """Iterate over the entities in the document. Yields named-entity `Span` - objects, if the entity recognizer has been applied to the document. + """Iterate over the entities in the document. Yields named-entity + `Span` objects, if the entity recognizer has been applied to the + document. YIELDS (Span): Entities in the document. - EXAMPLE: Iterate over the span to get individual Token objects, or access - the label: + EXAMPLE: Iterate over the span to get individual Token objects, + or access the label: >>> tokens = nlp(u'Mr. 
Best flew to New York on Saturday morning.') >>> ents = list(tokens.ents) @@ -401,7 +426,8 @@ cdef class Doc: def __set__(self, ents): # TODO: # 1. Allow negative matches - # 2. Ensure pre-set NERs are not over-written during statistical prediction + # 2. Ensure pre-set NERs are not over-written during statistical + # prediction # 3. Test basic data-driven ORTH gazetteer # 4. Test more nuanced date and currency regex cdef int i @@ -410,7 +436,7 @@ cdef class Doc: # At this point we don't know whether the NER has run over the # Doc. If the ent_iob is missing, leave it missing. if self.c[i].ent_iob != 0: - self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. + self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. cdef attr_t ent_type cdef int start, end for ent_info in ents: @@ -438,10 +464,11 @@ cdef class Doc: property noun_chunks: """Iterate over the base noun phrases in the document. Yields base - noun-phrase #[code Span] objects, if the document has been syntactically - parsed. A base noun phrase, or "NP chunk", is a noun phrase that does - not permit other NPs to be nested within it – so no NP-level - coordination, no prepositional phrases, and no relative clauses. + noun-phrase #[code Span] objects, if the document has been + syntactically parsed. A base noun phrase, or "NP chunk", is a noun + phrase that does not permit other NPs to be nested within it – so no + NP-level coordination, no prepositional phrases, and no relative + clauses. YIELDS (Span): Noun chunks in the document. """ @@ -449,12 +476,14 @@ cdef class Doc: if not self.is_parsed: raise ValueError( "noun_chunks requires the dependency parse, which " - "requires data to be installed. For more info, see the " + "requires a statistical model to be installed and loaded. " + "For more info, see the " "documentation: \n%s\n" % about.__docs_models__) - # Accumulate the result before beginning to iterate over it. This prevents - # the tokenisation from being changed out from under us during the iteration. - # The tricky thing here is that Span accepts its tokenisation changing, - # so it's okay once we have the Span objects. See Issue #375 + # Accumulate the result before beginning to iterate over it. This + # prevents the tokenisation from being changed out from under us + # during the iteration. The tricky thing here is that Span accepts + # its tokenisation changing, so it's okay once we have the Span + # objects. See Issue #375. spans = [] for start, end, label in self.noun_chunks_iterator(self): spans.append(Span(self, start, end, label=label)) @@ -479,13 +508,14 @@ cdef class Doc: if not self.is_parsed: raise ValueError( - "sentence boundary detection requires the dependency parse, which " - "requires data to be installed. For more info, see the " + "Sentence boundary detection requires the dependency " + "parse, which requires a statistical model to be " + "installed and loaded. For more info, see the " "documentation: \n%s\n" % about.__docs_models__) cdef int i start = 0 for i in range(1, self.length): - if self.c[i].sent_start: + if self.c[i].sent_start == 1: yield Span(self, start, i) start = i if start != self.length: @@ -512,16 +542,20 @@ cdef class Doc: assert t.lex.orth != 0 t.spacy = has_space self.length += 1 - self._py_tokens.append(None) + # Set morphological attributes, e.g. 
by lemma, if possible + self.vocab.morphology.assign_untagged(t) return t.idx + t.lex.length + t.spacy @cython.boundscheck(False) cpdef np.ndarray to_array(self, object py_attr_ids): - """Given a list of M attribute IDs, export the tokens to a numpy - `ndarray` of shape `(N, M)`, where `N` is the length of the document. - The values will be 32-bit integers. + """Export given token attributes to a numpy `ndarray`. + If `attr_ids` is a sequence of M attributes, the output array will be + of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If + `attr_ids` is a single attribute, the output shape will be (N,). You + can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or + string name (e.g. 'LEMMA' or 'lemma'). - attr_ids (list[int]): A list of attribute ID ints. + attr_ids (list[]): A list of attributes (int IDs or string names). RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row per word, and one column per attribute indicated in the input `attr_ids`. @@ -534,17 +568,28 @@ cdef class Doc: """ cdef int i, j cdef attr_id_t feature + cdef np.ndarray[attr_t, ndim=1] attr_ids cdef np.ndarray[attr_t, ndim=2] output - # Make an array from the attributes --- otherwise our inner loop is Python - # dict iteration. - cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) - output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) + # Handle scalar/list inputs of strings/ints for py_attr_ids + if not hasattr(py_attr_ids, '__iter__'): + py_attr_ids = [py_attr_ids] + + # Allow strings, e.g. 'lemma' or 'LEMMA' + py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_) + for id_ in py_attr_ids] + # Make an array from the attributes --- otherwise our inner loop is + # Python dict iteration. + attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) + output = numpy.ndarray(shape=(self.length, len(attr_ids)), + dtype=numpy.uint64) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.c[i], feature) - return output + # Handle 1d case + return output if len(attr_ids) >= 2 else output.reshape((self.length,)) - def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): + def count_by(self, attr_id_t attr_id, exclude=None, + PreshCounter counts=None): """Count the frequencies of a given attribute. Produces a dict of `{attribute (int): count (ints)}` frequencies, keyed by the values of the given attribute ID. @@ -608,13 +653,12 @@ cdef class Doc: def from_array(self, attrs, array): if SENT_START in attrs and HEAD in attrs: raise ValueError( - "Conflicting attributes specified in doc.from_array():\n" + "Conflicting attributes specified in doc.from_array(): " "(HEAD, SENT_START)\n" - "The HEAD attribute currently sets sentence boundaries implicitly,\n" - "based on the tree structure. This means the HEAD attribute would " - "potentially override the sentence boundaries set by SENT_START.\n" - "See https://github.com/spacy-io/spaCy/issues/235 for details and " - "workarounds, and to propose solutions.") + "The HEAD attribute currently sets sentence boundaries " + "implicitly, based on the tree structure. 
This means the HEAD " + "attribute would potentially override the sentence boundaries " + "set by SENT_START.") cdef int i, col cdef attr_id_t attr_id cdef TokenC* tokens = self.c @@ -641,11 +685,55 @@ cdef class Doc: self.is_tagged = bool(TAG in attrs or POS in attrs) return self + def get_lca_matrix(self): + """Calculates the lowest common ancestor matrix for a given `Doc`. + Returns LCA matrix containing the integer index of the ancestor, or -1 + if no common ancestor is found (ex if span excludes a necessary + ancestor). Apologies about the recursion, but the impact on + performance is negligible given the natural limitations on the depth + of a typical human sentence. + """ + # Efficiency notes: + # We can easily improve the performance here by iterating in Cython. + # To loop over the tokens in Cython, the easiest way is: + # for token in doc.c[:doc.c.length]: + # head = token + token.head + # Both token and head will be TokenC* here. The token.head attribute + # is an integer offset. + def __pairwise_lca(token_j, token_k, lca_matrix): + if lca_matrix[token_j.i][token_k.i] != -2: + return lca_matrix[token_j.i][token_k.i] + elif token_j == token_k: + lca_index = token_j.i + elif token_k.head == token_j: + lca_index = token_j.i + elif token_j.head == token_k: + lca_index = token_k.i + elif (token_j.head == token_j) and (token_k.head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j.head, token_k.head, + lca_matrix) + lca_matrix[token_j.i][token_k.i] = lca_index + lca_matrix[token_k.i][token_j.i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + for j in range(len(self)): + token_j = self[j] + for k in range(j, len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) + lca_matrix[k][j] = lca_matrix[j][k] + return lca_matrix + def to_disk(self, path, **exclude): """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be either strings or Path-like objects. """ with path.open('wb') as file_: file_.write(self.to_bytes(**exclude)) @@ -660,7 +748,7 @@ cdef class Doc: """ with path.open('rb') as file_: bytes_data = file_.read() - self.from_bytes(bytes_data, **exclude) + return self.from_bytes(bytes_data, **exclude) def to_bytes(self, **exclude): """Serialize, i.e. export the document contents to a binary string. @@ -668,15 +756,23 @@ cdef class Doc: RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. """ - array_head = [LENGTH,SPACY,TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE] + array_head = [LENGTH, SPACY, TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE] + # Msgpack doesn't distinguish between lists and tuples, which is + # vexing for user data. As a best guess, we *know* that within + # keys, we must have tuples. In values we just have to hope + # users don't mind getting a list instead of a tuple. 
serializers = { 'text': lambda: self.text, 'array_head': lambda: array_head, 'array_body': lambda: self.to_array(array_head), 'sentiment': lambda: self.sentiment, 'tensor': lambda: self.tensor, - 'user_data': lambda: self.user_data } + if 'user_data' not in exclude and self.user_data: + user_data_keys, user_data_values = list(zip(*self.user_data.items())) + serializers['user_data_keys'] = lambda: msgpack.dumps(user_data_keys) + serializers['user_data_values'] = lambda: msgpack.dumps(user_data_values) + return util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, **exclude): @@ -693,10 +789,21 @@ cdef class Doc: 'array_body': lambda b: None, 'sentiment': lambda b: None, 'tensor': lambda b: None, - 'user_data': lambda user_data: self.user_data.update(user_data) + 'user_data_keys': lambda b: None, + 'user_data_values': lambda b: None, } msg = util.from_bytes(bytes_data, deserializers, exclude) + # Msgpack doesn't distinguish between lists and tuples, which is + # vexing for user data. As a best guess, we *know* that within + # keys, we must have tuples. In values we just have to hope + # users don't mind getting a list instead of a tuple. + if 'user_data' not in exclude and 'user_data_keys' in msg: + user_data_keys = msgpack.loads(msg['user_data_keys'], + use_list=False) + user_data_values = msgpack.loads(msg['user_data_values']) + for key, value in zip(user_data_keys, user_data_values): + self.user_data[key] = value cdef attr_t[:, :] attrs cdef int i, start, end, has_space @@ -720,14 +827,15 @@ cdef class Doc: return self def merge(self, int start_idx, int end_idx, *args, **attributes): - """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]` - is merged into a single token. If `start_idx` and `end_idx `do not mark - start and end token boundaries, the document remains unchanged. + """Retokenize the document, such that the span at + `doc.text[start_idx : end_idx]` is merged into a single token. If + `start_idx` and `end_idx `do not mark start and end token boundaries, + the document remains unchanged. - start_idx (int): The character index of the start of the slice to merge. - end_idx (int): The character index after the end of the slice to merge. + start_idx (int): Character index of the start of the slice to merge. + end_idx (int): Character index after the end of the slice to merge. **attributes: Attributes to assign to the merged token. By default, - attributes are inherited from the syntactic root token of the span. + attributes are inherited from the syntactic root of the span. RETURNS (Token): The newly merged token, or `None` if the start and end indices did not fall at token boundaries. """ @@ -748,10 +856,11 @@ cdef class Doc: attributes[ENT_TYPE] = attributes['ent_type'] elif args: raise ValueError( - "Doc.merge received %d non-keyword arguments. " - "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " + "Doc.merge received %d non-keyword arguments. Expected either " + "3 arguments (deprecated), or 0 (use keyword arguments). 
" "Arguments supplied:\n%s\n" - "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) + "Keyword arguments: %s\n" % (len(args), repr(args), + repr(attributes))) # More deprecated attribute handling =/ if 'label' in attributes: @@ -783,8 +892,9 @@ cdef class Doc: Token.set_struct_attr(token, attr_name, attr_value) # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets - # Before thinking of something simpler, beware the case where a dependency - # bridges over the entity. Here the alignment of the tokens changes. + # Before thinking of something simpler, beware the case where a + # dependency bridges over the entity. Here the alignment of the + # tokens changes. span_root = span.root.i token.dep = span.root.dep # We update token.lex after keeping span root and dep, since @@ -818,7 +928,6 @@ cdef class Doc: # Set the left/right children, left/right edges set_children_from_heads(self.c, self.length) # Clear the cached Python objects - self._py_tokens = [None] * self.length # Return the merged Python object return self[start] @@ -834,8 +943,9 @@ cdef class Doc: >>> trees = doc.print_tree() >>> trees[1] {'modifiers': [ - {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', - 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, + {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', + 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', + 'lemma': 'Alice'}, {'modifiers': [ {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], @@ -900,3 +1010,23 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: if tokens[i].head == 0 and tokens[i].dep != 0: tokens[tokens[i].l_edge].sent_start = True + +def pickle_doc(doc): + bytes_data = doc.to_bytes(vocab=False, user_data=False) + hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, + doc.user_token_hooks) + return (unpickle_doc, (doc.vocab, dill.dumps(hooks_and_data), bytes_data)) + + +def unpickle_doc(vocab, hooks_and_data, bytes_data): + user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data) + + doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, + exclude='user_data') + doc.user_hooks.update(doc_hooks) + doc.user_span_hooks.update(span_hooks) + doc.user_token_hooks.update(token_hooks) + return doc + + +copy_reg.pickle(Doc, pickle_doc, unpickle_doc) diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index 4bc7099d7..92b2cd84c 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -43,8 +43,8 @@ def POS_tree(root, light=False, flat=False): def parse_tree(doc, light=False, flat=False): - """Makes a copy of the doc, then construct a syntactic parse tree, similar to - the one used in displaCy. Generates the POS tree for all sentences in a doc. + """Make a copy of the doc and construct a syntactic parse tree similar to + displaCy. Generates the POS tree for all sentences in a doc. doc (Doc): The doc for parsing. RETURNS (dict): The parse tree. 
@@ -66,8 +66,9 @@ def parse_tree(doc, light=False, flat=False): 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'} """ - doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) + doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])) merge_ents(doc_clone) # merge the entities into single tokens first - return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents] + return [POS_tree(sent.root, light=light, flat=flat) + for sent in doc_clone.sents] diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 7e29cccf4..efe511089 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -17,19 +17,34 @@ from ..attrs cimport IS_PUNCT, IS_SPACE from ..lexeme cimport Lexeme from ..compat import is_config from .. import about +from .underscore import Underscore cdef class Span: """A slice from a Doc object.""" - def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, - vector_norm=None): + @classmethod + def set_extension(cls, name, default=None, method=None, + getter=None, setter=None): + Underscore.span_extensions[name] = (default, method, getter, setter) + + @classmethod + def get_extension(cls, name): + return Underscore.span_extensions.get(name) + + @classmethod + def has_extension(cls, name): + return name in Underscore.span_extensions + + def __cinit__(self, Doc doc, int start, int end, attr_t label=0, + vector=None, vector_norm=None): """Create a `Span` object from the slice `doc[start : end]`. doc (Doc): The parent document. start (int): The index of the first token of the span. end (int): The index of the first token after the span. label (uint64): A label to attach to the Span, e.g. for named entities. - vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + vector (ndarray[ndim=1, dtype='float32']): A meaning representation + of the span. RETURNS (Span): The newly constructed object. """ if not (0 <= start <= end <= len(doc)): @@ -111,6 +126,38 @@ cdef class Span: for i in range(self.start, self.end): yield self.doc[i] + @property + def _(self): + """User space for adding custom attribute extensions.""" + return Underscore(Underscore.span_extensions, self, + start=self.start_char, end=self.end_char) + + def as_doc(self): + # TODO: fix + """Create a `Doc` object view of the Span's data. This is mostly + useful for C-typed interfaces. + + RETURNS (Doc): The `Doc` view of the span. + """ + cdef Doc doc = Doc(self.doc.vocab) + doc.length = self.end-self.start + doc.c = &self.doc.c[self.start] + doc.mem = self.doc.mem + doc.is_parsed = self.doc.is_parsed + doc.is_tagged = self.doc.is_tagged + doc.noun_chunks_iterator = self.doc.noun_chunks_iterator + doc.user_hooks = self.doc.user_hooks + doc.user_span_hooks = self.doc.user_span_hooks + doc.user_token_hooks = self.doc.user_token_hooks + doc.vector = self.vector + doc.vector_norm = self.vector_norm + for key, value in self.doc.cats.items(): + if hasattr(key, '__len__') and len(key) == 3: + cat_start, cat_end, cat_label = key + if cat_start == self.start_char and cat_end == self.end_char: + doc.cats[cat_label] = value + return doc + def merge(self, *args, **attributes): """Retokenize the document, such that the span is merged into a single token. @@ -119,7 +166,8 @@ cdef class Span: attributes are inherited from the syntactic root token of the span. RETURNS (Token): The newly merged token. 
""" - return self.doc.merge(self.start_char, self.end_char, *args, **attributes) + return self.doc.merge(self.start_char, self.end_char, *args, + **attributes) def similarity(self, other): """Make a semantic similarity estimate. The default estimate is cosine @@ -135,6 +183,47 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + def get_lca_matrix(self): + """Calculates the lowest common ancestor matrix for a given `Span`. + Returns LCA matrix containing the integer index of the ancestor, or -1 + if no common ancestor is found (ex if span excludes a necessary + ancestor). Apologies about the recursion, but the impact on + performance is negligible given the natural limitations on the depth + of a typical human sentence. + """ + def __pairwise_lca(token_j, token_k, lca_matrix, margins): + offset = margins[0] + token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k + token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j + token_j_i = token_j.i - offset + token_k_i = token_k.i - offset + if lca_matrix[token_j_i][token_k_i] != -2: + return lca_matrix[token_j_i][token_k_i] + elif token_j == token_k: + lca_index = token_j_i + elif token_k_head == token_j: + lca_index = token_j_i + elif token_j_head == token_k: + lca_index = token_k_i + elif (token_j_head == token_j) and (token_k_head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins) + lca_matrix[token_j_i][token_k_i] = lca_index + lca_matrix[token_k_i][token_j_i] = lca_index + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + margins = [self.start, self.end] + for j in range(len(self)): + token_j = self[j] + for k in range(len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins) + lca_matrix[k][j] = lca_matrix[j][k] + return lca_matrix + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. @@ -173,10 +262,7 @@ cdef class Span: self.end = end + 1 property sent: - """The sentence span that this span is a part of. - - RETURNS (Span): The sentence span that the span is a part of. - """ + """RETURNS (Span): The sentence span that the span is a part of.""" def __get__(self): if 'sent' in self.doc.user_span_hooks: return self.doc.user_span_hooks['sent'](self) @@ -189,13 +275,10 @@ cdef class Span: n += 1 if n >= self.doc.length: raise RuntimeError - return self.doc[root.l_edge : root.r_edge + 1] + return self.doc[root.l_edge:root.r_edge + 1] property has_vector: - """A boolean value indicating whether a word vector is associated with - the object. - - RETURNS (bool): Whether a word vector is associated with the object. + """RETURNS (bool): Whether a word vector is associated with the object. """ def __get__(self): if 'has_vector' in self.doc.user_span_hooks: @@ -217,10 +300,7 @@ cdef class Span: return self._vector property vector_norm: - """The L2 norm of the document's vector representation. - - RETURNS (float): The L2 norm of the vector representation. 
- """ + """RETURNS (float): The L2 norm of the vector representation.""" def __get__(self): if 'vector_norm' in self.doc.user_span_hooks: return self.doc.user_span_hooks['vector'](self) @@ -234,7 +314,9 @@ cdef class Span: return self._vector_norm property sentiment: - # TODO: docstring + """RETURNS (float): A scalar value indicating the positivity or + negativity of the span. + """ def __get__(self): if 'sentiment' in self.doc.user_span_hooks: return self.doc.user_span_hooks['sentiment'](self) @@ -242,10 +324,7 @@ cdef class Span: return sum([token.sentiment for token in self]) / len(self) property text: - """A unicode representation of the span text. - - RETURNS (unicode): The original verbatim text of the span. - """ + """RETURNS (unicode): The original verbatim text of the span.""" def __get__(self): text = self.text_with_ws if self[-1].whitespace_: @@ -256,7 +335,8 @@ cdef class Span: """The text content of the span with a trailing whitespace character if the last token has one. - RETURNS (unicode): The text content of the span (with trailing whitespace). + RETURNS (unicode): The text content of the span (with trailing + whitespace). """ def __get__(self): return u''.join([t.text_with_ws for t in self]) @@ -265,7 +345,8 @@ cdef class Span: """Yields base noun-phrase `Span` objects, if the document has been syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no - NP-level coordination, no prepositional phrases, and no relative clauses. + NP-level coordination, no prepositional phrases, and no relative + clauses. YIELDS (Span): Base noun-phrase `Span` objects """ @@ -273,12 +354,14 @@ cdef class Span: if not self.doc.is_parsed: raise ValueError( "noun_chunks requires the dependency parse, which " - "requires data to be installed. For more info, see the " + "requires a statistical model to be installed and loaded. " + "For more info, see the " "documentation: \n%s\n" % about.__docs_models__) - # Accumulate the result before beginning to iterate over it. This prevents - # the tokenisation from being changed out from under us during the iteration. - # The tricky thing here is that Span accepts its tokenisation changing, - # so it's okay once we have the Span objects. See Issue #375 + # Accumulate the result before beginning to iterate over it. This + # prevents the tokenisation from being changed out from under us + # during the iteration. The tricky thing here is that Span accepts + # its tokenisation changing, so it's okay once we have the Span + # objects. See Issue #375 spans = [] cdef attr_t label for start, end, label in self.doc.noun_chunks_iterator(self): @@ -292,9 +375,9 @@ cdef class Span: RETURNS (Token): The root token. - EXAMPLE: The root token has the shortest path to the root of the sentence - (or is the root itself). If multiple words are equally high in the - tree, the first word is taken. For example: + EXAMPLE: The root token has the shortest path to the root of the + sentence (or is the root itself). If multiple words are equally + high in the tree, the first word is taken. For example: >>> toks = nlp(u'I like New York in Autumn.') @@ -344,11 +427,11 @@ cdef class Span: if self.doc.c[i].head == 0: return self.doc[i] # If we don't have a sentence root, we do something that's not so - # algorithmically clever, but I think should be quite fast, especially - # for short spans. + # algorithmically clever, but I think should be quite fast, + # especially for short spans. 
# For each word, we count the path length, and arg min this measure. - # We could use better tree logic to save steps here...But I think this - # should be okay. + # We could use better tree logic to save steps here...But I + # think this should be okay. cdef int current_best = self.doc.length cdef int root = -1 for i in range(self.start, self.end): @@ -370,7 +453,7 @@ cdef class Span: YIELDS (Token):A left-child of a token of the span. """ def __get__(self): - for token in reversed(self): # Reverse, so we get the tokens in order + for token in reversed(self): # Reverse, so we get tokens in order for left in token.lefts: if left.i < self.start: yield left @@ -387,6 +470,22 @@ cdef class Span: if right.i >= self.end: yield right + property n_lefts: + """RETURNS (int): The number of leftward immediate children of the + span, in the syntactic dependency parse. + """ + # TODO: implement + def __get__(self): + raise NotImplementedError + + property n_rights: + """RETURNS (int): The number of rightward immediate children of the + span, in the syntactic dependency parse. + """ + # TODO: implement + def __get__(self): + raise NotImplementedError + property subtree: """Tokens that descend from tokens in the span, but fall outside it. @@ -400,66 +499,55 @@ cdef class Span: yield from word.subtree property ent_id: - """An (integer) entity ID. Usually assigned by patterns in the `Matcher`. - - RETURNS (uint64): The entity ID. - """ + """RETURNS (uint64): The entity ID.""" def __get__(self): return self.root.ent_id def __set__(self, hash_t key): - # TODO raise NotImplementedError( - "Can't yet set ent_id from Span. Vote for this feature on the issue " - "tracker: http://github.com/explosion/spaCy/issues") + "Can't yet set ent_id from Span. Vote for this feature on " + "the issue tracker: http://github.com/explosion/spaCy/issues") property ent_id_: - """A (string) entity ID. Usually assigned by patterns in the `Matcher`. - - RETURNS (unicode): The entity ID. - """ + """RETURNS (unicode): The (string) entity ID.""" def __get__(self): return self.root.ent_id_ def __set__(self, hash_t key): - # TODO raise NotImplementedError( - "Can't yet set ent_id_ from Span. Vote for this feature on the issue " - "tracker: http://github.com/explosion/spaCy/issues") + "Can't yet set ent_id_ from Span. Vote for this feature on the " + "issue tracker: http://github.com/explosion/spaCy/issues") property orth_: - # TODO: docstring + """Verbatim text content (identical to Span.text). Exists mostly for + consistency with other attributes. + + RETURNS (unicode): The span's text.""" def __get__(self): - return ''.join([t.string for t in self]).strip() + return ''.join([t.orth_ for t in self]).strip() property lemma_: - """The span's lemma. - - RETURNS (unicode): The span's lemma. - """ + """RETURNS (unicode): The span's lemma.""" def __get__(self): return ' '.join([t.lemma_ for t in self]).strip() property upper_: - # TODO: docstring + """Deprecated. Use Span.text.upper() instead.""" def __get__(self): - return ''.join([t.string.upper() for t in self]).strip() + return ''.join([t.text_with_ws.upper() for t in self]).strip() property lower_: - # TODO: docstring + """Deprecated. 
Use Span.text.lower() instead."""
         def __get__(self):
-            return ''.join([t.string.lower() for t in self]).strip()
+            return ''.join([t.text_with_ws.lower() for t in self]).strip()

     property string:
-        # TODO: docstring
+        """Deprecated: Use Span.text_with_ws instead."""
         def __get__(self):
-            return ''.join([t.string for t in self])
+            return ''.join([t.text_with_ws for t in self])

     property label_:
-        """The span's label.
-
-        RETURNS (unicode): The span's label.
-        """
+        """RETURNS (unicode): The span's label."""
         def __get__(self):
             return self.doc.vocab.strings[self.label]

@@ -477,7 +565,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
         n += 1
         if n >= sent_length:
             raise RuntimeError(
-                "Array bounds exceeded while searching for root word. This likely "
-                "means the parse tree is in an invalid state. Please report this "
-                "issue here: http://github.com/explosion/spaCy/issues")
+                "Array bounds exceeded while searching for root word. This "
+                "likely means the parse tree is in an invalid state. Please "
+                "report this issue here: "
+                "http://github.com/explosion/spaCy/issues")
     return n
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index f63a0490c..b408e04eb 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -19,10 +19,7 @@ cdef class Token:
         if offset < 0 or offset >= doc.length:
             msg = "Attempt to access token at %d, max length %d"
             raise IndexError(msg % (offset, doc.length))
-        if doc._py_tokens[offset] != None:
-            return doc._py_tokens[offset]
         cdef Token self = Token.__new__(Token, vocab, doc, offset)
-        doc._py_tokens[offset] = self
         return self

 #cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 7b11d6efa..fa07d0e9e 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -14,16 +14,31 @@ from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
-from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
-from ..attrs cimport LEMMA, POS, TAG, DEP
+from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
+from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
+from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
+from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
 from ..compat import is_config
 from .. import about
+from .underscore import Underscore


 cdef class Token:
-    """An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
+    """An individual token – i.e. a word, punctuation symbol, whitespace,
+    etc."""
+    @classmethod
+    def set_extension(cls, name, default=None, method=None,
+                      getter=None, setter=None):
+        Underscore.token_extensions[name] = (default, method, getter, setter)
+
+    @classmethod
+    def get_extension(cls, name):
+        return Underscore.token_extensions.get(name)
+
+    @classmethod
+    def has_extension(cls, name):
+        return name in Underscore.token_extensions
+
     def __cinit__(self, Vocab vocab, Doc doc, int offset):
         """Construct a `Token` object.
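
A quick usage sketch of the extension API added above (an illustrative sketch, not part of the patch; the `nlp` object and the attribute name `is_fruit` are assumptions):

    from spacy.tokens import Token

    # Register a custom attribute with a default value (hypothetical name).
    Token.set_extension('is_fruit', default=False)

    doc = nlp(u'I like apples')
    doc[2]._.is_fruit = True      # stored in doc.user_data via Underscore
    assert doc[2]._.is_fruit is True
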
@@ -87,6 +102,11 @@ cdef class Token: else: raise ValueError(op) + @property + def _(self): + return Underscore(Underscore.token_extensions, self, + start=self.idx, end=None) + cpdef bint check_flag(self, attr_id_t flag_id) except -1: """Check the value of a boolean flag. @@ -108,6 +128,9 @@ cdef class Token: i (int): The relative position of the token to get. Defaults to 1. RETURNS (Token): The token at position `self.doc[self.i+i]`. """ + if self.i+i < 0 or (self.i+i >= len(self.doc)): + msg = "Error accessing doc[%d].nbor(%d), for doc of length %d" + raise IndexError(msg % (self.i, i, len(self.doc))) return self.doc[self.i+i] def similarity(self, other): @@ -122,37 +145,33 @@ cdef class Token: return self.doc.user_token_hooks['similarity'](self) if self.vector_norm == 0 or other.vector_norm == 0: return 0.0 - return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + return (numpy.dot(self.vector, other.vector) / + (self.vector_norm * other.vector_norm)) property lex_id: - """ID of the token's lexical type. - - RETURNS (int): ID of the token's lexical type.""" + """RETURNS (int): Sequential ID of the token's lexical type.""" def __get__(self): return self.c.lex.id property rank: - # TODO: add docstring + """RETURNS (int): Sequential ID of the token's lexical type, used to + index into tables, e.g. for word vectors.""" def __get__(self): return self.c.lex.id property string: + """Deprecated: Use Token.text_with_ws instead.""" def __get__(self): return self.text_with_ws property text: - """A unicode representation of the token text. - - RETURNS (unicode): The original verbatim text of the token. - """ + """RETURNS (unicode): The original verbatim text of the token.""" def __get__(self): return self.orth_ property text_with_ws: - """The text content of the token with a trailing whitespace character if - it has one. - - RETURNS (unicode): The text content of the span (with trailing whitespace). + """RETURNS (unicode): The text content of the span (with trailing + whitespace). """ def __get__(self): cdef unicode orth = self.vocab.strings[self.c.lex.orth] @@ -162,74 +181,104 @@ cdef class Token: return orth property prob: + """RETURNS (float): Smoothed log probability estimate of token type.""" def __get__(self): return self.c.lex.prob property sentiment: + """RETURNS (float): A scalar value indicating the positivity or + negativity of the token.""" def __get__(self): if 'sentiment' in self.doc.user_token_hooks: return self.doc.user_token_hooks['sentiment'](self) return self.c.lex.sentiment property lang: + """RETURNS (uint64): ID of the language of the parent document's + vocabulary. + """ def __get__(self): return self.c.lex.lang property idx: + """RETURNS (int): The character offset of the token within the parent + document. + """ def __get__(self): return self.c.idx property cluster: + """RETURNS (int): Brown cluster ID.""" def __get__(self): return self.c.lex.cluster property orth: + """RETURNS (uint64): ID of the verbatim text content.""" def __get__(self): return self.c.lex.orth property lower: + """RETURNS (uint64): ID of the lowercase token text.""" def __get__(self): return self.c.lex.lower property norm: + """RETURNS (uint64): ID of the token's norm, i.e. a normalised form of + the token text. Usually set in the language's tokenizer exceptions + or norm exceptions. + """ def __get__(self): return self.c.lex.norm property shape: + """RETURNS (uint64): ID of the token's shape, a transform of the + tokens's string, to show orthographic features (e.g. 
"Xxxx", "dd"). + """ def __get__(self): return self.c.lex.shape property prefix: + """RETURNS (uint64): ID of a length-N substring from the start of the + token. Defaults to `N=1`. + """ def __get__(self): return self.c.lex.prefix property suffix: + """RETURNS (uint64): ID of a length-N substring from the end of the + token. Defaults to `N=3`. + """ def __get__(self): return self.c.lex.suffix property lemma: - """Base form of the word, with no inflectional suffixes. - - RETURNS (uint64): Token lemma. + """RETURNS (uint64): ID of the base form of the word, with no + inflectional suffixes. """ def __get__(self): return self.c.lemma + def __set__(self, attr_t lemma): self.c.lemma = lemma property pos: + """RETURNS (uint64): ID of coarse-grained part-of-speech tag.""" def __get__(self): return self.c.pos property tag: + """RETURNS (uint64): ID of fine-grained part-of-speech tag.""" def __get__(self): return self.c.tag + def __set__(self, attr_t tag): self.vocab.morphology.assign_tag(self.c, tag) property dep: + """RETURNS (uint64): ID of syntactic dependency label.""" def __get__(self): return self.c.dep + def __set__(self, attr_t label): self.c.dep = label @@ -266,35 +315,50 @@ cdef class Token: def __get__(self): if 'vector_norm' in self.doc.user_token_hooks: return self.doc.user_token_hooks['vector_norm'](self) - vector = self.vector + vector = self.vector return numpy.sqrt((vector ** 2).sum()) property n_lefts: + """RETURNS (int): The number of leftward immediate children of the + word, in the syntactic dependency parse. + """ def __get__(self): return self.c.l_kids property n_rights: + """RETURNS (int): The number of rightward immediate children of the + word, in the syntactic dependency parse. + """ def __get__(self): return self.c.r_kids property sent_start: + # TODO: fix and document def __get__(self): return self.c.sent_start - def __set__(self, bint value): + def __set__(self, value): if self.doc.is_parsed: raise ValueError( - 'Refusing to write to token.sent_start if its document is parsed, ' - 'because this may cause inconsistent state. ' - 'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.') - self.c.sent_start = value + "Refusing to write to token.sent_start if its document " + "is parsed, because this may cause inconsistent state.") + if value is None: + self.c.sent_start = 0 + elif value is True: + self.c.sent_start = 1 + elif value is False: + self.c.sent_start = -1 + else: + raise ValueError("Invalid value for token.sent_start. Must be " + "one of: None, True, False") property lefts: + """The leftward immediate children of the word, in the syntactic + dependency parse. + + YIELDS (Token): A left-child of the token. + """ def __get__(self): - """ - The leftward immediate children of the word, in the syntactic - dependency parse. - """ cdef int nr_iter = 0 cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) while ptr < self.c: @@ -304,15 +368,16 @@ cdef class Token: nr_iter += 1 # This is ugly, but it's a way to guard out infinite loops if nr_iter >= 10000000: - raise RuntimeError( - "Possibly infinite loop encountered while looking for token.lefts") + raise RuntimeError("Possibly infinite loop encountered " + "while looking for token.lefts") property rights: + """The rightward immediate children of the word, in the syntactic + dependency parse. + + YIELDS (Token): A right-child of the token. + """ def __get__(self): - """ - The rightward immediate children of the word, in the syntactic - dependency parse. 
- """ cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) tokens = [] cdef int nr_iter = 0 @@ -322,27 +387,26 @@ cdef class Token: ptr -= 1 nr_iter += 1 if nr_iter >= 10000000: - raise RuntimeError( - "Possibly infinite loop encountered while looking for token.rights") + raise RuntimeError("Possibly infinite loop encountered " + "while looking for token.rights") tokens.reverse() for t in tokens: yield t property children: - """ - A sequence of the token's immediate syntactic children. + """A sequence of the token's immediate syntactic children. - Yields: Token A child token such that child.head==self + YIELDS (Token): A child token such that child.head==self """ def __get__(self): yield from self.lefts yield from self.rights property subtree: - """ - A sequence of all the token's syntactic descendents. + """A sequence of all the token's syntactic descendents. - Yields: Token A descendent token such that self.is_ancestor(descendent) + YIELDS (Token): A descendent token such that + `self.is_ancestor(descendent)`. """ def __get__(self): for word in self.lefts: @@ -392,18 +456,17 @@ cdef class Token: """ if self.doc is not descendant.doc: return False - return any( ancestor.i == self.i for ancestor in descendant.ancestors ) + return any(ancestor.i == self.i for ancestor in descendant.ancestors) property head: """The syntactic parent, or "governor", of this token. - RETURNS (Token): The token head. + RETURNS (Token): The token predicted by the parser to be the head of + the current token. """ def __get__(self): - """The token predicted by the parser to be the head of the current - token. - """ return self.doc[self.i + self.c.head] + def __set__(self, Token new_head): # this function sets the head of self to new_head # and updates the counters for left/right dependents @@ -423,16 +486,18 @@ cdef class Token: cdef Token anc, child # update number of deps of old head - if self.c.head > 0: # left dependent + if self.c.head > 0: # left dependent old_head.c.l_kids -= 1 if self.c.l_edge == old_head.c.l_edge: - # the token dominates the left edge so the left edge of the head - # may change when the token is reattached - # it may not change if the new head is a descendant of the current head + # the token dominates the left edge so the left edge of + # the head may change when the token is reattached, it may + # not change if the new head is a descendant of the current + # head new_edge = self.c.l_edge - # the new l_edge is the left-most l_edge on any of the other dependents - # where the l_edge is left of the head, otherwise it is the head + # the new l_edge is the left-most l_edge on any of the + # other dependents where the l_edge is left of the head, + # otherwise it is the head if not is_desc: new_edge = old_head.i for child in old_head.children: @@ -442,14 +507,15 @@ cdef class Token: new_edge = child.c.l_edge old_head.c.l_edge = new_edge - # walk up the tree from old_head and assign new l_edge to ancestors - # until an ancestor already has an l_edge that's further left + # walk up the tree from old_head and assign new l_edge to + # ancestors until an ancestor already has an l_edge that's + # further left for anc in old_head.ancestors: if anc.c.l_edge <= new_edge: break anc.c.l_edge = new_edge - elif self.c.head < 0: # right dependent + elif self.c.head < 0: # right dependent old_head.c.r_kids -= 1 # do the same thing as for l_edge if self.c.r_edge == old_head.c.r_edge: @@ -470,7 +536,7 @@ cdef class Token: anc.c.r_edge = new_edge # update number of deps of new head - if rel_newhead_i > 0: 
# left dependent + if rel_newhead_i > 0: # left dependent new_head.c.l_kids += 1 # walk up the tree from new head and set l_edge to self.l_edge # until you hit a token with an l_edge further to the left @@ -481,7 +547,7 @@ cdef class Token: break anc.c.l_edge = self.c.l_edge - elif rel_newhead_i < 0: # right dependent + elif rel_newhead_i < 0: # right dependent new_head.c.r_kids += 1 # do the same as for l_edge if self.c.r_edge > new_head.c.r_edge: @@ -512,12 +578,10 @@ cdef class Token: yield from word.conjuncts property ent_type: - """Named entity type. - - RETURNS (uint64): Named entity type. - """ + """RETURNS (uint64): Named entity type.""" def __get__(self): return self.c.ent_type + def __set__(self, ent_type): self.c.ent_type = ent_type @@ -531,19 +595,17 @@ cdef class Token: return self.c.ent_iob property ent_type_: - """Named entity type. - - RETURNS (unicode): Named entity type. - """ + """RETURNS (unicode): Named entity type.""" def __get__(self): return self.vocab.strings[self.c.ent_type] + def __set__(self, ent_type): self.c.ent_type = self.vocab.strings.add(ent_type) property ent_iob_: """IOB code of named entity tag. "B" means the token begins an entity, - "I" means it is inside an entity, "O" means it is outside an entity, and - "" means no entity tag is set. + "I" means it is inside an entity, "O" means it is outside an entity, + and "" means no entity tag is set. RETURNS (unicode): IOB code of named entity tag. """ @@ -552,10 +614,8 @@ cdef class Token: return iob_strings[self.c.ent_iob] property ent_id: - """ID of the entity the token is an instance of, if any. Usually - assigned by patterns in the Matcher. - - RETURNS (uint64): ID of the entity. + """RETURNS (uint64): ID of the entity the token is an instance of, + if any. """ def __get__(self): return self.c.ent_id @@ -564,10 +624,8 @@ cdef class Token: self.c.ent_id = key property ent_id_: - """ID of the entity the token is an instance of, if any. Usually - assigned by patterns in the Matcher. - - RETURNS (unicode): ID of the entity. + """RETURNS (unicode): ID of the entity the token is an instance of, + if any. """ def __get__(self): return self.vocab.strings[self.c.ent_id] @@ -576,107 +634,192 @@ cdef class Token: self.c.ent_id = self.vocab.strings.add(name) property whitespace_: + """RETURNS (unicode): The trailing whitespace character, if present. + """ def __get__(self): return ' ' if self.c.spacy else '' property orth_: + """RETURNS (unicode): Verbatim text content (identical to + `Token.text`). Existst mostly for consistency with the other + attributes. + """ def __get__(self): return self.vocab.strings[self.c.lex.orth] property lower_: + """RETURNS (unicode): The lowercase token text. Equivalent to + `Token.text.lower()`. + """ def __get__(self): return self.vocab.strings[self.c.lex.lower] property norm_: + """RETURNS (unicode): The token's norm, i.e. a normalised form of the + token text. Usually set in the language's tokenizer exceptions or + norm exceptions. + """ def __get__(self): return self.vocab.strings[self.c.lex.norm] property shape_: + """RETURNS (unicode): Transform of the tokens's string, to show + orthographic features. For example, "Xxxx" or "dd". + """ def __get__(self): return self.vocab.strings[self.c.lex.shape] property prefix_: + """RETURNS (unicode): A length-N substring from the start of the token. + Defaults to `N=1`. + """ def __get__(self): return self.vocab.strings[self.c.lex.prefix] property suffix_: + """RETURNS (unicode): A length-N substring from the end of the token. 
+        Defaults to `N=3`.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

     property lang_:
+        """RETURNS (unicode): Language of the parent document's vocabulary,
+        e.g. 'en'.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.lang]

     property lemma_:
-        """Base form of the word, with no inflectional suffixes.
-
-        RETURNS (unicode): Token lemma.
+        """RETURNS (unicode): The token lemma, i.e. the base form of the word,
+        with no inflectional suffixes.
         """
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
+
         def __set__(self, unicode lemma_):
             self.c.lemma = self.vocab.strings.add(lemma_)

     property pos_:
+        """RETURNS (unicode): Coarse-grained part-of-speech tag."""
         def __get__(self):
             return parts_of_speech.NAMES[self.c.pos]

     property tag_:
+        """RETURNS (unicode): Fine-grained part-of-speech tag."""
         def __get__(self):
             return self.vocab.strings[self.c.tag]
+
         def __set__(self, tag):
             self.tag = self.vocab.strings.add(tag)

     property dep_:
+        """RETURNS (unicode): The syntactic dependency label."""
         def __get__(self):
             return self.vocab.strings[self.c.dep]
+
         def __set__(self, unicode label):
             self.c.dep = self.vocab.strings.add(label)

     property is_oov:
-        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
+        """RETURNS (bool): Whether the token is out-of-vocabulary."""
+        def __get__(self):
+            return Lexeme.c_check_flag(self.c.lex, IS_OOV)

     property is_stop:
-        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
+        """RETURNS (bool): Whether the token is a stop word, i.e. part of a
+        "stop list" defined by the language data.
+        """
+        def __get__(self):
+            return Lexeme.c_check_flag(self.c.lex, IS_STOP)

     property is_alpha:
-        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
+        """RETURNS (bool): Whether the token consists of alpha characters.
+        Equivalent to `token.text.isalpha()`.
+        """
+        def __get__(self):
+            return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)

     property is_ascii:
-        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
+        """RETURNS (bool): Whether the token consists of ASCII characters.
+        Equivalent to `all(ord(c) < 128 for c in token.text)`.
+        """
+        def __get__(self):
+            return Lexeme.c_check_flag(self.c.lex, IS_ASCII)

     property is_digit:
-        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
+        """RETURNS (bool): Whether the token consists of digits. Equivalent to
+        `token.text.isdigit()`.
+        """
+        def __get__(self):
+            return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)

     property is_lower:
-        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
+        """RETURNS (bool): Whether the token is in lowercase. Equivalent to
+        `token.text.islower()`.
+        """
+        def __get__(self):
+            return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
+
+    property is_upper:
+        """RETURNS (bool): Whether the token is in uppercase. Equivalent to
+        `token.text.isupper()`.
+        """
+        def __get__(self):
+            return Lexeme.c_check_flag(self.c.lex, IS_UPPER)

     property is_title:
-        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
+        """RETURNS (bool): Whether the token is in titlecase. Equivalent to
+        `token.text.istitle()`.
+ """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_TITLE) property is_punct: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) + """RETURNS (bool): Whether the token is punctuation.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) property is_space: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) + """RETURNS (bool): Whether the token consists of whitespace characters. + Equivalent to `token.text.isspace()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_SPACE) property is_bracket: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) + """RETURNS (bool): Whether the token is a bracket.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) property is_quote: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) + """RETURNS (bool): Whether the token is a quotation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) property is_left_punct: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) + """RETURNS (bool): Whether the token is a left punctuation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) property is_right_punct: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) + """RETURNS (bool): Whether the token is a left punctuation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) property like_url: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL) + """RETURNS (bool): Whether the token resembles a URL.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, LIKE_URL) property like_num: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) + """RETURNS (bool): Whether the token resembles a number, e.g. "10.9", + "10", "ten", etc. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) property like_email: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) + """RETURNS (bool): Whether the token resembles an email address.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py new file mode 100644 index 000000000..d80f50685 --- /dev/null +++ b/spacy/tokens/underscore.py @@ -0,0 +1,54 @@ +# coding: utf8 +from __future__ import unicode_literals + +import functools + + +class Underscore(object): + doc_extensions = {} + span_extensions = {} + token_extensions = {} + + def __init__(self, extensions, obj, start=None, end=None): + object.__setattr__(self, '_extensions', extensions) + object.__setattr__(self, '_obj', obj) + # Assumption is that for doc values, _start and _end will both be None + # Span will set non-None values for _start and _end + # Token will have _start be non-None, _end be None + # This lets us key everything into the doc.user_data dictionary, + # (see _get_key), and lets us use a single Underscore class. 
+ object.__setattr__(self, '_doc', obj.doc) + object.__setattr__(self, '_start', start) + object.__setattr__(self, '_end', end) + + def __getattr__(self, name): + if name not in self._extensions: + raise AttributeError(name) + default, method, getter, setter = self._extensions[name] + if getter is not None: + return getter(self._obj) + elif method is not None: + return functools.partial(method, self._obj) + else: + return self._doc.user_data.get(self._get_key(name), default) + + def __setattr__(self, name, value): + if name not in self._extensions: + raise AttributeError(name) + default, method, getter, setter = self._extensions[name] + if setter is not None: + return setter(self._obj, value) + else: + self._doc.user_data[self._get_key(name)] = value + + def set(self, name, value): + return self.__setattr__(name, value) + + def get(self, name): + return self.__getattr__(name) + + def has(self, name): + return name in self._extensions + + def _get_key(self, name): + return ('._.', name, self._start, self._end) diff --git a/spacy/typedefs.pyx b/spacy/typedefs.pyx index 8b1378917..e69de29bb 100644 --- a/spacy/typedefs.pyx +++ b/spacy/typedefs.pyx @@ -1 +0,0 @@ - diff --git a/spacy/util.py b/spacy/util.py index 95fcb493d..a45d43c47 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -3,30 +3,34 @@ from __future__ import unicode_literals, print_function import os import ujson -import pip +import pkg_resources import importlib import regex as re from pathlib import Path import sys import textwrap import random -import numpy -import io -import dill from collections import OrderedDict +from thinc.neural._classes.model import Model +import functools + +from .symbols import ORTH +from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ +from .compat import import_file import msgpack import msgpack_numpy msgpack_numpy.patch() -import ujson - -from .symbols import ORTH -from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ -from .compat import copy_array, normalize_string_keys, getattr_, import_file LANGUAGES = {} _data_path = Path(__file__).parent / 'data' +_PRINT_ENV = False + + +def set_env_log(value): + global _PRINT_ENV + _PRINT_ENV = value def get_lang_class(lang): @@ -36,11 +40,12 @@ def get_lang_class(lang): RETURNS (Language): Language class. """ global LANGUAGES - if not lang in LANGUAGES: + if lang not in LANGUAGES: try: module = importlib.import_module('.lang.%s' % lang, 'spacy') except ImportError: - raise ImportError("Can't import language %s from spacy.lang." %lang) + msg = "Can't import language %s from spacy.lang." 
+ raise ImportError(msg % lang) LANGUAGES[lang] = getattr(module, module.__all__[0]) return LANGUAGES[lang] @@ -98,14 +103,14 @@ def load_model(name, **overrides): data_path = get_data_path() if not data_path or not data_path.exists(): raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) - if isinstance(name, basestring_): - if name in set([d.name for d in data_path.iterdir()]): # in data dir / shortcut + if isinstance(name, basestring_): # in data dir / shortcut + if name in set([d.name for d in data_path.iterdir()]): return load_model_from_link(name, **overrides) - if is_package(name): # installed as package + if is_package(name): # installed as package return load_model_from_package(name, **overrides) - if Path(name).exists(): # path to model data directory + if Path(name).exists(): # path to model data directory return load_model_from_path(Path(name), **overrides) - elif hasattr(name, 'exists'): # Path or Path-like to model data + elif hasattr(name, 'exists'): # Path or Path-like to model data return load_model_from_path(name, **overrides) raise IOError("Can't find model '%s'" % name) @@ -118,7 +123,7 @@ def load_model_from_link(name, **overrides): except AttributeError: raise IOError( "Cant' load '%s'. If you're using a shortcut link, make sure it " - "points to a valid model package (not just a data directory)." % name) + "points to a valid package (not just a data directory)." % name) return cls.load(**overrides) @@ -134,7 +139,18 @@ def load_model_from_path(model_path, meta=False, **overrides): if not meta: meta = get_model_meta(model_path) cls = get_lang_class(meta['lang']) - nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides) + nlp = cls(meta=meta, **overrides) + pipeline = meta.get('pipeline', []) + disable = overrides.get('disable', []) + if pipeline is True: + pipeline = nlp.Defaults.pipe_names + elif pipeline in (False, None): + pipeline = [] + for name in pipeline: + if name not in disable: + config = meta.get('pipeline_args', {}).get(name, {}) + component = nlp.create_pipe(name, config=config) + nlp.add_pipe(component, name=name) return nlp.from_disk(model_path) @@ -151,7 +167,8 @@ def load_model_from_init_py(init_file, **overrides): data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) data_path = model_path / data_dir if not model_path.exists(): - raise ValueError("Can't find model directory: %s" % path2str(data_path)) + msg = "Can't find model directory: %s" + raise ValueError(msg % path2str(data_path)) return load_model_from_path(data_path, meta, **overrides) @@ -163,14 +180,16 @@ def get_model_meta(path): """ model_path = ensure_path(path) if not model_path.exists(): - raise ValueError("Can't find model directory: %s" % path2str(model_path)) + msg = "Can't find model directory: %s" + raise ValueError(msg % path2str(model_path)) meta_path = model_path / 'meta.json' if not meta_path.is_file(): raise IOError("Could not read meta.json from %s" % meta_path) meta = read_json(meta_path) for setting in ['lang', 'name', 'version']: if setting not in meta or not meta[setting]: - raise ValueError("No valid '%s' setting found in model meta.json" % setting) + msg = "No valid '%s' setting found in model meta.json" + raise ValueError(msg % setting) return meta @@ -180,9 +199,10 @@ def is_package(name): name (unicode): Name of package. RETURNS (bool): True if installed package, False if not. 
""" - packages = pip.get_installed_distributions() + name = name.lower() # compare package name against lowercase name + packages = pkg_resources.working_set.by_key.keys() for package in packages: - if package.project_name.replace('-', '_') == name: + if package.lower().replace('-', '_') == name: return True return False @@ -193,6 +213,7 @@ def get_package_path(name): name (unicode): Package name. RETURNS (Path): Path to installed package. """ + name = name.lower() # use lowercase version to be safe # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. pkg = importlib.import_module(name) @@ -225,7 +246,7 @@ def get_async(stream, numpy_array): return numpy_array else: array = cupy.ndarray(numpy_array.shape, order='C', - dtype=numpy_array.dtype) + dtype=numpy_array.dtype) array.set(numpy_array, stream=stream) return array @@ -259,12 +280,6 @@ def itershuffle(iterable, bufsize=1000): raise StopIteration -_PRINT_ENV = False -def set_env_log(value): - global _PRINT_ENV - _PRINT_ENV = value - - def env_opt(name, default=None): if type(default) is float: type_convert = float @@ -290,17 +305,20 @@ def read_regex(path): path = ensure_path(path) with path.open() as file_: entries = file_.read().split('\n') - expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) + expression = '|'.join(['^' + re.escape(piece) + for piece in entries if piece.strip()]) return re.compile(expression) def compile_prefix_regex(entries): if '(' in entries: # Handle deprecated data - expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) + expression = '|'.join(['^' + re.escape(piece) + for piece in entries if piece.strip()]) return re.compile(expression) else: - expression = '|'.join(['^' + piece for piece in entries if piece.strip()]) + expression = '|'.join(['^' + piece + for piece in entries if piece.strip()]) return re.compile(expression) @@ -322,12 +340,16 @@ def add_lookups(default_func, *lookups): *lookups (dict): Lookup dictionary mapping string to attribute value. RETURNS (callable): Lexical attribute getter. """ - def get_attr(string): - for lookup in lookups: - if string in lookup: - return lookup[string] - return default_func(string) - return get_attr + # This is implemented as functools.partial instead of a closure, to allow + # pickle to work. + return functools.partial(_get_attr_unless_lookup, default_func, lookups) + + +def _get_attr_unless_lookup(default_func, lookups, string): + for lookup in lookups: + if string in lookup: + return lookup[string] + return default_func(string) def update_exc(base_exceptions, *addition_dicts): @@ -340,16 +362,15 @@ def update_exc(base_exceptions, *addition_dicts): exc = dict(base_exceptions) for additions in addition_dicts: for orth, token_attrs in additions.items(): - if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs): - msg = "Invalid value for ORTH in exception: key='%s', orths='%s'" + if not all(isinstance(attr[ORTH], unicode_) + for attr in token_attrs): + msg = "Invalid ORTH value in exception: key='%s', orths='%s'" raise ValueError(msg % (orth, token_attrs)) described_orth = ''.join(attr[ORTH] for attr in token_attrs) if orth != described_orth: - raise ValueError("Invalid tokenizer exception: ORTH values " - "combined don't match original string. 
" - "key='%s', orths='%s'" % (orth, described_orth)) - # overlap = set(exc.keys()).intersection(set(additions)) - # assert not overlap, overlap + msg = ("Invalid tokenizer exception: ORTH values combined " + "don't match original string. key='%s', orths='%s'") + raise ValueError(msg % (orth, described_orth)) exc.update(additions) exc = expand_exc(exc, "'", "’") return exc @@ -382,17 +403,15 @@ def normalize_slice(length, start, stop, step=None): raise ValueError("Stepped slices not supported in Span objects." "Try: list(tokens)[start:stop:step] instead.") if start is None: - start = 0 + start = 0 elif start < 0: - start += length + start += length start = min(length, max(0, start)) - if stop is None: - stop = length + stop = length elif stop < 0: - stop += length + stop += length stop = min(length, max(start, stop)) - assert 0 <= start <= stop <= length return start, stop @@ -409,7 +428,7 @@ def compounding(start, stop, compound): >>> assert next(sizes) == 1.5 * 1.5 """ def clip(value): - return max(value, stop) if (start>stop) else min(value, stop) + return max(value, stop) if (start > stop) else min(value, stop) curr = float(start) while True: yield clip(curr) @@ -419,7 +438,7 @@ def compounding(start, stop, compound): def decaying(start, stop, decay): """Yield an infinite series of linearly decaying values.""" def clip(value): - return max(value, stop) if (start>stop) else min(value, stop) + return max(value, stop) if (start > stop) else min(value, stop) nr_upd = 1. while True: yield clip(start * 1./(1. + decay * nr_upd)) @@ -511,17 +530,19 @@ def print_markdown(data, title=None): if isinstance(data, dict): data = list(data.items()) - markdown = ["* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)] + markdown = ["* **{}:** {}".format(l, unicode_(v)) + for l, v in data if not excl_value(v)] if title: print("\n## {}".format(title)) print('\n{}\n'.format('\n'.join(markdown))) def prints(*texts, **kwargs): - """Print formatted message (manual ANSI escape sequences to avoid dependency) + """Print formatted message (manual ANSI escape sequences to avoid + dependency) *texts (unicode): Texts to print. Each argument is rendered as paragraph. - **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit. + **kwargs: 'title' becomes coloured headline. exits=True performs sys exit. """ exits = kwargs.get('exits', None) title = kwargs.get('title', None) @@ -551,9 +572,23 @@ def _wrap(text, wrap_max=80, indent=4): def minify_html(html): """Perform a template-specific, rudimentary HTML minification for displaCy. - Disclaimer: NOT a general-purpose solution, only removes indentation/newlines. + Disclaimer: NOT a general-purpose solution, only removes indentation and + newlines. html (unicode): Markup to minify. RETURNS (unicode): "Minified" HTML. 
""" return html.strip().replace(' ', '').replace('\n', '') + + +def use_gpu(gpu_id): + try: + import cupy.cuda.device + except ImportError: + return None + from thinc.neural.ops import CupyOps + device = cupy.cuda.device.Device(gpu_id) + device.use() + Model.ops = CupyOps() + Model.Ops = CupyOps + return device diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 0eec5a00a..552a6bcf3 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,42 +1,80 @@ +# coding: utf8 from __future__ import unicode_literals -from libc.stdint cimport int32_t, uint64_t + import numpy from collections import OrderedDict import msgpack import msgpack_numpy msgpack_numpy.patch() cimport numpy as np +from thinc.neural.util import get_array_module +from thinc.neural._classes.model import Model -from .typedefs cimport attr_t from .strings cimport StringStore +from .compat import basestring_, path2str from . import util -from .compat import basestring_ cdef class Vectors: - '''Store, save and load word vectors.''' + """Store, save and load word vectors. + + Vectors data is kept in the vectors.data attribute, which should be an + instance of numpy.ndarray (for CPU vectors) or cupy.ndarray + (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to + rows in the vectors.data table. + + Multiple keys can be mapped to the same vector, so len(keys) may be greater + (but not smaller) than data.shape[0]. + """ cdef public object data cdef readonly StringStore strings cdef public object key2row cdef public object keys - cdef public int i + cdef public int _i_key + cdef public int _i_vec - def __init__(self, strings, data_or_width): - self.strings = StringStore() - if isinstance(data_or_width, int): - self.data = data = numpy.zeros((len(strings), data_or_width), - dtype='f') + def __init__(self, strings, width=0, data=None): + """Create a new vector store. To keep the vector table empty, pass + `width=0`. You can also create the vector table and add vectors one by + one, or set the vector values directly on initialisation. + + strings (StringStore or list): List of strings or StringStore that maps + strings to hash values, and vice versa. + width (int): Number of dimensions. + data (numpy.ndarray): The vector data. + RETURNS (Vectors): The newly created object. + """ + if isinstance(strings, StringStore): + self.strings = strings else: - data = data_or_width - self.i = 0 - self.data = data + self.strings = StringStore() + for string in strings: + self.strings.add(string) + if data is not None: + self.data = numpy.asarray(data, dtype='f') + else: + self.data = numpy.zeros((len(self.strings), width), dtype='f') + self._i_key = 0 + self._i_vec = 0 self.key2row = {} - self.keys = np.ndarray((self.data.shape[0],), dtype='uint64') + self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64') + if data is not None: + for i, string in enumerate(self.strings): + if i >= self.data.shape[0]: + break + self.add(self.strings[string], vector=self.data[i]) def __reduce__(self): return (Vectors, (self.strings, self.data)) def __getitem__(self, key): + """Get a vector by key. If key is a string, it is hashed to an integer + ID using the vectors.strings table. If the integer key is not found in + the table, a KeyError is raised. + + key (unicode / int): The key to get the vector for. + RETURNS (numpy.ndarray): The vector for the key. 
+ """ if isinstance(key, basestring): key = self.strings[key] i = self.key2row[key] @@ -46,56 +84,101 @@ cdef class Vectors: return self.data[i] def __setitem__(self, key, vector): + """Set a vector for the given key. If key is a string, it is hashed + to an integer ID using the vectors.strings table. + + key (unicode / int): The key to set the vector for. + vector (numpy.ndarray): The vector to set. + """ if isinstance(key, basestring): key = self.strings.add(key) i = self.key2row[key] self.data[i] = vector def __iter__(self): + """Yield vectors from the table. + + YIELDS (numpy.ndarray): A vector. + """ yield from self.data def __len__(self): - return self.i + """Return the number of vectors that have been assigned. + + RETURNS (int): The number of vectors in the data. + """ + return self._i_vec def __contains__(self, key): + """Check whether a key has a vector entry in the table. + + key (unicode / int): The key to check. + RETURNS (bool): Whether the key has a vector entry. + """ if isinstance(key, basestring_): key = self.strings[key] return key in self.key2row - def add(self, key, vector=None): + def add(self, key, *, vector=None, row=None): + """Add a key to the table. Keys can be mapped to an existing vector + by setting `row`, or a new vector can be added. + + key (unicode / int): The key to add. + vector (numpy.ndarray / None): A vector to add for the key. + row (int / None): The row-number of a vector to map the key to. + """ if isinstance(key, basestring_): key = self.strings.add(key) + if row is None and key in self.key2row: + row = self.key2row[key] + elif row is None: + row = self._i_vec + self._i_vec += 1 + if row >= self.data.shape[0]: + self.data.resize((row*2, self.data.shape[1])) if key not in self.key2row: - i = self.i - if i >= self.keys.shape[0]: - self.keys.resize((self.keys.shape[0]*2,)) - self.data.resize((self.data.shape[0]*2, self.data.shape[1])) - self.key2row[key] = self.i - self.keys[self.i] = key - self.i += 1 - else: - i = self.key2row[key] + if self._i_key >= self.keys.shape[0]: + self.keys.resize((self._i_key*2,)) + self.keys[self._i_key] = key + self._i_key += 1 + + self.key2row[key] = row if vector is not None: - self.data[i] = vector - return i + self.data[row] = vector + return row def items(self): - for i, string in enumerate(self.strings): - yield string, self.data[i] + """Iterate over `(string key, vector)` pairs, in order. + + YIELDS (tuple): A key/vector pair. + """ + for i, key in enumerate(self.keys): + string = self.strings[key] + row = self.key2row[key] + yield string, self.data[row] @property def shape(self): + """Get `(rows, dims)` tuples of number of rows and number of dimensions + in the vector table. + + RETURNS (tuple): A `(rows, dims)` pair. + """ return self.data.shape def most_similar(self, key): + # TODO: implement raise NotImplementedError def from_glove(self, path): - '''Load GloVe vectors from a directory. Assumes binary format, + """Load GloVe vectors from a directory. Assumes binary format, that the vocab is in a vocab.txt, and that vectors are named vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32 vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc. - By default GloVe outputs 64-bit vectors.''' + By default GloVe outputs 64-bit vectors. + + path (unicode / Path): The path to load the GloVe vectors from. 
+ """ path = util.ensure_path(path) for name in path.iterdir(): if name.parts[-1].startswith('vectors'): @@ -118,23 +201,41 @@ cdef class Vectors: self.data def to_disk(self, path, **exclude): + """Save the current state to a directory. + + path (unicode / Path): A path to a directory, which will be created if + it doesn't exists. Either a string or a Path-like object. + """ + xp = get_array_module(self.data) + if xp is numpy: + save_array = lambda arr, file_: xp.save(file_, arr, + allow_pickle=False) + else: + save_array = lambda arr, file_: xp.save(file_, arr) serializers = OrderedDict(( - ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)), - ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)), + ('vectors', lambda p: save_array(self.data, p.open('wb'))), + ('keys', lambda p: xp.save(p.open('wb'), self.keys)) )) return util.to_disk(path, serializers, exclude) def from_disk(self, path, **exclude): + """Loads state from a directory. Modifies the object in place and + returns it. + + path (unicode / Path): Directory path, string or Path-like object. + RETURNS (Vectors): The modified object. + """ def load_keys(path): if path.exists(): - self.keys = numpy.load(path) + self.keys = numpy.load(path2str(path)) for i, key in enumerate(self.keys): self.keys[i] = key self.key2row[key] = i def load_vectors(path): + xp = Model.ops.xp if path.exists(): - self.data = numpy.load(path) + self.data = xp.load(path) serializers = OrderedDict(( ('keys', load_keys), @@ -144,6 +245,11 @@ cdef class Vectors: return self def to_bytes(self, **exclude): + """Serialize the current state to a binary string. + + **exclude: Named attributes to prevent from being serialized. + RETURNS (bytes): The serialized form of the `Vectors` object. + """ def serialize_weights(): if hasattr(self.data, 'to_bytes'): return self.data.to_bytes() @@ -156,6 +262,12 @@ cdef class Vectors: return util.to_bytes(serializers, exclude) def from_bytes(self, data, **exclude): + """Load state from a binary string. + + data (bytes): The data to load from. + **exclude: Named attributes to prevent from being loaded. + RETURNS (Vectors): The `Vectors` object. 
+ """ def deserialize_weights(b): if hasattr(self.data, 'from_bytes'): self.data.from_bytes() diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 8005cbf06..b12bccf38 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -32,6 +32,7 @@ cdef class Vocab: cdef readonly int length cdef public object data_dir cdef public object lex_attr_getters + cdef public object cfg cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index bf7fb6903..0e6b69ebd 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,32 +1,25 @@ # coding: utf8 from __future__ import unicode_literals -import bz2 -import ujson -import re import numpy +import dill -from libc.string cimport memset, memcpy -from libc.stdint cimport int32_t -from libc.math cimport sqrt -from cymem.cymem cimport Address from collections import OrderedDict +from thinc.neural.util import get_array_module from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .strings cimport hash_string from .typedefs cimport attr_t -from .cfile cimport CFile from .tokens.token cimport Token -from .attrs cimport PROB, LANG +from .attrs cimport PROB, LANG, ORTH, TAG from .structs cimport SerializedLexemeC -from .compat import copy_reg, pickle, basestring_ +from .compat import copy_reg, basestring_ from .lemmatizer import Lemmatizer from .attrs import intify_attrs from .vectors import Vectors +from ._ml import link_vectors_to_models from . import util -from . import attrs -from . import symbols cdef class Vocab: @@ -35,23 +28,23 @@ cdef class Vocab: C-data that is shared between `Doc` objects. """ def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, - strings=tuple(), **deprecated_kwargs): + strings=tuple(), oov_prob=-20., **deprecated_kwargs): """Create the vocabulary. - lex_attr_getters (dict): A dictionary mapping attribute IDs to functions - to compute them. Defaults to `None`. - tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained + lex_attr_getters (dict): A dictionary mapping attribute IDs to + functions to compute them. Defaults to `None`. + tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. lemmatizer (object): A lemmatizer. Defaults to `None`. strings (StringStore): StringStore that maps strings to integers, and vice versa. - RETURNS (Vocab): The newly constructed vocab object. + RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} tag_map = tag_map if tag_map is not None else {} if lemmatizer in (None, True, False): lemmatizer = Lemmatizer({}, {}, {}) - + self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() @@ -60,12 +53,9 @@ cdef class Vocab: if strings: for string in strings: _ = self[string] - for name in tag_map.keys(): - if name: - self.strings.add(name) self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings, tag_map, lemmatizer) - self.vectors = Vectors(self.strings, 300) + self.vectors = Vectors(self.strings, width=0) property lang: def __get__(self): @@ -86,19 +76,20 @@ cdef class Vocab: The flag_getter function will be called over the words currently in the vocab, and then applied to new words as they occur. You'll then be able - to access the flag value on each token, using token.check_flag(flag_id). 
+ to access the flag value on each token using token.check_flag(flag_id). See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`, `Token.check_flag`. - flag_getter (callable): A function `f(unicode) -> bool`, to get the flag - value. + flag_getter (callable): A function `f(unicode) -> bool`, to get the + flag value. flag_id (int): An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If -1, the lowest available bit will be chosen. RETURNS (int): The integer ID by which the flag value can be checked. EXAMPLE: - >>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy']) + >>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy'] + >>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter) >>> doc = nlp(u'I like spaCy') >>> assert doc[2].check_flag(MY_PRODUCT) == True """ @@ -109,9 +100,10 @@ cdef class Vocab: break else: raise ValueError( - "Cannot find empty bit for new lexical flag. All bits between " - "0 and 63 are occupied. You can replace one by specifying the " - "flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA") + "Cannot find empty bit for new lexical flag. All bits " + "between 0 and 63 are occupied. You can replace one by " + "specifying the flag_id explicitly, e.g. " + "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.") elif flag_id >= 64 or flag_id < 1: raise ValueError( "Invalid value for flag_id: %d. Flag IDs must be between " @@ -122,9 +114,9 @@ cdef class Vocab: return flag_id cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: - """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` - if necessary, using memory acquired from the given pool. If the pool - is the lexicon's own memory, the lexeme is saved in the lexicon. + """Get a pointer to a `LexemeC` from the lexicon, creating a new + `Lexeme` if necessary using memory acquired from the given pool. If the + pool is the lexicon's own memory, the lexeme is saved in the lexicon. """ if string == u'': return &EMPTY_LEXEME @@ -141,9 +133,9 @@ cdef class Vocab: return self._new_lexeme(mem, string) cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: - """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` - if necessary, using memory acquired from the given pool. If the pool - is the lexicon's own memory, the lexeme is saved in the lexicon. + """Get a pointer to a `LexemeC` from the lexicon, creating a new + `Lexeme` if necessary using memory acquired from the given pool. If the + pool is the lexicon's own memory, the lexeme is saved in the lexicon. """ if orth == 0: return &EMPTY_LEXEME @@ -205,8 +197,8 @@ cdef class Vocab: for orth, addr in self._by_orth.items(): yield Lexeme(self, orth) - def __getitem__(self, id_or_string): - """Retrieve a lexeme, given an int ID or a unicode string. If a + def __getitem__(self, id_or_string): + """Retrieve a lexeme, given an int ID or a unicode string. If a previously unseen unicode string is given, a new lexeme is created and stored. 
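
A short sketch of the lexeme lookup described in `Vocab.__getitem__` above (an illustrative sketch, not part of the patch; assumes a loaded `nlp` pipeline):

    apple = nlp.vocab[u'apple']     # look up (or create) a lexeme by string
    same = nlp.vocab[apple.orth]    # look up the same lexeme by integer ID
    assert apple.orth == same.orth
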
@@ -231,13 +223,14 @@ cdef class Vocab: cdef int i tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) for i, props in enumerate(substrings): - props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True) + props = intify_attrs(props, strings_map=self.strings, + _do_deprecated=True) token = &tokens[i] # Set the special tokens up to have arbitrary attributes - lex = self.get_by_orth(self.mem, props[attrs.ORTH]) + lex = self.get_by_orth(self.mem, props[ORTH]) token.lex = lex - if attrs.TAG in props: - self.morphology.assign_tag(token, props[attrs.TAG]) + if TAG in props: + self.morphology.assign_tag(token, props[TAG]) for attr_id, value in props.items(): Token.set_struct_attr(token, attr_id, value) Lexeme.set_struct_attr(lex, attr_id, value) @@ -247,25 +240,79 @@ cdef class Vocab: def vectors_length(self): return self.vectors.data.shape[1] - def clear_vectors(self, new_dim=None): + def clear_vectors(self, width=None): """Drop the current vector table. Because all vectors must be the same width, you have to call this to change the size of the vectors. """ - if new_dim is None: - new_dim = self.vectors.data.shape[1] - self.vectors = Vectors(self.strings, new_dim) + if width is None: + width = self.vectors.data.shape[1] + self.vectors = Vectors(self.strings, width=width) + + def prune_vectors(self, nr_row, batch_size=8): + """Reduce the current vector table to `nr_row` unique entries. Words + mapped to the discarded vectors will be remapped to the closest vector + among those remaining. + + For example, suppose the original table had vectors for the words: + ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to, + two rows, we would discard the vectors for 'feline' and 'reclined'. + These words would then be remapped to the closest remaining vector + -- so "feline" would have the same vector as "cat", and "reclined" + would have the same vector as "sat". + + The similarities are judged by cosine. The original vectors may + be large, so the cosines are calculated in minibatches, to reduce + memory usage. + + nr_row (int): The number of rows to keep in the vector table. + batch_size (int): Batch of vectors for calculating the similarities. + Larger batch sizes might be faster, while temporarily requiring + more memory. + RETURNS (dict): A dictionary keyed by removed words mapped to + `(string, score)` tuples, where `string` is the entry the removed + word was mapped to, and `score` the similarity score between the + two words. + """ + xp = get_array_module(self.vectors.data) + # Work in batches, to avoid memory problems. + keep = self.vectors.data[:nr_row] + keep_keys = [key for key, row in self.vectors.key2row.items() if row < nr_row] + toss = self.vectors.data[nr_row:] + # Normalize the vectors, so cosine similarity is just dot product. + # Note we can't modify the ones we're keeping in-place... + keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True)+1e-8) + keep = xp.ascontiguousarray(keep.T) + neighbours = xp.zeros((toss.shape[0],), dtype='i') + scores = xp.zeros((toss.shape[0],), dtype='f') + for i in range(0, toss.shape[0], batch_size): + batch = toss[i : i+batch_size] + batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8 + sims = xp.dot(batch, keep) + matches = sims.argmax(axis=1) + neighbours[i:i+batch_size] = matches + scores[i:i+batch_size] = sims.max(axis=1) + for lex in self: + # If we're losing the vector for this word, map it to the nearest + # vector we're keeping. 
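+            # (lex.rank is the word's row in the vectors table, so any rank
+            # >= nr_row refers to a vector that is about to be discarded.)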
+ if lex.rank >= nr_row: + lex.rank = neighbours[lex.rank-nr_row] + self.vectors.add(lex.orth, row=lex.rank) + for key in self.vectors.keys: + row = self.vectors.key2row[key] + if row >= nr_row: + self.vectors.key2row[key] = neighbours[row-nr_row] + # Make copy, to encourage the original table to be garbage collected. + self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row]) + # TODO: return new mapping def get_vector(self, orth): - """Retrieve a vector for a word in the vocabulary. + """Retrieve a vector for a word in the vocabulary. Words can be looked + up by string or int ID. If no vectors data is loaded, ValueError is + raised. - Words can be looked up by string or int ID. - - RETURNS: - A word vector. Size and shape determed by the - vocab.vectors instance. Usually, a numpy ndarray - of shape (300,) and dtype float32. - - RAISES: If no vectors data is loaded, ValueError is raised. + RETURNS (numpy.ndarray): A word vector. Size + and shape determined by the `vocab.vectors` instance. Usually, a + numpy ndarray of shape (300,) and dtype float32. """ if isinstance(orth, basestring_): orth = self.strings.add(orth) @@ -275,21 +322,16 @@ cdef class Vocab: return numpy.zeros((self.vectors_length,), dtype='f') def set_vector(self, orth, vector): - """Set a vector for a word in the vocabulary. - - Words can be referenced by string or int ID. - - RETURNS: - None + """Set a vector for a word in the vocabulary. Words can be referenced + by string or int ID. """ if not isinstance(orth, basestring_): orth = self.strings[orth] self.vectors.add(orth, vector=vector) def has_vector(self, orth): - """Check whether a word has a vector. Returns False if no - vectors have been loaded. Words can be looked up by string - or int ID.""" + """Check whether a word has a vector. Returns False if no vectors have + been loaded. Words can be looked up by string or int ID.""" if isinstance(orth, basestring_): orth = self.strings.add(orth) return orth in self.vectors @@ -298,7 +340,7 @@ cdef class Vocab: """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be either strings or Path-like objects. 
""" path = util.ensure_path(path) if not path.exists(): @@ -323,6 +365,7 @@ cdef class Vocab: self.lexemes_from_bytes(file_.read()) if self.vectors is not None: self.vectors.from_disk(path, exclude='strings.json') + link_vectors_to_models(self) return self def to_bytes(self, **exclude): @@ -335,8 +378,8 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.to_bytes(exclude='strings.json') - + return self.vectors.to_bytes() + getters = OrderedDict(( ('strings', lambda: self.strings.to_bytes()), ('lexemes', lambda: self.lexemes_to_bytes()), @@ -355,7 +398,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.from_bytes(b, exclude='strings') + return self.vectors.from_bytes(b) setters = OrderedDict(( ('strings', lambda b: self.strings.from_bytes(b)), ('lexemes', lambda b: self.lexemes_from_bytes(b)), @@ -397,6 +440,7 @@ cdef class Vocab: cdef int j = 0 cdef SerializedLexemeC lex_data chunk_size = sizeof(lex_data.data) + cdef void* ptr cdef unsigned char* bytes_ptr = bytes_data for i in range(0, len(bytes_data), chunk_size): lexeme = self.mem.alloc(1, sizeof(LexemeC)) @@ -404,6 +448,9 @@ cdef class Vocab: lex_data.data[j] = bytes_ptr[i+j] Lexeme.c_from_bytes(lexeme, lex_data) + ptr = self.strings._map.get(lexeme.orth) + if ptr == NULL: + continue py_str = self.strings[lexeme.orth] assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth) key = hash_string(py_str) @@ -417,25 +464,23 @@ def pickle_vocab(vocab): morph = vocab.morphology length = vocab.length data_dir = vocab.data_dir - lex_attr_getters = vocab.lex_attr_getters - + lex_attr_getters = dill.dumps(vocab.lex_attr_getters) lexemes_data = vocab.lexemes_to_bytes() - return (unpickle_vocab, - (sstore, morph, data_dir, lex_attr_getters, - lexemes_data, length)) + (sstore, morph, data_dir, lex_attr_getters, lexemes_data, length)) def unpickle_vocab(sstore, morphology, data_dir, - lex_attr_getters, bytes lexemes_data, int length): + lex_attr_getters, bytes lexemes_data, int length): cdef Vocab vocab = Vocab() vocab.length = length vocab.strings = sstore vocab.morphology = morphology vocab.data_dir = data_dir - vocab.lex_attr_getters = lex_attr_getters + vocab.lex_attr_getters = dill.loads(lex_attr_getters) vocab.lexemes_from_bytes(lexemes_data) vocab.length = length + link_vectors_to_models(vocab) return vocab @@ -446,12 +491,10 @@ class LookupError(Exception): @classmethod def mismatched_strings(cls, id_, id_string, original_string): return cls( - "Error fetching a Lexeme from the Vocab. When looking up a string, " - "the lexeme returned had an orth ID that did not match the query string. " - "This means that the cached lexeme structs are mismatched to the " - "string encoding table. The mismatched:\n" - "Query string: {query}\n" - "Orth cached: {orth_str}\n" - "ID of orth: {orth_id}".format( - query=repr(original_string), orth_str=repr(id_string), orth_id=id_) - ) + "Error fetching a Lexeme from the Vocab. When looking up a " + "string, the lexeme returned had an orth ID that did not match " + "the query string. This means that the cached lexeme structs are " + "mismatched to the string encoding table. 
The mismatched:\n" + "Query string: {}\n" + "Orth cached: {}\n" + "Orth ID: {}".format(repr(original_string), repr(id_string), id_)) diff --git a/travis.sh b/travis.sh index 4b7d8017c..eed6a96f2 100755 --- a/travis.sh +++ b/travis.sh @@ -17,6 +17,7 @@ fi if [ "${VIA}" == "compile" ]; then pip install -r requirements.txt + python setup.py build_ext --inplace pip install -e . fi diff --git a/website/404.jade b/website/404.jade index 33b936a08..af4e7d0f2 100644 --- a/website/404.jade +++ b/website/404.jade @@ -8,4 +8,5 @@ include _includes/_mixins | does not exist! h2.c-landing__title.u-heading-3.u-padding-small - a(href="javascript:history.go(-1)") Click here to go back. + +button(false, true, "secondary-light")(href="javascript:history.go(-1)") + | Click here to go back diff --git a/website/_data.json b/website/_data.json index 525c70d80..53543b2d0 100644 --- a/website/_data.json +++ b/website/_data.json @@ -3,24 +3,22 @@ "landing": true, "logos": [ { - "quora": [ "https://www.quora.com", 150 ], - "chartbeat": [ "https://chartbeat.com", 200 ], - "duedil": [ "https://www.duedil.com", 150 ], - "stitchfix": [ "https://www.stitchfix.com", 190 ] + "airbnb": [ "https://www.airbnb.com", 150, 45], + "quora": [ "https://www.quora.com", 120, 34 ], + "retriever": [ "https://www.retriever.no", 150, 33 ], + "stitchfix": [ "https://www.stitchfix.com", 150, 18 ] }, { - "wayblazer": [ "http://wayblazer.com", 200 ], - "indico": [ "https://indico.io", 150 ], - "chattermill": [ "https://chattermill.io", 175 ], - "turi": [ "https://turi.com", 150 ], - "kip": [ "http://kipthis.com", 70 ] - }, + "chartbeat": [ "https://chartbeat.com", 180, 25 ], + "allenai": [ "https://allenai.org", 220, 37 ] + } + ], + "features": [ { - "socrata": [ "https://www.socrata.com", 150 ], - "cytora": [ "http://www.cytora.com", 125 ], - "signaln": [ "http://signaln.com", 150 ], - "wonderflow": [ "http://www.wonderflow.co", 200 ], - "synapsify": [ "http://www.gosynapsify.com", 150 ] + "thoughtworks": ["https://www.thoughtworks.com/radar/tools", 150, 28], + "wapo": ["https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/", 100, 77], + "venturebeat": ["https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/", 150, 19], + "microsoft": ["https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/", 130, 28] } ] }, @@ -34,7 +32,24 @@ "landing": true }, - "announcement" : { - "title": "Important Announcement" + "styleguide": { + "title": "Styleguide", + "sidebar": { + "Styleguide": { "": "styleguide" }, + "Resources": { + "Website Source": "https://github.com/explosion/spacy/tree/master/website", + "Contributing Guide": "https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md" + } + }, + "menu": { + "Introduction": "intro", + "Logo": "logo", + "Colors": "colors", + "Typography": "typography", + "Elements": "elements", + "Components": "components", + "Embeds": "embeds", + "Markup Reference": "markup" + } } } diff --git a/website/_harp.json b/website/_harp.json index 1c27426f4..bc1a0b5e5 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -11,12 +11,9 @@ "COMPANY": "Explosion AI", "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", + "MODELS_REPO": "explosion/spacy-models", - "SPACY_VERSION": "1.8", - "LATEST_NEWS": { - "url": "https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha", - "title": "Test spaCy v2.0.0 alpha!" 
- }, + "SPACY_VERSION": "2.0", "SOCIAL": { "twitter": "spacy_io", @@ -27,25 +24,23 @@ }, "NAVIGATION": { - "Home": "/", - "Usage": "/docs/usage", - "Reference": "/docs/api", - "Demos": "/docs/usage/showcase", - "Blog": "https://explosion.ai/blog" + "Usage": "/usage", + "Models": "/models", + "API": "/api" }, "FOOTER": { "spaCy": { - "Usage": "/docs/usage", - "API Reference": "/docs/api", - "Tutorials": "/docs/usage/tutorials", - "Showcase": "/docs/usage/showcase" + "Usage": "/usage", + "Models": "/models", + "API Reference": "/api", + "Resources": "/usage/resources" }, "Support": { "Issue Tracker": "https://github.com/explosion/spaCy/issues", "StackOverflow": "http://stackoverflow.com/questions/tagged/spacy", - "Reddit usergroup": "https://www.reddit.com/r/spacynlp/", - "Gitter chat": "https://gitter.im/explosion/spaCy" + "Reddit Usergroup": "https://www.reddit.com/r/spacynlp/", + "Gitter Chat": "https://gitter.im/explosion/spaCy" }, "Connect": { "Twitter": "https://twitter.com/spacy_io", @@ -74,21 +69,11 @@ {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }, {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}] }, - { "id": "model", "title": "Models", "multiple": true, "options": [ - { "id": "en", "title": "English", "meta": "50MB" }, - { "id": "de", "title": "German", "meta": "645MB" }, - { "id": "fr", "title": "French", "meta": "1.33GB" }, - { "id": "es", "title": "Spanish", "meta": "377MB"}] - } + { "id": "model", "title": "Models", "multiple": true } ], "QUICKSTART_MODELS": [ - { "id": "lang", "title": "Language", "options": [ - { "id": "en", "title": "English", "checked": true }, - { "id": "de", "title": "German" }, - { "id": "fr", "title": "French" }, - { "id": "es", "title": "Spanish" }] - }, + { "id": "lang", "title": "Language"}, { "id": "load", "title": "Loading style", "options": [ { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." }, { "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }] @@ -98,50 +83,15 @@ } ], - "MODELS": { - "en": [ - { "id": "en_core_web_sm", "lang": "English", "feats": [1, 1, 1, 1], "size": "50 MB", "license": "CC BY-SA", "def": true }, - { "id": "en_core_web_md", "lang": "English", "feats": [1, 1, 1, 1], "size": "1 GB", "license": "CC BY-SA" }, - { "id": "en_depent_web_md", "lang": "English", "feats": [1, 1, 1, 0], "size": "328 MB", "license": "CC BY-SA" }, - { "id": "en_vectors_glove_md", "lang": "English", "feats": [1, 0, 0, 1], "size": "727 MB", "license": "CC BY-SA" } - ], - "de": [ - { "id": "de_core_news_md", "lang": "German", "feats": [1, 1, 1, 1], "size": "645 MB", "license": "CC BY-SA" } - ], - "fr": [ - { "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" } - ], - "es": [ - { "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"} - ] - }, - - "EXAMPLE_SENTENCES": { - "en": "This is a sentence.", - "de": "Dies ist ein Satz.", - "fr": "C'est une phrase.", - "es": "Esto es una frase." 
- }, - "ALPHA": true, - "V_CSS": "1.6", - "V_JS": "1.2", + "V_CSS": "2.0a2", + "V_JS": "2.0a1", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", "MAILCHIMP": { "user": "spacy.us12", "id": "83b0498b1e7fa3c91ce68c3f1", "list": "89ad33e698" - }, - "BADGES": { - "pipy": { - "badge": "https://img.shields.io/pypi/v/spacy.svg?style=flat-square", - "link": "https://pypi.python.org/pypi/spacy" - }, - "conda": { - "badge": "https://anaconda.org/conda-forge/spacy/badges/version.svg", - "link": "https://anaconda.org/conda-forge/spacy" - } } } } diff --git a/website/_includes/_footer.jade b/website/_includes/_footer.jade index e933f37a8..4d0d34cb5 100644 --- a/website/_includes/_footer.jade +++ b/website/_includes/_footer.jade @@ -1,8 +1,6 @@ //- 💫 INCLUDES > FOOTER -include _mixins - -footer.o-footer.u-text.u-border-dotted +footer.o-footer.u-text +grid.o-content each group, label in FOOTER +grid-col("quarter") @@ -13,18 +11,18 @@ footer.o-footer.u-text.u-border-dotted li +a(url)=item - if SECTION != "docs" + if SECTION == "index" +grid-col("quarter") include _newsletter - if SECTION == "docs" + if SECTION != "index" .o-content.o-block.u-border-dotted include _newsletter .o-inline-list.u-text-center.u-text-tiny.u-color-subtle span © 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY] - +a(COMPANY_URL, true) - +svg("graphics", "explosion", 45).o-icon.u-color-theme.u-grayscale + +a(COMPANY_URL, true)(aria-label="Explosion AI") + +icon("explosion", 45).o-icon.u-color-theme.u-grayscale +a(COMPANY_URL + "/legal", true) Legal / Imprint diff --git a/website/_includes/_functions.jade b/website/_includes/_functions.jade index e88e678cb..39139cc58 100644 --- a/website/_includes/_functions.jade +++ b/website/_includes/_functions.jade @@ -1,35 +1,68 @@ //- 💫 INCLUDES > FUNCTIONS -//- More descriptive variables for current.path and current.source +//- Descriptive variables, available in the global scope - CURRENT = current.source - SECTION = current.path[0] -- SUBSECTION = current.path[1] +- LANGUAGES = public.models._data.LANGUAGES +- MODELS = public.models._data.MODELS +- CURRENT_MODELS = MODELS[current.source] || [] + +- MODEL_COUNT = Object.keys(MODELS).map(m => Object.keys(MODELS[m]).length).reduce((a, b) => a + b) +- MODEL_LANG_COUNT = Object.keys(MODELS).length +- LANG_COUNT = Object.keys(LANGUAGES).length + +- MODEL_META = public.models._data.MODEL_META +- MODEL_LICENSES = public.models._data.MODEL_LICENSES +- MODEL_BENCHMARKS = public.models._data.MODEL_BENCHMARKS +- EXAMPLE_SENTENCES = public.models._data.EXAMPLE_SENTENCES + +- IS_PAGE = (SECTION != "index") && !landing +- IS_MODELS = (SECTION == "models" && LANGUAGES[current.source]) +- HAS_MODELS = IS_MODELS && CURRENT_MODELS.length //- Add prefixes to items of an array (for modifier CSS classes) + array - [array] list of class names or options, e.g. ["foot"] + prefix - [string] prefix to add to each class, e.g. "c-table__row" + RETURNS - [array] list of modified class names - function prefixArgs(array, prefix) { -- return array.map(function(arg) { -- return prefix + '--' + arg; -- }).join(' '); +- return array.map(arg => prefix + '--' + arg).join(' '); +- } + + +//- Convert API paths (semi-temporary fix for renamed sections) + path - [string] link path supplied to +api mixin + RETURNS - [string] new link path to correct location + +- function convertAPIPath(path) { +- if (path.startsWith('spacy#') || path.startsWith('displacy#') || path.startsWith('util#')) { +- var comps = path.split('#'); +- return "top-level#" + comps[0] + '.' 
+ comps[1]; +- } +- return path; +- } + + +//- Get model components from ID. Components can then be looked up in LANGUAGES + and MODEL_META respectively, to get their human-readable form. + id - [string] model ID, e.g. "en_core_web_sm" + RETURNS - [object] object keyed by components lang, type, genre and size + +- function getModelComponents(id) { +- var comps = id.split('_'); +- return {'lang': comps[0], 'type': comps[1], 'genre': comps[2], 'size': comps[3]} - } //- Generate GitHub links + repo - [string] name of repo owned by explosion + filepath - [string] logical path to file relative to repository root + branch - [string] optional branch, defaults to "master" + RETURNS - [string] the correct link to the file on GitHub - function gh(repo, filepath, branch) { - var branch = ALPHA ? 'develop' : branch -- return 'https://github.com/' + SOCIAL.github + '/' + repo + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' ); -- } - - -//- Get social images - -- function getSocialImg() { -- var base = SITE_URL + '/assets/img/social/preview_' -- var image = ALPHA ? 'alpha' : 'default' -- if (preview) image = preview -- else if (SECTION == 'docs' && !ALPHA) image = 'docs' -- return base + image + '.jpg' +- return 'https://github.com/' + SOCIAL.github + '/' + (repo || '') + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' ); - } diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade deleted file mode 100644 index 7534a6f4e..000000000 --- a/website/_includes/_mixins-base.jade +++ /dev/null @@ -1,203 +0,0 @@ -//- 💫 MIXINS > BASE - -//- Aside wrapper - label - [string] aside label - -mixin aside-wrapper(label) - aside.c-aside - .c-aside__content(role="complementary")&attributes(attributes) - if label - h4.u-text-label.u-text-label--dark=label - - block - -//- Date - input - [string] date in the format YYYY-MM-DD - -mixin date(input) - - var date = new Date(input) - - var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ] - - time(datetime=JSON.parse(JSON.stringify(date)))&attributes(attributes)=months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear() - - -//- SVG from map - file - [string] SVG file name in /assets/img/ - name - [string] SVG symbol id - width - [integer] width in px - height - [integer] height in px (default: same as width) - -mixin svg(file, name, width, height) - svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) - use(xlink:href="/assets/img/#{file}.svg##{name}") - - -//- Icon - name - [string] icon name, should be SVG symbol ID - size - [integer] icon width and height (default: 20) - -mixin icon(name, size) - - var size = size || 20 - +svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes) - - -//- Pro/Con/Neutral icon - icon - [string] "pro", "con" or "neutral" (default: "neutral") - size - [integer] icon size (optional) - -mixin procon(icon, size) - - colors = { pro: "green", con: "red", neutral: "yellow" } - +icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) - - -//- Headlines Helper Mixin - level - [integer] 1, 2, 3, 4, or 5 - -mixin headline(level) - if level == 1 - h1.u-heading-1&attributes(attributes) - block - - else if level == 2 - h2.u-heading-2&attributes(attributes) - block - - else if level == 3 - h3.u-heading-3&attributes(attributes) - block - - else if 
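The `getModelComponents()` helper above encodes the model naming convention used throughout these pages: IDs follow `{lang}_{type}_{genre}_{size}`, e.g. `en_core_web_sm`. For reference, the same split written as a Python sketch (the function name here is hypothetical):

def get_model_components(model_id):
    """Split a model ID such as 'en_core_web_sm' into its naming components."""
    lang, model_type, genre, size = model_id.split('_')
    return {'lang': lang, 'type': model_type, 'genre': genre, 'size': size}

assert get_model_components('en_core_web_sm') == {
    'lang': 'en', 'type': 'core', 'genre': 'web', 'size': 'sm'}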
level == 4 - h4.u-heading-4&attributes(attributes) - block - - else if level == 5 - h5.u-heading-5&attributes(attributes) - block - - -//- Permalink rendering - id - [string] permalink ID used for link anchor - -mixin permalink(id) - if id - a.u-permalink(id=id href="##{id}") - +icon("anchor").u-permalink__icon - block - - else - block - - -//- Quickstart widget - quickstart.js with manual markup, inspired by PyTorch's "Getting started" - groups - [object] option groups, uses global variable QUICKSTART - headline - [string] optional text to be rendered as widget headline - -mixin quickstart(groups, headline, description, hide_results) - .c-quickstart.o-block-small#qs - .c-quickstart__content - if headline - +h(2)=headline - if description - p=description - for group in groups - .c-quickstart__group.u-text-small(data-qs-group=group.id) - if group.title - .c-quickstart__legend=group.title - if group.help - | #[+help(group.help)] - .c-quickstart__fields - for option in group.options - input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) - label.c-quickstart__label(for="qs-#{option.id}")!=option.title - if option.meta - | #[span.c-quickstart__label__meta (#{option.meta})] - if option.help - | #[+help(option.help)] - - if hide_results - block - else - pre.c-code-block - code.c-code-block__content.c-quickstart__code(data-qs-results="") - block - - .c-quickstart__info.u-text-tiny.o-block.u-text-right - | Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]! - - -//- Quickstart code item - data [object] - Rendering conditions (keyed by option group ID, value: option) - -mixin qs(data, style) - - args = {} - for value, setting in data - - args['data-qs-' + setting] = value - span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args) - block - - -//- Terminal-style code window - label - [string] title displayed in top bar of terminal window - -mixin terminal(label) - .x-terminal - .x-terminal__icons: span - .u-padding-small.u-text-label.u-text-center=label - - +code.x-terminal__code - block - - -//- Gitter chat button and widget - button - [string] text shown on button - label - [string] title of chat window (default: same as button) - -mixin gitter(button, label) - aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) - - button.js-gitter-button.c-chat__button.u-text-small - +icon("chat").o-icon--inline - !=button - - -//- Badge - name - [string] "pipy" or "conda" - -mixin badge(name) - - site = BADGES[name] - - if site - +a(site.link).u-padding-small - img(src=site.badge alt="{name} version" height="20") - - -//- Logo - -mixin logo() - +svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes) - - -//- Landing - -mixin landing-header() - header.c-landing - .c-landing__wrapper - .c-landing__content - block - - -mixin landing-badge(url, graphic, alt, size) - +a(url)(aria-label=alt title=alt).c-landing__badge - +svg("graphics", graphic, size || 225) - - -//- Under construction (temporary) - Marks sections that still need to be completed for the v2.0 release. - -mixin under-construction() - +infobox("🚧 Under construction") - | This section is still being written and will be updated for the v2.0 - | release. Is there anything that you think should definitely mentioned or - | explained here? 
Any examples you'd like to see? #[strong Let us know] - | on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub! diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index b140151b2..615160023 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -1,18 +1,66 @@ //- 💫 INCLUDES > MIXINS include _functions -include _mixins-base + + +//- Section + id - [string] anchor assigned to section (used for breadcrumb navigation) + +mixin section(id) + section.o-section(id="section-" + id data-section=id) + block + + +//- Headlines Helper Mixin + level - [integer] 1, 2, 3, 4, or 5 + +mixin headline(level) + if level == 1 + h1.u-heading-1&attributes(attributes) + block + + else if level == 2 + h2.u-heading-2&attributes(attributes) + block + + else if level == 3 + h3.u-heading-3&attributes(attributes) + block + + else if level == 4 + h4.u-heading-4&attributes(attributes) + block + + else if level == 5 + h5.u-heading-5&attributes(attributes) + block //- Headlines level - [integer] headline level, corresponds to h1, h2, h3 etc. id - [string] unique identifier, creates permalink (optional) -mixin h(level, id) - +headline(level).u-heading&attributes(attributes) +mixin h(level, id, source) + +headline(level).u-heading(id=id)&attributes(attributes) +permalink(id) block + if source + +button(gh("spacy", source), false, "secondary", "small").u-nowrap.u-float-right + span Source #[+icon("code", 14).o-icon--inline] + + +//- Permalink rendering + id - [string] permalink ID used for link anchor + +mixin permalink(id) + if id + a.u-permalink(href="##{id}") + block + + else + block + //- External links url - [string] link href @@ -38,21 +86,37 @@ mixin src(url) //- API link (with added tag and automatically generated path) - path - [string] path to API docs page relative to /docs/api/ + path - [string] path to API docs page relative to /api/ mixin api(path) - +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap + - path = convertAPIPath(path) + +a("/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap block - | #[+icon("book", 18).o-icon--inline.u-color-theme] + | #[+icon("book", 16).o-icon--inline.u-color-theme] //- Help icon with tooltip - tooltip - [string] Tooltip text + tooltip - [string] Tooltip text + icon_size - [integer] Optional size of help icon in px. -mixin help(tooltip) +mixin help(tooltip, icon_size) span(data-tooltip=tooltip)&attributes(attributes) - +icon("help", 16).i-icon--inline + if tooltip + span.u-hidden(aria-role="tooltip")=tooltip + +icon("help_o", icon_size || 16).o-icon--inline + + +//- Aside wrapper + label - [string] aside label + +mixin aside-wrapper(label) + aside.c-aside + .c-aside__content(role="complementary")&attributes(attributes) + if label + h4.u-text-label.u-text-label--dark=label + + block //- Aside for text @@ -68,24 +132,74 @@ mixin aside(label) label - [string] aside title (optional or false for no label) language - [string] language for syntax highlighting (default: "python") supports basic relevant languages available for PrismJS + prompt - [string] prompt displayed before first line, e.g. 
"$" -mixin aside-code(label, language) +mixin aside-code(label, language, prompt) +aside-wrapper(label) - +code(false, language).o-no-block + +code(false, language, prompt).o-no-block block //- Infobox label - [string] infobox title (optional or false for no title) + emoji - [string] optional emoji displayed before the title, necessary as + argument to be able to wrap it for spacing -mixin infobox(label) +mixin infobox(label, emoji) aside.o-box.o-block.u-text-small if label - h3.u-text-label.u-color-theme=label + h3.u-heading.u-text-label.u-color-theme + if emoji + span.o-emoji=emoji + | #{label} block +//- Logos displayed in the top corner of some infoboxes + logos - [array] List of icon ID, width, height and link. + +mixin infobox-logos(...logos) + .o-box__logos.u-text-right.u-float-right + for logo in logos + if logo[3] + | #[+a(logo[3]).u-inline-block.u-hide-link.u-padding-small #[+icon(logo[0], logo[1], logo[2]).u-color-dark]] + else + | #[+icon(logo[0], logo[1], logo[2]).u-color-dark] + + +//- SVG from map (uses embedded SVG sprite) + name - [string] SVG symbol id + width - [integer] width in px + height - [integer] height in px (default: same as width) + +mixin svg(name, width, height) + svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) + use(xlink:href="#svg_#{name}") + + +//- Icon + name - [string] icon name (will be used as symbol id: #svg_{name}) + width - [integer] icon width (default: 20) + height - [integer] icon height (defaults to width) + +mixin icon(name, width, height) + - var width = width || 20 + - var height = height || width + +svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes) + + +//- Pro/Con/Neutral icon + icon - [string] "pro", "con" or "neutral" (default: "neutral") + size - [integer] icon size (optional) + +mixin procon(icon, label, show_label, size) + - var colors = { yes: "green", no: "red", neutral: "subtle" } + span.u-nowrap + +icon(icon, size || 20)(class="u-color-#{colors[icon] || 'subtle'}").o-icon--inline&attributes(attributes) + span.u-text-small(class=show_label ? null : "u-hidden")=(label || icon) + + //- Link button url - [string] link href trusted - [boolean] if not set / false, rel="noopener nofollow" is added @@ -94,7 +208,7 @@ mixin infobox(label) see assets/css/_components/_buttons.sass mixin button(url, trusted, ...style) - - external = url.includes("http") + - external = url && url.includes("http") a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes) block @@ -103,31 +217,42 @@ mixin button(url, trusted, ...style) label - [string] aside title (optional or false for no label) language - [string] language for syntax highlighting (default: "python") supports basic relevant languages available for PrismJS - prompt - [string] prompt or icon to display next to code block, (mostly used for old/new) + prompt - [string] prompt displayed before first line, e.g. "$" height - [integer] optional height to clip code block to + icon - [string] icon displayed next to code block (e.g. "accept" for new code) + wrap - [boolean] wrap text and disable horizontal scrolling -mixin code(label, language, prompt, height) +mixin code(label, language, prompt, height, icon, wrap) pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? 
"height: #{height}px" : null)&attributes(attributes) if label h4.u-text-label.u-text-label--dark=label - - var icon = (prompt == 'accept' || prompt == 'reject') + - var icon = icon || (prompt == 'accept' || prompt == 'reject') if icon - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) +icon(icon, 18) - code.c-code-block__content(data-prompt=icon ? null : prompt) + code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt) block -//- Code blocks to display old/new versions +//- Wrapper for code blocks to display old/new versions -mixin code-old() - +code(false, false, "reject").o-block-small +mixin code-wrapper() + span.u-inline-block.u-padding-top.u-width-full block -mixin code-new() - +code(false, false, "accept").o-block-small +//- Code blocks to display old/new versions + label - [string] ARIA label for block. Defaults to "correct"/"incorrect". + +mixin code-old(label) + - var label = label || 'incorrect' + +code(false, false, false, false, "reject").o-block-small(aria-label=label) + block + +mixin code-new(label) + - var label = label || 'correct' + +code(false, false, false, false, "accept").o-block-small(aria-label=label) block @@ -138,12 +263,38 @@ mixin code-new() mixin codepen(slug, height, default_tab) figure.o-block(style="min-height: #{height}px")&attributes(attributes) - .codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) + .codepen(data-height=height data-theme-id="31335" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) +a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen script(async src="https://assets.codepen.io/assets/embed/ei.js") +//- GitHub embed + repo - [string] repository owned by explosion organization + file - [string] logical path to file, relative to repository root + alt_file - [string] alternative file path used in footer and link button + height - [integer] height of code preview in px + +mixin github(repo, file, height, alt_file, language) + - var branch = ALPHA ? "develop" : "master" + - var height = height || 250 + + figure.o-block + pre.c-code-block.o-block-small(class="lang-#{(language || DEFAULT_SYNTAX)}" style="height: #{height}px; min-height: #{height}px") + code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}"). + Can't fetch code example from GitHub :( + + Please use the link below to view the example. If you've come across + a broken link, we always appreciate a pull request to the repository, + or a report on the issue tracker. Thanks! 
+ + footer.o-grid.u-text + .o-block-small.u-flex-full.u-padding-small #[+icon("github")] #[code.u-break.u-break--all=repo + '/' + (alt_file || file)] + div + +button(gh(repo, alt_file || file), false, "primary", "small") View on GitHub + + + //- Images / figures url - [string] url or path to image width - [integer] image width in px, for better rendering (default: 500) @@ -168,18 +319,48 @@ mixin image-caption() block -//- Label +//- Graphic or illustration with button + original - [string] Path to original image + +mixin graphic(original) + +image + block + if original + .u-text-right + +button(original, false, "secondary", "small") View large graphic + + +//- Chart.js + id - [string] chart ID, will be assigned as #chart_{id} + +mixin chart(id, height) + figure.o-block&attributes(attributes) + canvas(id="chart_#{id}" width="800" height=(height || "400") style="max-width: 100%") + + +//- Labels mixin label() - .u-text-label.u-color-subtle&attributes(attributes) + .u-text-label.u-color-dark&attributes(attributes) + block + + +mixin label-inline() + strong.u-text-label.u-color-dark&attributes(attributes) block //- Tag + tooltip - [string] optional tooltip text. + hide_icon - [boolean] hide tooltip icon -mixin tag() - span.u-text-tag.u-text-tag--spaced(aria-hidden="true")&attributes(attributes) +mixin tag(tooltip, hide_icon) + div.u-text-tag.u-text-tag--spaced(data-tooltip=tooltip)&attributes(attributes) block + if tooltip + if !hide_icon + | #[+icon("help", 12).o-icon--tag] + | #[span.u-hidden(aria-role="tooltip")=tooltip] //- "Requires model" tag with tooltip and list of capabilities @@ -188,8 +369,7 @@ mixin tag() mixin tag-model(...capabs) - var intro = "To use this functionality, spaCy needs a model to be installed" - var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : "" - +tag Requires model - +help(intro + ext + ".").u-color-theme + +tag(intro + ext + ".") Needs model //- "New" tag to label features new in a specific version @@ -199,8 +379,8 @@ mixin tag-model(...capabs) mixin tag-new(version) - var version = (typeof version == 'number') ? 
version.toFixed(1) : version - +tag(data-tooltip="This feature is new and was introduced in spaCy v#{version}.") - | v#{version} + - var tooltip = "This feature is new and was introduced in spaCy v" + version + +tag(tooltip, true) v#{version} //- List @@ -219,15 +399,9 @@ mixin list(type, start) //- List item (only used within +list) -mixin item(procon) - if procon - li&attributes(attributes) - +procon(procon).c-list__icon - block - - else - li.c-list__item&attributes(attributes) - block +mixin item() + li.c-list__item&attributes(attributes) + block //- Table @@ -237,9 +411,9 @@ mixin table(head) table.c-table.o-block&attributes(attributes) if head - +row + +row("head") each column in head - th.c-table__head-cell.u-text-label=column + +head-cell=column block @@ -251,10 +425,11 @@ mixin row(...style) block -//- Footer table row (only ued within +table) -mixin footrow() - tr.c-table__row.c-table__row--foot&attributes(attributes) +//- Header table cell (only used within +row) + +mixin head-cell() + th.c-table__head-cell.u-text-label&attributes(attributes) block @@ -278,77 +453,64 @@ mixin grid(...style) width - [string] "quarter", "third", "half", "two-thirds", "three-quarters" see $grid in assets/css/_variables.sass -mixin grid-col(width) - .o-grid__col(class="o-grid__col--#{width}")&attributes(attributes) +mixin grid-col(...style) + .o-grid__col(class=prefixArgs(style, "o-grid__col"))&attributes(attributes) block //- Card (only used within +grid) - title - [string] card title - details - [object] url, image, author, description, tags etc. - (see /docs/usage/_data.json) + title - [string] card title + url - [string] link for card + author - [string] optional author, displayed as byline at the bottom + icon - [string] optional ID of icon displayed with card + width - [string] optional width of grid column, defaults to "half" -mixin card(title, details) - +grid-col("half").o-card.u-text&attributes(attributes) - if details.image - +a(details.url).o-block-small - img(src=details.image alt=title width="300" role="presentation") - - if title - +a(details.url) - +h(3)=title - - if details.author - .u-text-small.u-color-subtle by #{details.author} - - if details.description || details.tags - ul - if details.description - li=details.description - - if details.tags - li - each tag in details.tags - span.u-text-tag #{tag} - |   - - block +mixin card(title, url, author, icon, width) + +grid-col(width || "half").o-box.o-grid.o-grid--space.u-text&attributes(attributes) + +a(url) + h4.u-heading.u-text-label + if icon + +icon(icon, 25).u-float-right + if title + span.u-color-dark=title + .o-block-small.u-text-small + block + if author + .u-color-subtle.u-text-tiny by #{author} -//- Simpler card list item (only used within +list) - title - [string] card title - details - [object] url, image, author, description, tags etc. - (see /docs/usage/_data.json) +//- Table of contents, to be used with +item mixins for links + col - [string] width of column (see +grid-col) -mixin card-item(title, details) - +item&attributes(attributes) - +a(details.url)=title - - if details.description - br - span=details.description - - if details.author - br - span.u-text-small.u-color-subtle by #{details.author} +mixin table-of-contents(col) + +grid-col(col || "half") + +infobox + +label.o-block-small Table of contents + +list("numbers").u-text-small.o-no-block + block -//- Table row for models table +//- Bibliography + id - [string] ID of bibliography component, for anchor links. 
Can be used if + there's more than one bibliography on one page. -mixin model-row(name, lang, procon, size, license, default_model, divider) - - var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" } +mixin bibliography(id) + section(id=id || "bibliography") + +infobox + +label.o-block-small Bibliography + +list("numbers").u-text-small.o-no-block + block - +row(divider ? "divider": null) - +cell #[code=name] - if default_model - | #[span.u-color-theme(title="default model") #[+icon("star", 16)]] - +cell=lang - each icon in procon - +cell.u-text-center #[+procon(icon ? "pro" : "con")] - +cell.u-text-right=size - +cell - if license in licenses - +a(licenses[license])=license + +//- Footnote + id - [string / integer] ID of footnote. + bib_id - [string] ID of bibliography component, defaults to "bibliography". + tooltip - [string] optional text displayed as tooltip + +mixin fn(id, bib_id, tooltip) + sup.u-padding-small(id="bib" + id data-tooltip=tooltip) + span.u-text-tag + +a("#" + (bib_id || "bibliography")).u-hide-link #{id} //- Table rows for annotation specs @@ -385,12 +547,135 @@ mixin annotation-row(annots, style) block -//- Table of contents, to be used with +item mixins for links - col - [string] width of column (see +grid-col) +//- spaCy logo -mixin table-of-contents(col) - +grid-col(col || "half") - +infobox - +label.o-block-small Table of contents - +list("numbers").u-text-small.o-no-block +mixin logo() + +svg("spacy", 675, 215).o-logo&attributes(attributes) + + +//- Gitter chat button and widget + button - [string] text shown on button + label - [string] title of chat window (default: same as button) + +mixin gitter(button, label) + aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) + + button.js-gitter-button.c-chat__button.u-text-tag + +icon("chat", 16).o-icon--inline + !=button + + +//- Badge + image - [string] path to badge image + url - [string] badge link + +mixin badge(image, url) + +a(url).u-padding-small.u-hide-link&attributes(attributes) + img.o-badge(src=image alt=url height="20") + + +//- Quickstart widget + quickstart.js with manual markup, inspired by PyTorch's "Getting started" + groups - [object] option groups, uses global variable QUICKSTART + headline - [string] optional text to be rendered as widget headline + +mixin quickstart(groups, headline, description, hide_results) + .c-quickstart.o-block-small#qs + .c-quickstart__content + if headline + +h(2)=headline + if description + p=description + for group in groups + .c-quickstart__group.u-text-small(data-qs-group=group.id) + if group.title + .c-quickstart__legend=group.title + if group.help + | #[+help(group.help)] + .c-quickstart__fields + for option in group.options + input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? 
"checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) + label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title + if option.meta + | #[span.c-quickstart__label__meta (#{option.meta})] + if option.help + | #[+help(option.help)] + + if hide_results + block + else + pre.c-code-block + code.c-code-block__content.c-quickstart__code(data-qs-results="") + block + + +//- Quickstart code item + data - [object] Rendering conditions (keyed by option group ID, value: option) + style - [string] modifier ID for line style + +mixin qs(data, style) + - args = {} + for value, setting in data + - args['data-qs-' + setting] = value + span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args) + block + + +//- Terminal-style code window + label - [string] title displayed in top bar of terminal window + +mixin terminal(label) + .x-terminal + .x-terminal__icons: span + .u-padding-small.u-text-label.u-text-center=label + + +code.x-terminal__code + block + + +//- Landing + +mixin landing-header() + header.c-landing + .c-landing__wrapper + .c-landing__content block + +mixin landing-banner(headline, label) + .c-landing__banner.u-padding.o-block.u-color-light + +grid.c-landing__banner__content.o-no-block + +grid-col("third") + h3.u-heading.u-heading-1 + if label + div + span.u-text-label.u-text-label--light=label + !=headline + + +grid-col("two-thirds").c-landing__banner__text + block + + +mixin landing-logos(title, logos) + .o-content.u-text-center&attributes(attributes) + h3.u-heading.u-text-label.u-color-dark=title + + each row, i in logos + - var is_last = i == logos.length - 1 + +grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null) + each details, name in row + +a(details[0]).u-padding-medium + +icon(name, details[1], details[2]) + + if is_last + block + + +//- Under construction (temporary) + Marks sections that still need to be completed for the v2.0 release. + +mixin under-construction() + +infobox("Under construction", "🚧") + | This section is still being written and will be updated for the v2.0 + | release. Is there anything that you think should definitely mentioned or + | explained here? Any examples you'd like to see? #[strong Let us know] + | on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub! diff --git a/website/_includes/_navigation.jade b/website/_includes/_navigation.jade index f113ca3f4..c7f2c956f 100644 --- a/website/_includes/_navigation.jade +++ b/website/_includes/_navigation.jade @@ -1,19 +1,15 @@ //- 💫 INCLUDES > TOP NAVIGATION -include _mixins - nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null) - a(href='/') #[+logo] - - if SUBSECTION != "index" - .u-text-label.u-padding-small.u-hidden-xs=SUBSECTION + a(href="/" aria-label=SITENAME) #[+logo] ul.c-nav__menu - - var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION - - each url, item in NAV - li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null) + - var current_url = '/' + current.path[0] + each url, item in NAVIGATION + li.c-nav__menu__item(class=(current_url == url) ? 
"is-active" : null) +a(url)=item - li.c-nav__menu__item - +a(gh("spaCy"))(aria-label="GitHub").u-hidden-xs #[+icon("github", 20)] + li.c-nav__menu__item.u-hidden-xs + +a(gh("spaCy"))(aria-label="GitHub") #[+icon("github", 20)] + + progress.c-progress.js-progress(value="0" max="1") diff --git a/website/_includes/_newsletter.jade b/website/_includes/_newsletter.jade index 9bfe88d39..ca8333f86 100644 --- a/website/_includes/_newsletter.jade +++ b/website/_includes/_newsletter.jade @@ -1,6 +1,6 @@ //- 💫 INCLUDES > NEWSLETTER -ul.o-block +ul.o-block-small li.u-text-label.u-color-subtle Stay in the loop! li Receive updates about new releases, tutorials and more. @@ -10,7 +10,6 @@ form.o-grid#mc-embedded-subscribe-form(action="//#{MAILCHIMP.user}.list-manage.c div(style="position: absolute; left: -5000px;" aria-hidden="true") input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="") - .o-grid-col.u-border.u-padding-small - input#mce-EMAIL.u-text(type="email" name="EMAIL" placeholder="Your email") - - button#mc-embedded-subscribe.u-text-label.u-color-theme(type="submit" name="subscribe") Sign up + .o-grid-col.o-grid.o-grid--nowrap.o-field.u-padding-small + input#mce-EMAIL.o-field__input.u-text(type="email" name="EMAIL" placeholder="Your email" aria-label="Your email") + button#mc-embedded-subscribe.o-field__button.u-text-label.u-color-theme.u-nowrap(type="submit" name="subscribe") Sign up diff --git a/website/_includes/_page-docs.jade b/website/_includes/_page-docs.jade index 7afbc6bdc..6295491a6 100644 --- a/website/_includes/_page-docs.jade +++ b/website/_includes/_page-docs.jade @@ -1,47 +1,53 @@ //- 💫 INCLUDES > DOCS PAGE TEMPLATE -- sidebar_content = (SUBSECTION != "index") ? public.docs[SUBSECTION]._data.sidebar : public.docs._data.sidebar || FOOTER +- sidebar_content = (public[SECTION] ? public[SECTION]._data.sidebar : public._data[SECTION] ? public._data[SECTION].sidebar : false) || FOOTER include _sidebar main.o-main.o-main--sidebar.o-main--aside article.o-content +grid.o-no-block - +grid-col(source ? "two-thirds" : "full") - +h(1)=title - if tag - +tag=tag + +h(1).u-heading--title=title.replace("'", "’") + if tag + +tag=tag + if tag_new + +tag-new(tag_new) + + if teaser + .u-heading__teaser.u-text-small.u-color-dark=teaser + else if IS_MODELS + .u-heading__teaser.u-text-small.u-color-dark + | Available statistical models for + | #[code=current.source] (#{LANGUAGES[current.source]}). if source - +grid-col("third").u-text-right - .o-inline-list - +button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)] + .o-block.u-text-right + +button(gh("spacy", source), false, "secondary", "small").u-nowrap + | Source #[+icon("code", 14)] - - if ALPHA - +infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs") - strong This page is part of the alpha documentation for spaCy v2.0. - | It does not reflect the state of the latest stable release. - | Because v2.0 is still under development, the implementation - | may differ from the intended state described here. See the - | #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes] - | for details on how to install and test the new version. To - | read the official docs for spaCy v1.x, - | #[+a("https://spacy.io/docs") go here]. 
- - !=yield + if IS_MODELS + include _page_models + else + !=yield +grid.o-content.u-text +grid-col("half") - if next && public.docs[SUBSECTION]._data[next] - - data = public.docs[SUBSECTION]._data[next] - + if !IS_MODELS .o-inline-list - span #[strong.u-text-label Read next:] #[+a(next).u-link=data.title] + +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary", "small") + | #[span.o-icon Suggest edits] #[+icon("code", 14)] +grid-col("half").u-text-right - .o-inline-list - +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)] + if next && public[SECTION]._data[next] + - data = public[SECTION]._data[next] + + +grid("vcenter") + +a(next).u-text-small.u-flex-full + h4.u-text-label.u-color-dark Read next + | #{data.title} + + +a(next).c-icon-button.c-icon-button--right(aria-hidden="true") + +icon("arrow-right", 24) +gitter("spaCy chat") diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade new file mode 100644 index 000000000..1cab930fb --- /dev/null +++ b/website/_includes/_page_models.jade @@ -0,0 +1,75 @@ +//- 💫 INCLUDES > MODELS PAGE TEMPLATE + +for id in CURRENT_MODELS + +section(id) + +grid("vcenter").o-no-block(id=id) + +grid-col("two-thirds") + +h(2) + +a("#" + id).u-permalink=id + + +grid-col("third").u-text-right + .u-color-subtle.u-text-tiny + +button(gh("spacy-models") + "/releases", true, "secondary", "small")(data-tpl=id data-tpl-key="download") + | Release details + .u-padding-small Latest: #[code(data-tpl=id data-tpl-key="version") n/a] + + +aside-code("Installation", "bash", "$"). + spacy download #{id} + + - var comps = getModelComponents(id) + + p(data-tpl=id data-tpl-key="description") + + div(data-tpl=id data-tpl-key="error") + +infobox + | Unable to load model details from GitHub. To find out more + | about this model, see the overview of the + | #[+a(gh("spacy-models") + "/releases") latest model releases]. 
+ + +table.o-block-small(data-tpl=id data-tpl-key="table") + +row + +cell #[+label Language] + +cell #[+tag=comps.lang] #{LANGUAGES[comps.lang]} + for comp, label in {"Type": comps.type, "Genre": comps.genre} + +row + +cell #[+label=label] + +cell #[+tag=comp] #{MODEL_META[comp]} + +row + +cell #[+label Size] + +cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]] + + each label in ["Pipeline", "Vectors", "Sources", "Author", "License"] + - var field = label.toLowerCase() + +row + +cell.u-nowrap + +label=label + if MODEL_META[field] + | #[+help(MODEL_META[field]).u-color-subtle] + +cell + span(data-tpl=id data-tpl-key=field) #[em n/a] + + +row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none") + +cell + +label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle] + +cell + .o-field.u-float-left + select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat") + div(data-tpl=id data-tpl-key="compat-versions")   + + section(data-tpl=id data-tpl-key="benchmarks" style="display: none") + +grid.o-block-small + for keys, label in MODEL_BENCHMARKS + .u-flex-full.u-padding-small(data-tpl=id data-tpl-key=label.toLowerCase() style="display: none") + +table.o-block-small + +row("head") + +head-cell(colspan="2")=(MODEL_META["benchmark_" + label] || label) + for label, field in keys + +row(style="display: none") + +cell.u-nowrap + +label=label + if MODEL_META[field] + | #[+help(MODEL_META[field]).u-color-subtle] + +cell.u-text-right(data-tpl=id data-tpl-key=field) + | n/a + + p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes") diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade index e5a863787..05a468076 100644 --- a/website/_includes/_scripts.jade +++ b/website/_includes/_scripts.jade @@ -1,27 +1,86 @@ //- 💫 INCLUDES > SCRIPTS -script(src="/assets/js/main.js?v#{V_JS}") -script(src="/assets/js/prism.js") +if quickstart + script(src="/assets/js/vendor/quickstart.min.js") -if SECTION == "docs" - if quickstart - script(src="/assets/js/quickstart.js") - script var qs = new Quickstart("#qs") - - script. - ((window.gitter = {}).chat = {}).options = { - useStyles: false, - activationElement: '.js-gitter-button', - targetElement: '.js-gitter', - room: '!{SOCIAL.gitter}' - }; - - script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) +if IS_PAGE + script(src="/assets/js/vendor/in-view.min.js") if environment == "deploy" - script + script(async src="https://www.google-analytics.com/analytics.js") + +script(src="/assets/js/vendor/prism.min.js") + +if SECTION == "models" + script(src="/assets/js/vendor/chart.min.js") + script(src="/assets/js/models.js?v#{V_JS}" type="module") + +script + if quickstart + | new Quickstart("#qs"); + + if environment == "deploy" | window.ga=window.ga||function(){ | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); - script(async src="https://www.google-analytics.com/analytics.js") + +if IS_PAGE + script + | ((window.gitter = {}).chat = {}).options = { + | useStyles: false, + | activationElement: '.js-gitter-button', + | targetElement: '.js-gitter', + | room: '!{SOCIAL.gitter}' + | }; + script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) + + +//- JS modules – slightly hacky, but necessary to dynamically instantiate the + classes with data from the Harp JSON files, while still being able to + support older browsers that can't handle JS modules. 
More details: + https://medium.com/dev-channel/es6-modules-in-chrome-canary-m60-ba588dfb8ab7 + +- ProgressBar = "new ProgressBar('.js-progress');" +- Changelog = "new Changelog('" + SOCIAL.github + "', 'spacy');" +- NavHighlighter = "new NavHighlighter('data-section', 'data-nav');" +- GitHubEmbed = "new GitHubEmbed('" + SOCIAL.github + "', 'data-gh-embed');" +- ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");" +- ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");" + +//- Browsers with JS module support. + Will be ignored otherwise. + +script(type="module") + | import ProgressBar from '/assets/js/progress.js'; + !=ProgressBar + if changelog + | import Changelog from '/assets/js/changelog.js'; + !=Changelog + if IS_PAGE + | import NavHighlighter from '/assets/js/nav-highlighter.js'; + !=NavHighlighter + | import GitHubEmbed from '/assets/js/github-embed.js'; + !=GitHubEmbed + if HAS_MODELS + | import { ModelLoader } from '/assets/js/models.js'; + !=ModelLoader + if compare_models + | import { ModelComparer } from '/assets/js/models.js'; + !=ModelComparer + +//- Browsers with no JS module support. + Won't be fetched or interpreted otherwise. + +script(nomodule src="/assets/js/rollup.js") +script(nomodule) + !=ProgressBar + if changelog + !=Changelog + if IS_PAGE + !=NavHighlighter + !=GitHubEmbed + if HAS_MODELS + !=ModelLoader + if compare_models + !=ModelComparer diff --git a/website/_includes/_sidebar.jade index 241a77132..9b9cd00a3 100644 --- a/website/_includes/_sidebar.jade +++ b/website/_includes/_sidebar.jade @@ -1,13 +1,23 @@ //- 💫 INCLUDES > SIDEBAR -include _mixins - menu.c-sidebar.js-sidebar.u-text if sidebar_content - each items, menu in sidebar_content - ul.c-sidebar__section.o-block - li.u-text-label.u-color-subtle=menu + each items, sectiontitle in sidebar_content + ul.c-sidebar__section.o-block-small + li.u-text-label.u-color-dark=sectiontitle each url, item in items - li(class=(CURRENT == url || (CURRENT == "index" && url == "./")) ? "is-active" : null) - +a(url)=item + - var is_current = CURRENT == url || (CURRENT == "index" && url == "./") + li.c-sidebar__item + +a(url)(class=is_current ?
"is-active" : null)=item + + if is_current + if IS_MODELS && CURRENT_MODELS.length + - menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({ [id]: id }))) + if menu + ul.c-sidebar__crumb.u-hidden-sm + - var counter = 0 + for id, title in menu + - counter++ + li.c-sidebar__crumb__item(data-nav=id) + +a("#section-" + id)=title diff --git a/website/_includes/_svg.jade b/website/_includes/_svg.jade new file mode 100644 index 000000000..54e0667a3 --- /dev/null +++ b/website/_includes/_svg.jade @@ -0,0 +1,169 @@ +//- 💫 INCLUDES > SVG + +svg(style="position: absolute; visibility: hidden; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink") + defs + //- UI icons + + symbol#svg_github(viewBox="0 0 27 32") + path(d="M13.714 2.286q3.732 0 6.884 1.839t4.991 4.991 1.839 6.884q0 4.482-2.616 8.063t-6.759 4.955q-0.482 0.089-0.714-0.125t-0.232-0.536q0-0.054 0.009-1.366t0.009-2.402q0-1.732-0.929-2.536 1.018-0.107 1.83-0.321t1.679-0.696 1.446-1.188 0.946-1.875 0.366-2.688q0-2.125-1.411-3.679 0.661-1.625-0.143-3.643-0.5-0.161-1.446 0.196t-1.643 0.786l-0.679 0.429q-1.661-0.464-3.429-0.464t-3.429 0.464q-0.286-0.196-0.759-0.482t-1.491-0.688-1.518-0.241q-0.804 2.018-0.143 3.643-1.411 1.554-1.411 3.679 0 1.518 0.366 2.679t0.938 1.875 1.438 1.196 1.679 0.696 1.83 0.321q-0.696 0.643-0.875 1.839-0.375 0.179-0.804 0.268t-1.018 0.089-1.17-0.384-0.991-1.116q-0.339-0.571-0.866-0.929t-0.884-0.429l-0.357-0.054q-0.375 0-0.518 0.080t-0.089 0.205 0.161 0.25 0.232 0.214l0.125 0.089q0.393 0.179 0.777 0.679t0.563 0.911l0.179 0.411q0.232 0.679 0.786 1.098t1.196 0.536 1.241 0.125 0.991-0.063l0.411-0.071q0 0.679 0.009 1.58t0.009 0.973q0 0.321-0.232 0.536t-0.714 0.125q-4.143-1.375-6.759-4.955t-2.616-8.063q0-3.732 1.839-6.884t4.991-4.991 6.884-1.839zM5.196 21.982q0.054-0.125-0.125-0.214-0.179-0.054-0.232 0.036-0.054 0.125 0.125 0.214 0.161 0.107 0.232-0.036zM5.75 22.589q0.125-0.089-0.036-0.286-0.179-0.161-0.286-0.054-0.125 0.089 0.036 0.286 0.179 0.179 0.286 0.054zM6.286 23.393q0.161-0.125 0-0.339-0.143-0.232-0.304-0.107-0.161 0.089 0 0.321t0.304 0.125zM7.036 24.143q0.143-0.143-0.071-0.339-0.214-0.214-0.357-0.054-0.161 0.143 0.071 0.339 0.214 0.214 0.357 0.054zM8.054 24.589q0.054-0.196-0.232-0.286-0.268-0.071-0.339 0.125t0.232 0.268q0.268 0.107 0.339-0.107zM9.179 24.679q0-0.232-0.304-0.196-0.286 0-0.286 0.196 0 0.232 0.304 0.196 0.286 0 0.286-0.196zM10.214 24.5q-0.036-0.196-0.321-0.161-0.286 0.054-0.25 0.268t0.321 0.143 0.25-0.25z") + + symbol#svg_code(viewBox="0 0 20 20") + path(d="M5.719 14.75c-0.236 0-0.474-0.083-0.664-0.252l-5.060-4.498 5.341-4.748c0.412-0.365 1.044-0.33 1.411 0.083s0.33 1.045-0.083 1.412l-3.659 3.253 3.378 3.002c0.413 0.367 0.45 0.999 0.083 1.412-0.197 0.223-0.472 0.336-0.747 0.336zM14.664 14.748l5.341-4.748-5.060-4.498c-0.413-0.367-1.045-0.33-1.411 0.083s-0.33 1.045 0.083 1.412l3.378 3.003-3.659 3.252c-0.413 0.367-0.45 0.999-0.083 1.412 0.197 0.223 0.472 0.336 0.747 0.336 0.236 0 0.474-0.083 0.664-0.252zM9.986 16.165l2-12c0.091-0.545-0.277-1.060-0.822-1.151-0.547-0.092-1.061 0.277-1.15 0.822l-2 12c-0.091 0.545 0.277 1.060 0.822 1.151 0.056 0.009 0.11 0.013 0.165 0.013 0.48 0 0.904-0.347 0.985-0.835z") + + symbol#svg_arrow-right(viewBox="0 0 24 24") + path(d="M20.744 12.669c0 0 0 0 0 0 0.006-0.006 0.006-0.006 0.006-0.006s0 0 0 0 0.006-0.006 0.006-0.006c0 0 0.006-0.006 0.006-0.006s0 0 0 0 0.006-0.006 0.006-0.006c0 0 0 0 0 0 0.063-0.075 0.112-0.156 0.15-0.244 0 0 0 0 0-0.006 0 0 0-0.006 0-0.006s0-0.006 0-0.006c0 0 0 0 0 0 
0.038-0.094 0.063-0.194 0.069-0.3 0 0 0 0 0 0s0-0.006 0-0.006c0 0 0-0.006 0-0.006s0-0.006 0-0.006c0 0 0-0.006 0-0.006s0 0 0-0.006c0-0.025 0-0.050 0-0.075 0 0 0 0 0-0.006 0 0 0-0.006 0-0.006s0-0.006 0-0.006c0 0 0-0.006 0-0.006s0-0.006 0-0.006c0 0 0 0 0 0-0.006-0.106-0.031-0.206-0.069-0.3 0 0 0 0 0-0.006 0 0 0 0 0-0.006 0 0 0-0.006-0.006-0.006 0 0 0 0 0 0-0.038-0.094-0.094-0.175-0.156-0.256 0 0 0 0 0 0s-0.006-0.006-0.006-0.006c0 0 0 0 0 0s-0.006-0.006-0.006-0.006-0.006-0.006-0.006-0.006 0 0 0-0.006c-0.012-0.012-0.025-0.025-0.037-0.037l-6-6c-0.387-0.387-1.025-0.387-1.413 0s-0.387 1.025 0 1.413l4.294 4.294h-13.581c-0.55 0-1 0.45-1 1s0.45 1 1 1h13.587l-4.294 4.294c-0.387 0.387-0.387 1.025 0 1.413 0.194 0.194 0.45 0.294 0.706 0.294s0.513-0.1 0.706-0.294l5.994-5.994c0.019-0.025 0.031-0.044 0.044-0.056z") + + symbol#svg_book(viewBox="0 0 20 20") + path(d="M15.5 11h-11c-0.275 0-0.5 0.225-0.5 0.5v1c0 0.276 0.225 0.5 0.5 0.5h11c0.276 0 0.5-0.224 0.5-0.5v-1c0-0.275-0.224-0.5-0.5-0.5zM15.5 7h-11c-0.275 0-0.5 0.225-0.5 0.5v1c0 0.276 0.225 0.5 0.5 0.5h11c0.276 0 0.5-0.224 0.5-0.5v-1c0-0.275-0.224-0.5-0.5-0.5zM10.5 15h-6c-0.275 0-0.5 0.225-0.5 0.5v1c0 0.276 0.225 0.5 0.5 0.5h6c0.276 0 0.5-0.224 0.5-0.5v-1c0-0.275-0.224-0.5-0.5-0.5zM15.5 3h-11c-0.275 0-0.5 0.225-0.5 0.5v1c0 0.276 0.225 0.5 0.5 0.5h11c0.276 0 0.5-0.224 0.5-0.5v-1c0-0.275-0.224-0.5-0.5-0.5z") + + symbol#svg_yes(viewBox="0 0 24 24") + path(d="M9.984 17.016l9-9-1.406-1.453-7.594 7.594-3.563-3.563-1.406 1.406zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984z") + + symbol#svg_no(viewBox="0 0 24 24") + path(d="M17.016 15.609l-3.609-3.609 3.609-3.609-1.406-1.406-3.609 3.609-3.609-3.609-1.406 1.406 3.609 3.609-3.609 3.609 1.406 1.406 3.609-3.609 3.609 3.609zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984z") + + symbol#svg_neutral(viewBox="0 0 24 24") + path(d="M12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984z") + + symbol#svg_chat(viewBox="0 0 30 30") + path(d="M28.74 25.2c-1.73-.3-3.77-1.46-4.74-3.6 3.64-2.2 6-5.68 6-9.6 0-6.63-6.72-12-15-12S0 5.37 0 12s6.72 12 15 12c1.1 0 2.2-.1 3.23-.3 2.86 2 6.25 2.62 10.4 2.15.26-.02.37-.15.37-.32 0-.16-.1-.3-.26-.32zM23 14c0 .55-.45 1-1 1H8c-.55 0-1-.45-1-1s.45-1 1-1h14c.55 0 1 .45 1 1zm0-4c0 .55-.45 1-1 1H8c-.55 0-1-.45-1-1s.45-1 1-1h14c.55 0 1 .45 1 1z") + + symbol#svg_star(viewBox="0 0 24 24") + path(d="M12 17.25l-6.188 3.75 1.641-7.031-5.438-4.734 7.172-0.609 2.813-6.609 2.813 6.609 7.172 0.609-5.438 4.734 1.641 7.031z") + + symbol#svg_help(viewBox="0 0 24 28") + path(d="M14 21.5v-3c0-0.281-0.219-0.5-0.5-0.5h-3c-0.281 0-0.5 0.219-0.5 0.5v3c0 0.281 0.219 0.5 0.5 0.5h3c0.281 0 0.5-0.219 0.5-0.5zM18 11c0-2.859-3-5-5.688-5-2.547 0-4.453 1.094-5.797 3.328-0.141 0.219-0.078 0.5 0.125 0.656l2.063 1.563c0.078 0.063 0.187 0.094 0.297 0.094 0.141 0 0.297-0.063 0.391-0.187 0.734-0.938 1.047-1.219 1.344-1.437 0.266-0.187 0.781-0.375 1.344-0.375 1 0 1.922 0.641 1.922 1.328 0 0.812-0.422 1.219-1.375 1.656-1.109 0.5-2.625 1.797-2.625 3.313v0.562c0 0.281 0.219 0.5 0.5 0.5h3c0.281 0 0.5-0.219 0.5-0.5v0c0-0.359 0.453-1.125 1.188-1.547 1.188-0.672 2.812-1.578 2.812-3.953zM24 14c0 6.625-5.375 12-12 12s-12-5.375-12-12 5.375-12 12-12 12 5.375 12 12z") + + symbol#svg_help_o(viewBox="0 0 24 28") + path(d="M13.75 18.75v2.5c0 0.281-0.219 0.5-0.5 0.5h-2.5c-0.281 0-0.5-0.219-0.5-0.5v-2.5c0-0.281 0.219-0.5 0.5-0.5h2.5c0.281 0 0.5 0.219 
0.5 0.5zM17.75 11c0 2.219-1.547 3.094-2.688 3.734-0.812 0.469-1.313 0.766-1.313 1.266v0.5c0 0.281-0.219 0.5-0.5 0.5h-2.5c-0.281 0-0.5-0.219-0.5-0.5v-1.062c0-1.922 1.375-2.531 2.484-3.031 0.938-0.438 1.516-0.734 1.516-1.437 0-0.906-1.141-1.578-2.172-1.578-0.547 0-1.125 0.172-1.484 0.422-0.344 0.234-0.672 0.578-1.25 1.297-0.094 0.125-0.234 0.187-0.391 0.187-0.109 0-0.219-0.031-0.297-0.094l-1.687-1.281c-0.203-0.156-0.25-0.453-0.109-0.672 1.281-2.016 3.078-3 5.453-3v0c2.562 0 5.437 2.031 5.437 4.75zM12 4c-5.516 0-10 4.484-10 10s4.484 10 10 10 10-4.484 10-10-4.484-10-10-10zM24 14c0 6.625-5.375 12-12 12s-12-5.375-12-12 5.375-12 12-12v0c6.625 0 12 5.375 12 12z") + + symbol#svg_reject(viewBox="0 0 24 24") + path(d="M18.984 6.422l-5.578 5.578 5.578 5.578-1.406 1.406-5.578-5.578-5.578 5.578-1.406-1.406 5.578-5.578-5.578-5.578 1.406-1.406 5.578 5.578 5.578-5.578z") + + symbol#svg_accept(viewBox="0 0 24 24") + path(d="M9 16.172l10.594-10.594 1.406 1.406-12 12-5.578-5.578 1.406-1.406z") + + symbol#svg_markdown(viewBox="0 0 32 32") + path(d="M29.692 6h-27.385c-1.272 0-2.308 1.035-2.308 2.308v15.385c0 1.273 1.035 2.308 2.308 2.308h27.385c1.273 0 2.308-1.035 2.308-2.308v-15.385c0-1.272-1.035-2.308-2.308-2.308zM18 21.996l-4 0.004v-6l-3 3.846-3-3.846v6h-4v-12h4l3 4 3-4 4-0.004v12zM23.972 22.996l-4.972-6.996h3v-6h4v6h3l-5.028 6.996z") + + symbol#svg_course(viewBox="0 0 20 20") + path(d="M3.302 12.238c0.464 1.879 1.054 2.701 3.022 3.562 1.969 0.86 2.904 1.8 3.676 1.8s1.648-0.822 3.616-1.684c1.969-0.861 1.443-1.123 1.907-3.002l-5.523 2.686-6.698-3.362zM19.511 7.336l-8.325-4.662c-0.652-0.365-1.72-0.365-2.372 0l-8.326 4.662c-0.652 0.365-0.652 0.963 0 1.328l8.325 4.662c0.652 0.365 1.72 0.365 2.372 0l5.382-3.014-5.836-1.367c-0.225 0.055-0.472 0.086-0.731 0.086-1.052 0-1.904-0.506-1.904-1.131 0-0.627 0.853-1.133 1.904-1.133 0.816 0 1.51 0.307 1.78 0.734l6.182 2.029 1.549-0.867c0.651-0.364 0.651-0.962 0-1.327zM16.967 16.17c-0.065 0.385 1.283 1.018 1.411-0.107 0.579-5.072-0.416-6.531-0.416-6.531l-1.395 0.781c0-0.001 1.183 1.125 0.4 5.857z") + + symbol#svg_jupyter(viewBox="245 20 270 270") + path(d="M379.3 231.3c-42.6 0-79.7-15.3-99-38 14.6 40.6 53.4 69.6 99 69.6 45.5 0 84.3-29 99-69.7-19.4 22.7-56.5 38-99 38M379.3 84c42.5 0 79.6 15.4 99 38-14.7-40.6-53.5-69.6-99-69.6-45.6 0-84.4 29-99 69.6 19.3-22.6 56.4-38 99-38") + path(d="M299.5 286c-9.2.3-17-6.8-17.3-16-.4-9 6.7-16.8 15.8-17.2 9.2-.4 17 6.7 17.3 15.8.4 9.2-6.7 17-15.8 17.3zM286.8 70.4c-5.7.2-10.6-4.2-10.8-10-.3-5.6 4.2-10.5 10-10.7 5.6-.2 10.4 4.2 10.6 10 .3 5.6-4 10.4-9.8 10.7zM451 58c-9 .5-16.4-6.4-16.8-15.3-.3-8.8 6.5-16.3 15.4-16.7 9-.4 16.4 6.5 16.7 15.4.4 8.8-6.5 16.3-15.3 16.7z" ) + + + //- Logos + + symbol#svg_spacy(viewBox="0 0 675 215") + path(fill="currentColor" d="M83.6 83.3C68.3 81.5 67.2 61 47.5 62.8c-9.5 0-18.4 4-18.4 12.7 0 13.2 20.3 14.4 32.5 17.7 20.9 6.3 41 10.7 41 33.3 0 28.8-22.6 38.8-52.4 38.8-24.9 0-50.2-8.9-50.2-31.8 0-6.4 6.1-11.3 12-11.3 7.5 0 10.1 3.2 12.7 8.4 5.8 10.2 12.3 15.6 28.3 15.6 10.2 0 20.6-3.9 20.6-12.7 0-12.6-12.8-15.3-26.1-18.4-23.5-6.6-43.6-10-46-36.1C-1 34.5 91.7 32.9 97 71.9c.1 7.1-6.5 11.4-13.4 11.4zm110.2-39c32.5 0 51 27.2 51 60.8 0 33.7-17.9 60.8-51 60.8-18.4 0-29.8-7.8-38.1-19.8v44.5c0 13.4-4.3 19.8-14.1 19.8-11.9 0-14.1-7.6-14.1-19.8V61.3c0-10.6 4.4-17 14.1-17 9.1 0 14.1 7.2 14.1 17v3.6c9.2-11.6 19.7-20.6 38.1-20.6zm-7.7 98.4c19.1 0 27.6-17.6 27.6-38.1 0-20.1-8.6-38.1-27.6-38.1-19.8 0-29 16.3-29 38.1 0 21.2 9.2 38.1 29 38.1zM266.9 76c0-23.4 26.9-31.7 52.9-31.7 36.6 0 51.7 10.7 51.7 46v34c0 8.1 5 24.1 5 29 0 7.4-6.8 
12-14.1 12-8.1 0-14.1-9.5-18.4-16.3-11.9 9.5-24.5 16.3-43.8 16.3-21.3 0-38.1-12.6-38.1-33.3 0-18.4 13.2-28.9 29-32.5 0 .1 51-12 51-12.1 0-15.7-5.5-22.6-22-22.6-14.5 0-21.9 4-27.5 12.7-4.5 6.6-4 10.6-12.7 10.6-6.9-.1-13-4.9-13-12.1zm43.6 70.2c22.3 0 31.8-11.8 31.8-35.3v-5c-6 2-30.3 8-36.8 9.1-7 1.4-14.1 6.6-14.1 14.9.1 9.1 9.4 16.3 19.1 16.3zM474.5 0c31.5 0 65.7 18.8 65.7 48.8 0 7.7-5.8 14.1-13.4 14.1-10.3 0-11.8-5.5-16.3-13.4-7.6-13.9-16.5-23.3-36.1-23.3-30.2-.2-43.7 25.6-43.7 57.8 0 32.4 11.2 55.8 42.4 55.8 20.7 0 32.2-12 38.1-27.6 2.4-7.1 6.7-14.1 15.6-14.1 7 0 14.1 7.2 14.1 14.8 0 31.8-32.4 53.8-65.8 53.8-36.5 0-57.2-15.4-68.5-41-5.5-12.2-9.1-24.9-9.1-42.4-.1-49.2 28.6-83.3 77-83.3zm180.3 44.3c8 0 12.7 5.2 12.7 13.4 0 3.3-2.6 9.9-3.6 13.4L625.1 173c-8.6 22.1-15.1 37.4-44.5 37.4-14 0-26.1-1.2-26.1-13.4 0-7 5.3-10.6 12.7-10.6 1.4 0 3.6.7 5 .7 2.1 0 3.6.7 5 .7 14.7 0 16.8-15.1 22-25.5l-37.4-92.6c-2.1-5-3.6-8.4-3.6-11.3 0-8.2 6.4-14.1 14.8-14.1 9.5 0 13.3 7.5 15.6 15.6l24.7 73.5L638 65.5c3.9-10.5 4.2-21.2 16.8-21.2z" ) + + symbol#svg_explosion(viewBox="0 0 500 500") + path(fill="currentColor" d="M111.7 74.9L91.2 93.1l9.1 10.2 17.8-15.8 7.4 8.4-17.8 15.8 10.1 11.4 20.6-18.2 7.7 8.7-30.4 26.9-41.9-47.3 30.3-26.9 7.6 8.6zM190.8 59.6L219 84.3l-14.4 4.5-20.4-18.2-6.4 26.6-14.4 4.5 8.9-36.4-26.9-24.1 14.3-4.5L179 54.2l5.7-25.2 14.3-4.5-8.2 35.1zM250.1 21.2l27.1 3.4c6.1.8 10.8 3.1 14 7.2 3.2 4.1 4.5 9.2 3.7 15.5-.8 6.3-3.2 11-7.4 14.1-4.1 3.1-9.2 4.3-15.3 3.5L258 63.2l-2.8 22.3-13-1.6 7.9-62.7zm11.5 13l-2.2 17.5 12.6 1.6c5.1.6 9.1-2 9.8-7.6.7-5.6-2.5-9.2-7.6-9.9l-12.6-1.6zM329.1 95.4l23.8 13.8-5.8 10L312 98.8l31.8-54.6 11.3 6.6-26 44.6zM440.5 145c-1.3 8.4-5.9 15.4-13.9 21.1s-16.2 7.7-24.6 6.1c-8.4-1.6-15.3-6.3-20.8-14.1-5.5-7.9-7.6-16-6.4-24.4 1.3-8.5 6-15.5 14-21.1 8-5.6 16.2-7.7 24.5-6 8.4 1.6 15.4 6.3 20.9 14.2 5.5 7.6 7.6 15.7 6.3 24.2zM412 119c-5.1-.8-10.3.6-15.6 4.4-5.2 3.7-8.4 8.1-9.4 13.2-1 5.2.2 10.1 3.5 14.8 3.4 4.8 7.5 7.5 12.7 8.2 5.2.8 10.4-.7 15.6-4.4 5.3-3.7 8.4-8.1 9.4-13.2 1.1-5.1-.1-9.9-3.4-14.7-3.4-4.8-7.6-7.6-12.8-8.3zM471.5 237.9c-2.8 4.8-7.1 7.6-13 8.7l-2.6-13.1c5.3-.9 8.1-5 7.2-11-.9-5.8-4.3-8.8-8.9-8.2-2.3.3-3.7 1.4-4.5 3.3-.7 1.9-1.4 5.2-1.7 10.1-.8 7.5-2.2 13.1-4.3 16.9-2.1 3.9-5.7 6.2-10.9 7-6.3.9-11.3-.5-15.2-4.4-3.9-3.8-6.3-9-7.3-15.7-1.1-7.4-.2-13.7 2.6-18.8 2.8-5.1 7.4-8.2 13.7-9.2l2.6 13c-5.6 1.1-8.7 6.6-7.7 13.4 1 6.6 3.9 9.5 8.6 8.8 4.4-.7 5.7-4.5 6.7-14.1.3-3.5.7-6.2 1.1-8.4.4-2.2 1.2-4.4 2.2-6.8 2.1-4.7 6-7.2 11.8-8.1 5.4-.8 10.3.4 14.5 3.7 4.2 3.3 6.9 8.5 8 15.6.9 6.9-.1 12.6-2.9 17.3zM408.6 293.5l2.4-12.9 62 11.7-2.4 12.9-62-11.7zM419.6 396.9c-8.3 2-16.5.3-24.8-5-8.2-5.3-13.2-12.1-14.9-20.5-1.6-8.4.1-16.6 5.3-24.6 5.2-8.1 11.9-13.1 20.2-15.1 8.4-1.9 16.6-.3 24.9 5 8.2 5.3 13.2 12.1 14.8 20.5 1.7 8.4 0 16.6-5.2 24.7-5.2 8-12 13-20.3 15zm13.4-36.3c-1.2-5.1-4.5-9.3-9.9-12.8s-10.6-4.7-15.8-3.7-9.3 4-12.4 8.9-4.1 9.8-2.8 14.8c1.2 5.1 4.5 9.3 9.9 12.8 5.5 3.5 10.7 4.8 15.8 3.7 5.1-.9 9.2-3.8 12.3-8.7s4.1-9.9 2.9-15zM303.6 416.5l9.6-5.4 43.3 20.4-19.2-34 11.4-6.4 31 55-9.6 5.4-43.4-20.5 19.2 34.1-11.3 6.4-31-55zM238.2 468.8c-49 0-96.9-17.4-134.8-49-38.3-32-64-76.7-72.5-125.9-2-11.9-3.1-24-3.1-35.9 0-36.5 9.6-72.6 27.9-104.4 2.1-3.6 6.7-4.9 10.3-2.8 3.6 2.1 4.9 6.7 2.8 10.3-16.9 29.5-25.9 63.1-25.9 96.9 0 11.1 1 22.3 2.9 33.4 7.9 45.7 31.8 87.2 67.3 116.9 35.2 29.3 79.6 45.5 125.1 45.5 11.1 0 22.3-1 33.4-2.9 4.1-.7 8 2 8.7 6.1.7 4.1-2 8-6.1 8.7-11.9 2-24 3.1-36 3.1z") + + symbol#svg_prodigy(viewBox="0 0 538.5 157.6") + path(fill="currentColor" d="M70.6 48.6c7 7.3 10.5 
17.1 10.5 29.2S77.7 99.7 70.6 107c-6.9 7.3-15.9 11.1-27 11.1-9.4 0-16.8-2.7-21.7-8.2v44.8H0V39h20.7v8.1c4.8-6.4 12.4-9.6 22.9-9.6 11.1 0 20.1 3.7 27 11.1zM21.9 76v3.6c0 12.1 7.3 19.8 18.3 19.8 11.2 0 18.7-7.9 18.7-21.6s-7.5-21.6-18.7-21.6c-11 0-18.3 7.7-18.3 19.8zM133.8 59.4c-12.6 0-20.5 7-20.5 17.8v39.3h-22V39h21.1v8.8c4-6.4 11.2-9.6 21.3-9.6v21.2zM209.5 107.1c-7.6 7.3-17.5 11.1-29.5 11.1s-21.9-3.8-29.7-11.1c-7.6-7.5-11.5-17.2-11.5-29.2 0-12.1 3.9-21.9 11.5-29.2 7.8-7.3 17.7-11.1 29.7-11.1s21.9 3.8 29.5 11.1c7.8 7.3 11.7 17.1 11.7 29.2 0 11.9-3.9 21.7-11.7 29.2zM180 56.2c-5.7 0-10.3 1.9-13.8 5.8-3.5 3.8-5.2 9-5.2 15.7 0 6.7 1.8 12 5.2 15.7 3.4 3.8 8.1 5.7 13.8 5.7s10.3-1.9 13.8-5.7 5.2-9 5.2-15.7c0-6.8-1.8-12-5.2-15.7-3.5-3.8-8.1-5.8-13.8-5.8zM313 116.5h-20.5v-7.9c-4.4 5.5-12.7 9.6-23.1 9.6-10.9 0-19.9-3.8-27-11.1C235.5 99.7 232 90 232 77.8s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 9.7 0 17.1 2.7 21.9 8.2V0H313v116.5zm-58.8-38.7c0 13.6 7.5 21.4 18.7 21.4 10.9 0 18.3-7.3 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM354.1 13.6c0 3.6-1.3 6.8-3.9 9.3-5 4.9-13.6 4.9-18.6 0-8.4-7.5-1.6-23.1 9.3-22.5 7.4 0 13.2 5.9 13.2 13.2zm-2.2 102.9H330V39h21.9v77.5zM425.1 47.1V39h20.5v80.4c0 11.2-3.6 20.1-10.6 26.8-7 6.7-16.6 10-28.5 10-23.4 0-36.9-11.4-39.9-29.8l21.7-.8c1 7.6 7.6 12 17.4 12 11.2 0 18.1-5.8 18.1-16.6v-11.1c-5.1 5.5-12.5 8.2-21.9 8.2-10.9 0-19.9-3.8-27-11.1-6.9-7.3-10.3-17.1-10.3-29.2s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 10.7 0 18.4 3.1 23.2 9.6zm-38.3 30.7c0 13.6 7.5 21.6 18.7 21.6 11 0 18.3-7.6 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM488.8 154.8H465l19.8-45.1L454.5 39h24.1l17.8 46.2L514.2 39h24.3l-49.7 115.8z") + + + //- Machine learning & NLP libraries + + symbol#svg_tensorflow(viewBox="0 0 31 33") + path(d="M17.3 5v5l8.7 5v-5zM0 10v5l4.3 2.5v-5zm13 2.5L8.7 15v15l4.3 2.5v-10l4.3 2.5v-5L13 17.5z" fill="#e55b2d") + path(d="M17.3 5l-13 7.5v5l8.7-5v5l4.3-2.5zm13 2.5L26 10v5l4.3-2.5zm-8.6 10L17.3 20v5l4.4-2.5zM17.3 25L13 22.5v10l4.3-2.5z" fill="#ed8e24") + path(d="M17.3 0L0 10l4.3 2.5 13-7.5 8.7 5 4.3-2.5zm0 15L13 17.5l4.3 2.5 4.4-2.5z" fill="#f8bf3c") + + symbol#svg_keras(viewBox="0 0 512 512") + path(fill="#D01317" d="M64 64h384v384H64z") + path(fill="#F6F6F6" d="M162 145v222l7 8h22.8l10.2-8.3V292l30-32 76 115h35l7.3-12.3-89.5-129.3 83.3-82.6L338 137h-35L202 239v-93.7l-8-8.3h-24.4") + + symbol#svg_pytorch(viewBox="0 0 200 41") + path(fill="#F05732" d="M102.7 12.2c-1.3-1-1.8 4-4.4 4-3 0-4-13-6.3-13-.7 0-.8-.5-8 21.2-2.8 9 4.5 15.8 12 15.8 4.5 0 12.2-3 12.2-12.6 0-7-3.5-14-5.5-15.4zm-7 23c-3.6 0-6.6-3-6.6-7 0-3.8 3-7 6.8-7s6.7 3.2 6.7 7c0 4-3 7-6.7 7z") + path(fill="#9E529F" d="M99.8 0C99.3 0 98 2.5 98 3.6c0 1.5 1 2 1.8 2s1.8-.5 1.8-2c0-1-1.4-3.6-1.8-3.6z") + path(fill="#333333" d="M0 39.5V15h11.5c5.3 0 8.3 3.5 8.3 7.8s-3 8-8.3 8H5.2v8.7H0zm14.4-16.7c0-2-1.6-3.3-3.7-3.3H5.2V26h5.5c2 0 3.7-1.2 3.7-3.2zM35.2 39.5v-10l-9.4-14.6h6l6 9.7L44 15h6l-9.5 14.4v10h-5.3zM63.3 39.5v-20H56V15h19.7v4.5h-7.2v20h-5.2zM131.4 39.5l-4.8-8.7h-3.8v8.7h-5.2V15H129c5 0 8.3 3.3 8.3 7.8 0 4.3-2.8 6.7-5.4 7.3l5.5 9.5h-6zm.5-16.7c0-2-1.7-3.3-3.8-3.3h-5.5V26h5.5c2 0 3.7-1 3.7-3.2zM145.6 27.2c0-7.6 5.7-12.7 13-12.7 5.5 0 8.6 3 10.4 6l-4.5 2.2c-1-2-3.2-3.6-5.8-3.6-4.5 0-7.7 3.5-7.7 8.2 0 4.6 3.2 8 7.7 8 2.5 0 4.7-1.5 5.8-3.5L169 34c-1.7 3-5 6-10.3 6-7.4 0-13-5.3-13-12.8zM194.5 39.5V29H183v10.5h-5.3V15h5.2v9.6h11.5V15h5.3v24.5h-5.3z") + + symbol#svg_scikitlearn(viewBox="0 0 278 150") + path(fill="#f89939" d="M212.7 127.1c33.9-33.9 39.5-83.17 12.6-110.1S149.1-4.3 115.24 
29.57c-33.87 33.86-24.07 98.56-12.57 110.06 9.3 9.3 76.2 21.3 110.06-12.57z") + path(fill="#3499cd" d="M73.74 78.15C54.1 58.5 25.5 55.23 9.87 70.85c-15.62 15.63-12.35 44.23 7.3 63.87 19.65 19.65 57.2 13.97 63.86 7.3 5.4-5.4 12.37-44.22-7.3-63.87z") + path(fill="#010101" d="M141.53 118.8c-3.47 3.18-6.5 5.54-9.13 7.05-2.62 1.52-5.1 2.28-7.5 2.28-2.72 0-4.92-1.06-6.6-3.17-1.67-2.13-2.5-4.97-2.5-8.55 0-5.34 1.16-11.77 3.48-19.3 2.32-7.5 5.15-14.4 8.47-20.74l9.73-3.6c.3-.1.53-.16.7-.16.73 0 1.34.54 1.8 1.62.48 1.1.7 2.55.7 4.4 0 5.2-1.18 10.24-3.6 15.12-2.4 4.88-6.14 10.08-11.25 15.63-.2 2.65-.3 4.48-.3 5.47 0 2.23.4 3.98 1.22 5.3.82 1.3 1.9 1.94 3.26 1.94 1.4 0 2.86-.5 4.4-1.5 1.58-1 3.95-3.05 7.14-6.2v4.4zm-14.66-15c3.24-3.6 5.87-7.63 7.9-12.1 2-4.48 3-8.33 3-11.56 0-.94-.13-1.7-.4-2.27-.3-.58-.65-.87-1.08-.87-.96 0-2.34 2.35-4.14 7.06-1.8 4.7-3.57 11.3-5.3 19.75z M170.2 118.8c-3.26 3.18-6.17 5.54-8.75 7.05-2.58 1.52-5.42 2.28-8.54 2.28-3.45 0-6.25-1.1-8.4-3.33-2.12-2.22-3.2-5.14-3.2-8.77 0-5.4 1.9-10.3 5.65-14.7 3.75-4.37 7.92-6.56 12.5-6.56 2.36 0 4.26.62 5.7 1.84 1.42 1.25 2.14 2.86 2.14 4.85 0 5.3-5.63 9.57-16.86 12.87 1.02 4.98 3.68 7.48 8 7.48 1.7 0 3.3-.46 4.82-1.36 1.54-.9 3.85-2.92 6.94-6.04v4.4zm-20.08-7.1c6.53-1.84 9.8-5.23 9.8-10.2 0-2.44-.9-3.66-2.68-3.66-1.68 0-3.3 1.28-4.82 3.85-1.54 2.54-2.3 5.9-2.3 10z M211.1 118.8c-4.1 3.87-7.02 6.4-8.78 7.57-1.76 1.18-3.44 1.76-5.05 1.76-4.04 0-5.94-3.56-5.7-10.68-2.56 3.65-4.92 6.34-7.08 8.08-2.18 1.74-4.4 2.6-6.7 2.6-2.25 0-4.16-1.05-5.73-3.15-1.57-2.1-2.35-4.7-2.35-7.76 0-3.83 1.05-7.48 3.15-10.96 2.1-3.47 4.8-6.28 8.1-8.42 3.3-2.15 6.2-3.22 8.74-3.22 3.2 0 5.44 1.47 6.7 4.4l7.84-4.32h2.15l-3.4 11.22c-1.73 5.64-2.6 9.5-2.6 11.6 0 2.2.78 3.3 2.34 3.3 1 0 2.1-.53 3.3-1.6 1.2-1.05 2.87-2.67 5.04-4.84v4.4zm-28.04 2.1c2.55 0 4.95-2.18 7.22-6.53 2.26-4.36 3.4-8.38 3.4-12.05 0-1.43-.33-2.55-.97-3.35-.62-.8-1.5-1.2-2.55-1.2-2.55 0-4.97 2.16-7.25 6.5-2.3 4.34-3.44 8.34-3.44 12 0 1.37.34 2.5 1 3.34.7.88 1.55 1.3 2.6 1.3z M239.7 118.8c-6.4 6.27-11.35 9.4-14.82 9.4-1.56 0-2.87-.65-3.94-1.96-1.07-1.3-1.6-2.94-1.6-4.9 0-3.6 1.92-8.4 5.77-14.46-1.87.97-3.93 1.64-6.2 2.03-1.64 3.08-4.25 6.38-7.8 9.9h-.9v-3.45c2-2.07 3.8-4.3 5.4-6.66-2.17-.98-3.27-2.43-3.27-4.34 0-1.97.67-4.07 2-6.3 1.35-2.24 3.2-3.35 5.55-3.35 2 0 2.98 1.03 2.98 3.07 0 1.6-.57 3.9-1.72 6.9 4.24-.47 7.94-3.7 11.1-9.73l3.5-.16-3.57 9.8c-1.48 4.14-2.44 6.96-2.87 8.45-.45 1.5-.66 2.82-.66 3.97 0 1.06.25 1.92.74 2.55.5.65 1.17.96 2 .96.93 0 1.8-.3 2.66-.9.84-.65 2.73-2.37 5.67-5.2v4.4z M277.32 118.8c-5.9 6.22-10.96 9.33-15.17 9.33-1.7 0-3.1-.6-4.14-1.8-1.02-1.2-1.55-2.8-1.55-4.82 0-2.7 1.13-6.9 3.38-12.5 1.2-3.02 1.8-4.94 1.8-5.75 0-.82-.33-1.23-.96-1.23-.36 0-.83.2-1.42.55-.54.36-1.16.86-1.88 1.5-.63.58-1.35 1.3-2.14 2.14-.7.7-1.43 1.53-2.22 2.45l-2.14 2.5c-.94 1.14-1.53 2.35-1.76 3.63-.4 2.17-.65 4.16-.78 5.98-.08 1.35-.1 3.17-.1 5.47l-8.48 2c-.28-3.46-.42-6.02-.42-7.7 0-4.12.48-8 1.44-11.7.96-3.67 2.5-7.8 4.6-12.4l9.36-1.8c-1.97 5.3-3.26 9.45-3.87 12.5 4.2-4.7 7.5-7.92 9.97-9.72 2.46-1.8 4.64-2.7 6.56-2.7 1.3 0 2.38.5 3.25 1.48.87.98 1.3 2.22 1.3 3.7 0 2.44-1.1 6.47-3.3 12.1-1.5 3.85-2.26 6.35-2.26 7.5 0 1.54.63 2.3 1.9 2.3 1.85 0 4.88-2.46 9.05-7.38v4.4z") + text(x="153.33" y="81.95" fill="#fff" font-size="23.08" font-family="Helvetica") scikit + + symbol#svg_dynet(viewBox="0 0 400 170") + image(overflow="visible" width="402" height="169" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAZIAAACpCAYAAAABSLsgAAAACXBIWXMAAAsSAAALEgHS3X78AAAY 
oElEQVR42u2dT3LbuNbFT1yey9+YrLLeCqy3AjELSEVZgeUBx1JWEHoFoccahF7BkysLaHoFsVbQ UpU0ftYK/Aa8/KJWAJKS+QcAz6/K1dWUY9MgiINzLy7w4e3tDYQQQsi5XLAJCCGEUEgIIYR0xiWb gLjKw8PPEYA5gCsAyWz2aclWIaR+PjBHQhwVkSGAFwCDg8tfKCaE1A9DW8RVJkciAnEnhBAKCSGV CNgEhFBICKlbSFI2CyEUEkJKkfzIQPHRC1uHEAoJIee6EToSQigkhFRmpLi2mc0+vbJpCKGQEHKu I6EbIaQhWJBIXORGcY35EeI8nh8OAQwBvO62i9b6/KUFDTOSGeaVYqaZygCR7rYLhi0IHh5+BpqP 6EiI6yIyBfDj4P/vdttF0lshkQaZiHAMCr51fPBvngDEu+2CA0a/UQrJbPaJjoS4LCJXhyIi/ADQ LyERSzYHMC0RDx2fAXwWQZnSofQWVaL9mc1CetjvW6PzZLvnh1eeH8YA/gYwO1NEjgVlLSExwhcK YFiLEHeFxPPDCYC1CEidDACkFJN+IYWI14qPGNYipEE6CW1JPC8R91CVZxGd9cHMc6QZOA7FJGhz 9QIx0t7TkRDikpCIS0igXqJ5yF6+b1mUQPf8MAAQa35eLiZD5kx6QaC4xkJEQlwSEhGRFMV5kA2A qOqyNRGZkeeHEYBvGjFZgrvB9tWR0I0Q0jCt5UgqiMgewN1uuxies/Z5t11EAO40H49lSTFxm7Hi GsOahLggJJITKRKRZwDD9xbPyL//qvk4lvsgDsJCRELcdyRFIvKw2y6CunIYu+0iBvCk+GgAnpDn MspEOwsRCXFASKRGRJdYv9ttF00M7lNkobJj5nQlzhJonC4hxGYhkRVVuhqRr03tAyPuJta4kikf e28cScpmIcR+R6ITikcJQTWGJN+VroSP3S1YiEiIo0Iiy3FVL/eqxcFcJVbXrHjvhRuhIyHEZiGR PIROLNrcUFHneuhK3CJQXGMhIiGWO5I51Ku07tvcrkQE66niwEPcciR0I4RYLiRTxbV9gUNokqXi GsNbbsFCREJcEhLZ0VeVG4k72u9qqbk+4eO3HxYiEuKmIzHJjeThLVU9QcDH7wQsRCTEJSGRJLtq a/hlx7vvqmanYz5+J1BNCFiISIjFjkSXd4g7/jtTjfAxT+KmI0nZLIRYKiSypftxEeCm64OlCs4z oZBYDAsRCXHTkQBZEnuTiwjMSWqvKCS9cCN0JIS0TO0HW8nsf2jg3/qCPzePpJDYTaC4xkJEQhxw JKayVlwbsgs450joRgihkDSGaoC5ZhewGhYiEkIh6R7PD+lKLISFiIRQSFqnYOUWhcROWIhICIWE kHehciQsRCSEQtI4+6ozW2KlI0nZLIRQSJpGFfbgGe6WwUJEQigkhDThRuhICKGQEFKZQHGNhYiE UEha4bXioETscyR0I4RQSFqBMXQ3YCEiIRQSQs6DhYiEUEgIeS8sRCTEMC7ZBESFHPoVIwsjbQDM d9vF0oBbUzkSFiKyv17JJCNAtlvFENnS/hvFt+/xOxSaItvQNd1tF2u2JIUEnh8GR4PNCL/rRIao cZNGzw8jAHMAAwCPMtC6tGooFxFIuyWeHw4N+BuNTrR7fhgDmMlgley2i3kLfT446OtjjdCupZ2c GTBlsjOVv//mhH86OGin8cHP2wNYSjs1djy43PcSLWwa6/nhW80/ci9jXXJ48cPb25uNHSh/cYby 35F0jnN43m0XwRm//6+jy/e77SJyaHb3X8VHHwv2LGscKUT8W/HRl9ns09KAdpsD+H50+e74pXvn 7xgiOyxuohGNKjwBiLt8lu/sm3MRkCYH4lxUkrrbyfPDNezfefz/DoX20pKOExx83RhwWyrhmXt+ GDviSnQFf2tD78uUAVF1GuiwpvdgIoPn5xp+3GcAnz0/tMZJi4BGAG5b+pUD+V23nh8+A4hqFJRr R8aI1GghkRn/xCDhqDKgDpCFg6YOdBKVUO4NCImo7svZQkSZRE1lBt7E4HMLIPD8cLLbLl4MboMI WbiwK8YA/qpRUDYOiMk/xoJLgzrM5MCyD0xuwd12kUiO5Lgz3Hp+GDkQg1bN/F8Mva8UDnKUg2uS awCp54eBaWIiY0JyYhvkifS0oH/k+aQ8PF51spoLyoMIyrkTmInc18DCrpnnSMwREkk6zRsQj+NV GYdhhrqscQTgh+a67a4kMHTAdr4Q0fPDqfSh6zP6eyozxfVRnx/JO6b7mQMRExMWU+QuJEH1MN4T fifIq0ziUsXvC06YyM4ATDw/nJ7jTkSwr2T8q2vT2BH+zM8BwMeaJ9HKv/eyw5dljveHrVby0uQv 0WvRrEpmeXQl5eI+ME1IXC9EPFpuXTU8EiNbhVVFTOcSMo40v2OALLkcGNAOVVY05X9/8l7xk3+/ lK/cCc1LnsW1uJOzF9nU6QA9Pzxp4K+byxY7SL7a4ly7vpdBIwXwYsiKE50riaFOvNqALqH9YuJ9 2V6IeEYO4BHZiquT/255Z4KDJcp/OD6ZZScdTjDjkvFhgyys1Ng9Sr3UsmKC/5vnh6mNK+CsEpJ3 CsizzBJSE5OB4kpUzuqzxJxt7FyqGenKgJBHoOkfNotIICGcstn3XgbYWlYF7raLueeHrwC+aSZH SQdtMdVMyv7RBm0usZeowtTzw0TGId34NUXPt+i5bKFzlM0wjnnKbaYlS2nn+LOmJH8hAwv7hOqe TXhJnEq0F7iCxgTkaJCMJIx0nIe4btuVVBCRZwDTLsLF0kZRyRjW+8PxLhts/ATVcyAreWGWttVh 7LaLVJYFjhVhAqtciVh5404edOlERGnjZYV34xFZCKfJwXMuE4eB4nrSUnuUiUgnhb4SSYlRvjhn L+3Vay4aeAARgF8VReQRwL9328Vot10kLYjIsKGfG5143SY3YsLM34kTESWJ+1LybmyQ7SDQ+Axc fr5KMG5E8NqY7f8oGKDvOhKRObJFPGUisunKKTnrSE5wIY3Z9TOF5KWGF7LIlUwM2ezwXCHZsBCx lvejSiiri9m37r4m8lmTziwtGCNar2s5NWflypZIxjgSsadpBRG5BzDcbReRQSGsuu4jKnhRbXYk Jsz6bc6PXHl+mJaIyEaceesDk0wSVhohaRJd8rp1EfH8cOj54RJZrrNMRB4BjCgiNTuSijOtJyiq IV1CXMmjwg5fd7mk8sQZ4rWhA7bNhYhl74YJ+12pcjbjBvtaVDDpbG27lhNXlK7kOdkygbFHSGRZ 3G3JTGtqUOMPG/75kaY9InSwpPJEJgWDTGc4XIio3I67I1IolgJ7fjiqe1CXEPg3zcd3bY0Vkq+K US2MFe22ixhEy8WZD+GqgojkFtCkF77RlT/iuB5Vv1fCfyajGrBXhp4/Ynsh4gZZ+MaIyUXBO9rE slbdgPzYRnt4fjiSUON/KojIA7JQPEWkIUdStCzOpJlWFeoeKHWuJPb80OTlzYGhs37VfdlciPiM 
LHxjWj/Y48/wTlBnH5Bk9lgjrE0fAHbKDgLPMobx+OamHEmJEzFqpqWw1G3M7tYykzlmAEPXm8sL btz+WgWOJLX0fXvcbReBoZOJNgZN3bgwbbJNJBqwriAiGwBf5BlRRJoSEkms60RkhSyUZeoDuDrR 1r/XlewV1+cyMzKNiaZtus6PDOFIISKy+P+0rwONTFZUz/KpqfC354eB54cvyGpVypLp9zJ+LUGa ExJJTs0KRCQwvCq9tQFc2iG2yJUEqhfcUDdioyO5syjU2xSR5nrt74Ms502QLectK0l4AvAvw0oS 3BQSWRqaWCwiukFp1eDvi21wJfJsbwwdrFUCZ9uJiA99FxHpY6rcyGPdJQGytPgF1arSP+62iwkr 09tzJInGGm4sERGdI2nsvktciUmrQALNdRMsvgv5Ec5y9Qe91fYeSBhrjWxpcVEYaw/g6267GLIm pEUhkX1nxpoHMrHIDnZxfGwsYnvMbRt7GVVElR/ZGDJLc/5ExJ6g6mOrOvKpEsZKUb0qnct52xaS gyVzKmxbHjdse7YoIqtrv8iAkMMV1MeZdu5GXD8RsS8UhE6T9/ZdCWP9jfIq/Gf83giTDrEDR6I7 S+TJwrhvJ9t/SDuZ6kpMHqydPBGxh+j62NmTFVnO+wJ9hXxOvoNwwDBWR0Iig9yt5uFMLZsV6Tpz W7MTU12JKuSwN2QJZKCZWRL7hWR1Tuj0oCr9B6pXpSd8BN06Eu3gZ6E9HGrcQiuz2xJXEhgmJKas o+8ip0XaeY4nuYODLZl+oVoY61+77WLOMFbHQiKxc10SNnakM69avoepSa5E6oKMrGYvKERkeMI+ bt4zITjxkKm8Kn3NZjfDkUw1g0xk6d/Z+exWYrSq0My4I1di5G6/Bc+LjsQyCrYlWlf4t3lV+neU L+e9l+W8rErviMsTZs8bi+ONpiwjjZAtU1Rdb1tMVELyZEg4QNUWm9nsE2eadnHytkSSm42hXk34 R3+F4+ccWetImlquZ+CsqHUhMcWVFIS1mB8hXb2nVwdV6WUisgKr0o13JLqQh61uJDh1VtSRK0nQ /MFbZc/YFCFROciUr6sT795GM7HhIVMuORLoq1DXDnXmVVc3IwLW9eFXxoa1CgoR6UjcYH0gIEMe MuWuIxkbPFOtS0i6nt1G6OhIXkvDWpjNPtGROAIPmXLckRTE6VNLO+wIBi5x7fhIXtPDWoFmMCFu MET1Q6bueMiUhUKimw1avL2AycIY6a43vM28yau1dH2QA4mdqJ7bNaofMpWwCe0UEtdmg6rZ/cqE QbPgSN5rNHT4lbgdY8NaLER0jlPfMx4y5YiQDF15iQuWMZuU74nQ7uFXpoe1WIjYT3jIlGNCcuPQ S2z6oNnqkbwFW8Y/GjT7UzliFiJaSoWQ+GFVOl2nC0JSULhnq8WcqgYlAxN3bR3JOzVdWMH8iKtu Q0V+yFTEJnLLkZy8nYGpWBLWquJK6nzJVEKyN2x/IhYiuodqIvDMQ6bcFZKhxnraiC4slBh6vzpX Mqvj8Ctxm0Zve8NCxF4JybjhlYnEMCGx9SVWzb5Xpq5Hl5mZTvzqcCU2CCsLEd1E53gnbBo3hcQJ Cpa4Gr21QlNH8hacLWOasKocCQsRLUf62J5CQiGxDdUMfg87tnmJGnAlE0uElYn2frmSz3WEbQmF pCk3oipoW9qQ2GvoSN656cLKQkTnSTTXp2waCklfZvRtU1uupCDJbpqwshDRYWTlp2qCNGfSvfHJ dSvte+FQg+ncyKNN1bKyHLeuw690omRaWEv1d7EQ0S1aKbw1YeD2/DD1/PDN88PXto6GKCjTGJkg JEOLVDd2wI3U5qwsSrLrOjvdiFskUCfdvzmWK1nidz3UAMCPPuSCDoVEpWjXlvwdEdQJ5Ucb9+6p 6UjeKexZvcZCRMcpKLzNRcYFNzLR9OW2hGRvgiN51TROYPjDC6A+22BvqRspcx9VXzrjk+wACxF7 Rgx1rmTs+aHVIS6JAOjezbbykap3ppXx++JgxqB7cQNLH15s806i4kqeVC6xLO5akC9KDFy9xkLE frkSnWB8L9jvzwaWmgjAc4uh5O6FJP+jFd9jcuFQohkwV45sBnfuCq5pwYzQNFQdnYWI7orJUjNB AoDUxlVcnh8mUIe09mh3ibNq8jVoI+F/oVDVY25MTBZ5fhhDvS064Mj69HOO5JVQn6pTPxnq0Jho 7x9TqOP5A9vEREJyt7oJX5vvnIj0vovxsIqQVJkBt/3wptCf+Xzv2BnP0YnXrXEjLETsrSt5hT7S cWOLmMg49F3z8dNuu+jinVON4WNZCNCOkIh6rhTfd2uKK5GH90Pz8bNr5xuUuJLoqG2GmtnRytDj AFiI2F8xSQF8LRGToan3L+Es3Ti0QndREd34FzcpzhcnzFwTQ2yk7uFt4O5GcHNUO/wqssWNCIHq ObIQsTdiEmsmSbmYvJiWgJeCwyX04aw9gElXi1qKJp5NjgMXihtJoF+iF3f48JICG9npw2spFFBY GVzgRjbyTE2E+RGKybRATAYAfh077w5FZCT983PBOBQYkIvUtddtU8usLwpmwCpmbZX8Hz28tGQG EDiWF9G5iiJXEp3YqUyAhYikTEyArPo97cqdyEQ2BvAL+iJtY8YhEbJ7zcffmxCTC82NFC3R+9GG Mzl6eDc9F5EyVxJrhHZvqhthISI5UUzG4k6SNnMnMnFeQ7+4B8hyIkaNQ5IrXhWISVJnzqRor60p 1CGu3Jk0Er8UAYkqPLzeiEgFV3Jb8P2mwkJEohKTryXfdgvg7yYFRcagueeHa2Q52UHBtz8bPA4F 0B+XfossBxU0KiQHS/R0N3JT5wzB88OJ5EH+C+BbycNbARj1TETKKoNVQmubkLAQkWISA/hYMIk9 FpQXGfTfNQaJeByOQd9Rvtfg/W67CEzNzcp9FYnJNYC/JGz4LkG5LLmRXLHSgoH9FlkSZ4VsZVda NsDLQx/KHzmS/w6qRkV228W8xy9aIo6trJPHhi8+UHVchrUIdttFng9JoE9sH05ovyML12xkrHqR r1fVWCQhnZGMQSP5Gp9wixsAU0OX1J8zho9FUDbI6lBSAOtTJuqXJ9xIAn2u4vCBwvPDvLHXilno 4Mw2sebhtUAE/TJo493Iw8PPK7AQkZTPpicHY0+Vnciv84ntgWjUeVt7maBFlrVlnoZYlozh18jS CbOjttsDmBflWy+q3ojMIJ9OuP9rUbrDr8GZD+9+t10MKSK/XUmJ9bfRjbjgSF557/W7k912MQRw h/JwV1Pska2CGtpa8LzbLta77WIE4OGMf56fqzJ6l5DkM4TddjFBtfhlHWxsf3gNExe0W2z4vas6 pAuFiKoZ29LS/rSHQeeE7LaL5EBQ2sql/WMMcqFOTdIC/z6zDbUF35dn3EgKYCjL4uYlVukcnpBt d74EOeeh2tDhAwfdCHbbxdLzwy/IVjy+Sj9+seTeU88PPx7d+9rA+0wA5At8JnK/dY5BeZ5l6eoY 
lEeYJGw4R3keKkfbHz68vb2966bE7kzk69QHupcBJAXwQvGo3OYTAP9RvQQyazOah4efqk73dTb7 FPPpkjPehyF+L9zJv6qG0Vf4nZxP+7YSVNrv6mAMH0Gdj3qSiFQzQqK4qeAgfHFc8PJ6MPN8cXVL kxYe/FrzsL+YLsYPDz9HyIpMj/nIGhJS83uiGoP+34GxhSqN44Bm9VujQkIaf8ARsjqbY55320Vg gRuZQ7Fn2mz26QOfLiF2csEmsM7C62pobKmtYSEiIRQS0iEx1LHfR4tiuyrXxEJEQigkpAU3EkC9 umJvixthISIhFBLSnYhcQb+m36b17YHmOh0JIRQS0jBzzUx+1dG50OfiaiEiIRQSYrQbGUG9SisX GJsI6EYIoZCQ9kk01x8sXAvPExEJoZCQlt1IBPVuARuYfYTun6qXFSKCjoQQCglpT0QC6ENaUwt3 BQhUF1nNTgiFhDQjIkWrtB4s3d6BhYiEUEhIiyRQr9KyLqRV4kgY1iKEQkIacCNT6Ld1tjGkxUJE QigkpEURGUF/hO69xTuWBprrdCSEUEhIjSJyBf1peivLT4lkISIhFBLSAkuowz97FBxxabEjoRsh hEJCanQjCdTFekCWF7F95s5CREIoJKRBEZkCuNV8fG/78cMsRCSEQkKaFxFdcv3J8rxITqC6yEJE Qigk5P0iMkJ2UJWKFYCpI38qCxEJoZCQhkQkhfq0wz2AiY31Iic4Eoa1CKGQkAZFJHAguQ6AhYiE UEhI2yICZCu0XJqtB5rrdCSEUEhIAyJyZ/sKLQUsRCSEQkJaFJHEwT89oBshhEJC3i8iUwC/eigi AAsRCaGQkHeLSAR9nYjTIsJCREL6wyWboBEBuUJWI3LbRxERAtVFFiISQiEh5SIyQnYw1U2PRQRg ISIhFBJylojMkZ1gqMuH5HUifQjvqBwJw1qEUEiIRkCG4kLGBd+2gnt1IkpYiEhIv2CyvR4X8lIi Ik89ciI6N0JHQggdCTkSkABZQv2m5FvvHdnF9xRYiEgIhYSUCEhU4kAAYIMslJX2sJkCuhFCKCTk fAEBslDW1KEdfE+FhYiEUEiIiMcVsnNB5lAnj1UuZO7gnlmVYSEiIRQSike2AisAMAHw+YR/eg8g 7rELyQlUF1mISAiFxGXhGCFLDo9kELw58Uc8AohcOUOkBliISAiFxDmRmBxdHsrX1RmicTw4Rj1N pp/qSBjWIoRCYqWITFG8YeK50IFoYCEiIRQS14hr/FkrZJXrCXMgJ7sROhJCKCTWMqhBPFIRDw6E 1WAhIiEUEqd4RrWaj0PheBHxSBm6qg2KMCEUEmuZIAtvDRWfpQeD3JqOozaWAL4dXUvYLIS4zYe3 tze2AqmNh4efAbIizisAMetHCKGQEEIIIYVwG3lCCCHv4n8phXIdiL9d2wAAAABJRU5ErkJggg==") + + symbol#svg_chainer(viewBox="0 0 150 80") + image(overflow="visible" width="150" height="80" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAJYAAABQCAYAAAD7sIxLAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJ bWFnZVJlYWR5ccllPAAADElJREFUeNrsXQuUVVUZ3jMNqMP4YDSRhzIkYSryMKwIH4MKkhpioi3x sULwUaJpgpXDwpnEwAxFZZbSkuVIiqJEFMpr5SQwhIoOKILNKAppGJoEihYMj76v85/u5nI5r3su 955z9r/Wt8695+69z95nf3c//v3vfxepmMpGVX4QLmcAvYAy4AtgLbC4g9r8WcjP6o3LmUBbYDPw ErACz9mjEipFMSRUZ1zuBq5wCPY2MB54FpX/7yyeNRSXh4GjMvy8G5gD3A8sSxrJimJGqi64LAK6 eozyAnAJKn1rgGddisszHoPXA1V4zktJIVZxzMrzax+kopzjgxzp8isfYc8GloCMo0yLFc0WK2h3 8+z/ojvLDhmn2XJnwGd9Hy3XM3EnVtxarG0B47X3EKY1cISGpoDPuiEJLVZJzMrzCDA6QLxRaEVe 99k6nsKZH3CQz2cdbcZY0ZOJwDs+4/zRL6koiLMal6uB//iMOt8QK2KCyv5EWbqr2cAul+C7ZeA+ JIvnMf4JMuvzMr7jWK7KDN6jPZDn7PBG4Ba5tV1ZyssNomaYA2K8GuLzvo7Ld4XY/Hy49nMDMAHP m6eMxIJcgzlTBGbl4dltgW7AB8Am4KumRuJBqlbASiHW6XnMR3dgM7ABOCIp7784xmU7S1nrhG8C f8njuI/PvwboxLEfyPUlQ6xoy0/k+iQqd3eeJxVcM7wH6A9MMv1JdLvB9tIFsgs6vIDyNV/yNTru dRDXFqtGrguCLDDnUNglvgHcC3JdYNQN0WqtSnF5D/gyx1gg1hsFlj9q7JcCO4GeyN/fDbEKk0gc DNMe6jBlreFdCdwMvIxK+1aB5pka+8eVtSR0teSfC9zNyPM2Q6z8VQzX5y4Cfgqcup9gNOaboixj vg8LLP/9cPkd0C7Dz7RunUriyYzSEOsAVQoVjbXAAI9R1gHDUEmvFEj+Oc561OO7nwHcjbyvNcTK baUcKYPfDj6jsps5AxXUmOf8d8eFBD/ERzROPqYg72PNrDB3MisAqSgc0M9ExZblOf/f80kqCtUl Vcj7oihp7iNDLLzUb+BSmUUSXJQemediVGQRl13/LLyHIkOscGVgCGn0znMZXssyPm30qw2xwpWS AkkjG1mo9rabDyIjZFZsiBWSbAghjTX5LAAG4LRunZBlMh2BPoZY4cnTyr/ZcboUwu4YbhvL1tqi tSFWeP927li+I8tkarhAneeidA4wM9SFCtQVhljhkos248OVv21etEXn7mjqj4ZxgglyTQQ65WFm +yPpjjmJ2KKsjRV+l3CmRmHZJ3LWDXipdbj0VdaGCbcNDLRpH4Q45wHfxOeLlbXUw6WgVajocUDJ ASDUscAflLVi0Epmh5XI0/lCsgeVtSjtJLQpm4w4Y6JQT5FehEZl9VSWpSgr6CTgUGXtaObmBS6H LE038kOcNsraNPpLGausByYi3FQPz2svXRnf2zrE+chDnMHKWrM81m5xSOx0cx6Eq5CyXCBkO0pa My7ncBPGfMRpjkrdxHaXjocK5/LQfcqyhmDL/S4wGpX3+7Rwx+EyTlm2VEUZWhFu/XoAeEH3XIN4 JAa93lwnt/4J/BBhZiXh/SaWWBoBaFrzM2VZS1C4NWw8CPAifmML8luttXGSt6Tlmy4L5ZyB9pLf aH81Ar+9nZT3mnhiCbmKpDtll1UhLRFdDnFc5nfzA+N1VSmfWfdyNgpSfZ6kd2qItS/JODimoWC2 s8b3gWtBqIVJfI/Fhkr7zDrZwjwWQlJN0q0qQywjtvQPIY1zlaVzM8Qy8n/5Skjp3GOIZUSXf4SU 
Dv03nGuIZcSWJSGm9R1DLCO21IeYVidDLCP2zHCushauw5CNhlhGdKElwhchpDPTEMuI3mpxPyIX kHdkkczKJB0aYIjlnVxUcNJp2/oA0f8GXJrUd2eI5U4uWmuerCwLBq+t15+AgdLqJVLMWqGvUXh5 B+ker1cpywVbPgVockMLz+VJf1eGWNmRjBp6WqDSzHgtCLXDvBkjRowYMWLEiBEjRozEa1aI2c8x yjJYq1SWnylugTpSfuauXDpn5QlY1Nk8BdQ7nX+M9Hio0dPydYZYbBqJmZQ4EICHDdFQra9DfDoC s3ewkHgjgG2IS+f90/bjuJ/by20d0OKIqhqqVeqE1YvlgAAjTsTCSytX1tnK6Wa1XPOiRnmXtFRs 7cokDW6Tss+r4T06yH/UvF5DLJtUdEvIPXAnabcnSevzlsu/mMse3Jx5E7s6p+7QSIKIJT4MntdI RfPcK2Uh1lUQjs4ufox0uLX9rzF/b3XAi/L5TUMj5xbrIaCffF6vLC/DH/hNEHFejvtLQxnXq2AW D4mRYmmteihrYZXyMTAkCKmMGElvsX6uqR5qgxy+HcJMi16RaTnQW9QaStQY9LYyl74UAqZ7sEwu 2BrTp0I7KSsdePDA8JkH0v+7bOenq8f+Wn7oJedTKetsMdXJ9jmchJ0NdFOWVx1Oumgjtlha3GzS biP57yNaAboh4AI8zzBahPRfK+JJpCplZ9TCSsUPO3P4YqlqWClfn5QxCol9mEtUemu5GXl7yiOZ 6FN9jNrXvCVzFGs7/DyXdKlS+XPa7f5OpBc94NfkKz3WDFPu/iBYQSORbr3Pd8u6vAz4hXLeG7lF 1CWP4RmfeUiX5a6UPyP//N92iVJTovb2nb4wl6TKIFekfd+kLN9WzUJ2ku0SgK6E6GRjBgq5Uzz7 OclAIa0tW0VV0iR/Hgr1aRfKZIUmMM8h7eFI+/GQyzhIZd6yT99aqwCe89NGWi622vSI3IX5RX5u Q37u90iqo3H5jUp5zVFChCYhUlvgRGm9qH+k4eIQxLsQz3Cz7a9UKb2dLi3yp9wj9WMf0NBSIs2l Lc/loTveKSoNkqUxXU2Bgt+uLN/mVXKrWsI6CXfZ8J9Iz3njSVakuz1DZdB90bXAI9I91uLeEwi7 az/prpIu4EZgaMDykvBTMtnCi2M3buIYK/mZhHtzxduym8zWJl90lzQKWK63SEiLXS430NJz8wlS lmnA5T7LMFlZm0QabRs06eJ5ZB7dQq3hjTly6uceafJyPcbopT2vGejoMd48Ld5pHsIf6iNPt2tp n+8h/GSv7wy//0ALO9pjfu7S4tR5CD9GC98gZzY6jpGAFVqcQS7hq7Ww13udFebzaNs1Pg6C1Mc2 rucQehk7aKKfDNYlh+V91WO4h7Vxb2cP4UdoXf41bl2b+OqqSuvqvEqTZ3VDROQ97XNoLrXlIM1y 7VarAtCTbdQmOB1c8t9PujXKUq9+ShFukYxlKafmQt3QEhFi6ZOK0gAEKpUxBdc0u8tAs0wGtG0K sLz24Z1uhwWcqX1u8HlC2EeijuiaC2K9r31vFyclnagd6IKb7rd7OgTdraK7FU4/V2eiwK+0zQWx GkW/ouQfPTMmpOom0++ztNtcVaDrxnUyhmmW8nMcUx8DYgWV1rkg1gLtO5VrN8SAVGUy2LfHJ9Rh 1SjrAPKWDOGPi3BxD9YnHtlq1cOSYtmtu8VuEvGSz4lBg1WrkWoCyjgAaMhEqhiIfhDBMYWSKXtc MUm798CBOAYkx2Ifu8a1sWoVb9EnNQMKjVi09rT1PifL2CTK0k2u7yZgd7Ku3xtcUMTCy6dRH5cA 7OWU4Wi1pgc53BpxTgROz3O5bB+iFTFofR0FdbdMpY4E7oPy3llILRYzOD1tqnoVQLX/dV6OipUT rnh2zCsF0OLZZtQVwG0u+S4VlUSUZaz2mcsv9wGHuJSbf7paOegqdClJY/8deBCXA+6SW1Sc8bQq Hr9Gc+PVMkXnqnmpDBa58MiVbe7msc1B2GoNzeOBRDXaeINnE3LFnwvTPIhpu5SbvkGptab91/ER b7UWoIw0lRknt24FLsc9WvMuF/UKx2JcvuslZT5N6vBfwOs5JZZkcjwyxA0VPEOvh9zuKPA6Y+Re wSV5fNHLUIaRylpzayWk7xvzsRaJxWUvmpiXyZ/+IrW3GU0mOT7nLZZWMdzv1xOVQ2bfIi1XD5Wy t9GFg2NupKDykZsxprkcSMSZ2nDts1dp1OKt9UCuacg/N8ZSL3ee/DvbqdQy1seSd7bET8hYxavQ D5atL3IzaWnQ8u1nkwkJwsMzt3n8M9HUp042s4yU7p1lpp1WuZR1q4w/GZYmQHWIt9pD8gvkfX3u tQy+/GNxHKVSu6A50P8E2BRT/ZCRLOS/AgwAG01541UNsGkAAAAASUVORK5CYII=") + + symbol#svg_nltk(viewBox="0 0 93 28.5") + path(d="M1 .6H5l13.6 22V.6h3.5V28h-4L4.2 5.7v22H1V.6zM28.3.6H32v24h13.7v3.2H28.3V.6zM66.5.6v3.2h-9.2v24h-3.7v-24h-9.2V.6h22zM70 .6h3.5V14L87 .5h5.3l-11.6 11 12 16.3h-5L78 14l-4.4 4.2V28H70V.5z") + + symbol#svg_gensim(viewBox="0 0 100 100") + image(overflow="visible" width="100" height="100" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJ bWFnZVJlYWR5ccllPAAAFahJREFUeNrsXQdYFNfa/na2scAusPQmIIKIKCKKNfZ4NcYajSVqjCWJ udHEJJp2E/VP+/+oSRTT7HqNoJEYza+5KklQscQCKE0BAemwlIXt/Z4ZBAHZ3ZndWVxMPh8eZ2dn 58w577xfPWeGAXYuiZeLedv3Xppyr1wcg2GMngqlxpvLYXkZDAZ3lVrH1+n07LbHs5iYGn3fBAyo V6m1Ip4Du0qr1d8NDXJPf235iF+eGRSosef+MuzpYj7Yf9Xx1G+3o+oaZCO0Ov0QgwH6KlWacPQ/ h5bOMhhKLod5G8OwHCbGuOrl4Xxu4TMxua9NjVL9Dch92ZdSwNqdeC2ipFw8Xa3WrUJ3tTfZ3zo5 sCDExxF6ePEgyNsRIoME0MvfCdwFHKiTaCCvTAp5pRIoFSmgok4JRZUyqGtStzsHYlA5h8OM7+Hn +vP7q8flT+zjpf9LArJi02/Csxfy31YoNHM1Wl0g2oWR+d3wvkKYMNATooIFCAhHdNcTd35bGrRl RIfOIj2GACmokELG3UY4klIKYmmrBtOzWFgR34mbMHda9OefLh0q+UsAEvfCv0dVVDctUWt0s5Fu 
55s7HtkNGB4phGGRbjAyyh283bjtBpoMGAzofL9aa4A7ZRJIzayFczdFcLdcCnpcP7KZ9Ww280d/ H8HeK7sX/vlYAjJk2cE+VTWSdyRS1XPoI9Pc8TwuE+Ii3OD1WT3BV+jw8CBTYAXZ39wqbIRdJwvh Sk4taHUGfJfOVcDb7evN33Rxx4KCxwKQeRtOOV6/WRbf0KhYhDwjtnnDCzBugCesndsLXJ3YnQ+y FawwCd79zw0SNXx7vACSzpcRjEEsVSFV9t38GQPeRqpM1S0BQcaau2HL2VeQa7oOGWofMkBMjvOG OaP8oE8PfueDTBEI0r8xsi1qVEHCb/fgJwSMWKrGVVmRkyNn07efzdyBjL+u2wAy4sVD/shr+kKu UM9CNxjL3PG4V7R8chA8PdQb2CzMNBA0soIs8zIKxLD75F24cKsGZ4vagctOCA5wXZe6Y0GN3QMS NW/Ps9UiyT6d3sAjc/z4mPbqyVpWkAaPQR288zdr4F+7bhJsQQGozM9HMDfj4JKTdgnI5DeSXHPz a7ZJZKrncLVr7ng+jwX/nB4C04b7AsYgP0h0soJ0G2228Thm5y/5kIhUGWpa7ezE3R0e4vHWma2z 5XSMI5OOk4x55XAgAiNBJldPIxNP+CCv6V8Lw+HJWK/WcSB7x7Z8x7j/r+N+OljR2kYn244oGB0R 5YW8QBZkFoqZEpk6Fjks/eImL0kpuvqj9JEzJHbx/mHlVU0HUVzRk8zxOBg710SDpyu3S1xZi9sg oR4LyiXwwv9eIrwyFPFnB/i6zP9zz8LMRwbIgIX7ppZVNu7T6w1CMsf3DxHAFyujwJnHotdWWDCw VMFr57212d8o08D6PRmQfL0SmBijwsuTvyA7cem5LldZCIzhCIyjCAw3Msf3DebDp0sjwY3PMaoW 7udBTKonKgNLqY0Oao8MGPgxDhwmjI3xgbzSJiiqlPIVSs1TwYNnJ4uyT1R1GSD3mYGD4UpWTX29 qj9SUw5Ws8KagSXLik5vBCPt4cJmMWHiYD9Qa/SQllfnpFBqZ4XEzbkmyjp+z+aA9Ju/d0hlddMR smrKw4UDu94cQIBBaWBpMNpUwHsISKDGQuQGw7C+niBXaiGdAEXzZMyERadLbySJqIwvRuVg5E0F iOqkh1CM4U7meB6i89tzw8DbzYE+W2EjVrRtmwwrOtvGE6FvzIuCUQN8QKfT+98rbTiEgmQPmwAy 54P/dykort1L1pvCZflTQTCqv4f1rGhzvK1Y0dFOWWqb8EzD1teGwLhYX8QWTf+SMvG3Sz49w6Nd ZclcnvgWxRkzyXpmeN1izexeBJXNDRIVD8qsuqEAnlH1RFG9dWwPB2UYilWOphSDRK6OKK0UOzfl nzpNG0P6ztszSypTLSJ7vJMDE96ZFw4cFkZKPZljBRl1YQl4xtQTpTjGSBvuLg6w+90n8EASU6q0 L8Us2j+eFkBGvZzoI6qVHiCTJGyR58YHEp6V2Y7awJW1Nq6wlBWdbfcPFcKyp8PBYDDwKqqbEoav +MHNKkCSrpeyi0rrN2t1eieyYESHuiDbEWxZ2oMGo20pKyixkIJKe3lmJMwYFQwajc6zpFz8kVU2 5EZNxEqJTP0mFVuzdm44BPk4WW0rLDXalrKCfhbiv8VQ9I5B3xAhHPm9EFRqXVTQoNmltTknblFm yKpt57jIiK8jU+VrkZherq1elbW2wlJ1YSkraGVhu3MxwM/DCd5/fiCgcMGxtkH+Lj7ThjJDalhD vkRu20QqPvTml/sRxSa6XVlL1QUVV5aegLKZFQ8c0QfnjeophIu3qqFcJPO8fqtMW5/7y3nSDEHB TC+pXP0SFTDwsmt4gPNj68pSApJo++HzvjijD5G+Fzcq1w5e8m8f0oBUVkvW6vUGSrMF18wOe+S1 Cqq2gp5sgHFWdDx+wqAAeHpEEFJder5EqlpJCpChyw4OETcpllEBw8+DBzFhrlanPegw2lRshbXZ ABwIc6zouL1iWmSzSaiTvt/n2d29zQJSWtH4PNWk4xNR7jYN8Cw12rZjYQsrwCwrOm6H+gtg1AA/ wn4rFOolJgH59HCaE6LTs5SSYRgDhka622WtwiYstIAVHdvbsHww8VGt0c09k1vDNArIroRra9Vq nTsVQCIC+RAb5mY3tQpry7emB9YyVnRsL8RPAFGh7qBUaUPWbPxlRdvxbPWHL5c2woylBxZQrY8M RLbDiceizWi3iE5ngPJaBRRXSaG6QQlNci1otfrW2gPfkU2kZwK9nSHE1xkvn1oNROel3w5OCdDT vxemRMCb2y5CvViBA/LdQ2dHxrxnXmFtPtUaycdLo2BynI/Vk9Hw/Wo04LfvNUFqpghu3W2AgnIp iMRKk+3jJeHwQAEM6CWEuEgPGBzhQQBGS22EweiQ3KaPhRrU1xEvJuH908RE+fVI3janqh1DKmsk U6mCgQu+JsNaVuAXd+SPe7DjlwJiEhoVwWd8/JlTS/x9fyKPmEDx0vTesGBCT8LntydWtP2Mp+jH xgbA4eR8dmmFeBLata/VhsSfzGbrdPpXqYKBF/gDvRwtdi2Vaj18fSwPpr57Dj5PyKEMRmciVWhh S2I2TFhzGr48kg1imcamrqw1tmlsrD/xWaXWLcdNRitDEo5lhMkVmhCqnfd1dwAmEzOqkkxdaC5S TV8cyYWruXVgC8GZswMx5lKWCN5fHA0x4e5mGGIbVphS2QMjvABDn5skyqHrt5zFFy2VEqNZXSsd BRZMeAgP4Fvkyv5wthjmbUy1GRhtJauwARZ+dA6+OJxNmytLV6baW+gIowcSLGFWVDWNaFVZGq0u zpLO9uvpSulC8Wky7+28CZsSc6ArBffYdhy/Dau/ukKotM5dWQZpVlicfOzonqN/U4Y3144QS+Ja AUHuZKQlHQ3ydiJ9objh3oJU1MnL5WAwwCOR03+WweaETJDItSbzT7aYzQJGUjy9g5qLiFpdMwYY MiYMFDH2s6SD+JwrssnDr3680zxj/BFLwtm78OGuG4+UFW3PGxkibFm4Gk0AEr/30hN6vcHRks45 ObBIJQ/3/acQDiUXg73IyUslEH802+ZpFiBRoONyWCi45YNCqfHccPBaEJZ6tWiwpR1j4bNKzKQt ymvlyNspINbq2ZN8eywHsosabJJmaQeeiXnJLduBXoRzxDyZfHs4xmYx+1jeLdMXp9Mb4M2v00Cm 1IK9CW7TPj2QTkz9pDPZCSTmJXfc7uHT7K0ib3cAJpWpva3Bw9TFHU8tg5ziRrBXuZpTA/tO3aHV lTWlnoydy9ON15I598cwDISWdkil1hu9A3AFlfj7PbB3SUy+S6xJtxUryKi30IDm4p5UphJibDbL w9LOyFVaoxd68nIF3L7XaPeAVNTKkDtcajNWkAHS39OZ+J/DZrpier35x1sYk6p6pdFaxaGzRdBd 5JP9N2h1Zam6ztX18vtOEtMNQ86Pxauo7pRIOp1gIBKr7Np2dJTqegXqi5g2V5bqbJbbxfUtuzDM 
mo7ktFFJbS8uLb/O7txcc3Ilq9omrCATp5WLZK2HWQXI3XJJp5PR8AX23U3S7ohswwoS5eV7VU30 AFLToASFSteuYTz2SEmv7naA5JU2gliisirAM+c6GwPvbpn4ASBoh8XPIMQHv6BC0q7hukYVNMrU 3Q6QkioJlFRLrQrwLEmzVNXJiBsBF4PBoMHjEKusb1ahuF3DVchAdkdRIBdeJFbQ5sqSjUMy8h6s CdVq9WJ8dY9VVaLMwvbeCV5v6K7S2KaEzADbr/Ql7G16WetHvd7QgKJ1RoNVxjCvDirqFK2NqTT6 bguIwQasMOVxKdVa+OE/ua27HXnsesyRxym3phNlIjlcyRZ13nC3E/pZYQq8y5mV+ASHNglPXSHG ZDIyre1GSnpV60U481jdFg58kSrdyxVMgXfmcnG79sNCPG5ik8b2voyrL6uyprm1iH7NSAucON0W EBdnrlUBHhXw8MD5j+ul7ZzWyWN7X8TiV43O4HJYYms6IpFr4Nj5EqJhP0/HbgkGvowZT/LRuWjI 1AMS9hzPguLKBw6ukyPn3rrZA0TY/d/dsrZD247mEEGiK7rLevkLuh0geAo82FdgVYBH9kkVODv2 nMjsGNPdao3UWSzMajuCzzq8mNkcoeOPlehu0ruHK3A5THpY0dkxbRyG1IzydtE5LmxmMwYEIHwn 7lU6OvXhrnSCJcP6eXc7QOL6+tiUFW1d61c/T36ofeTyXmkFxEPolIqzxtpONUhUsPdUPsT29kDe FrvbgIHPlh8V40+LK2vuWV8Jp3OJdEk7Z5vBUMb2D7jaCsj7q8eVuPAdMujo3NE/iog0RFykZ7cB JDbCCzxcePQtmzYSXJaLpLD7+MPWAY39mYMfTKptBQR/RQOHw9xOR+cqauWw7ptrMP/J0G4DyCev jKBv2bSJRa9bDl6DzIKHn2fG5TJ3t7K1ZSM4wO1Mbb1cQ+XJDcbkXHolpN2p7RZgDOvnC32ChdY/ PNPMmvzUjDJCXXUUJsaQjR7aMyX3SPPn1nrI6a9mVyCWZNLVUTw2sXdx4LBg8ZRIcrNDrABDLFHC a5t/w+fvPnwNDuyr370xrrVC1a5AJXTl7YS/kDyPwJg+KtSqAM/cUnC93gAbd16CkjZVwTaiR5pp W9sd7Vfhbpr9vQOXVfRXAMOVz4XX58fQ4sqaYtFmZDd++DW702tgs5hZF76f/7NRQIYFuhg4bObB xx0M3CU/uHEyAsW2j609c6UI4g/fMLr8wsmRc6Djvodq6hw26yAdMYm9Cr7Yct3iwRDbx5uSK0vG VrTdn1NYC4s/PImvRe/0OpAxl/t584+YBSTvpxV5Xu7OnzyugMS/NQ5WzOhH2WhTeWzI2T+LYeba Y50a8Rbx8nR+K3XHglKzgBCpFD53O4peGx4nIPCywDdvj4fpo0MtMtpkVdqVrAp49fOzUCc2PreA xcKqvN35hzplTqfBXcYxeUjcHK1Eppr4OIDR098Ftq8dDxOHBpFPHlrwiPNLt8ph+hs/gVxh0uXX 9+whXInYca2zL43WW8/k1mCLVh9O02h10d0ZjFVzY+A19OeEDLkt38CwccdF+O5oOqi1ps0vz4F9 qfzXf44w9r3RiXJ4OsVD6PgxE8O65AWLdJbiccONq6aTX82E95bEgbMjh3aj3bJdUy+H5R/9iryp 62bBYDKxWheBw3qTx5j6UpR9IsczappQrdYNtyUYvu5OcHrrDKKmUteoBJnSsigfjy0mDw+B/Rsm wYJ/RICvhxNtRrszMC6kl8KMN3+CtNvkZmq6Chw+zktaccDkjWnuJJPfSOKnZ1VkqzXE61FtJoMj veGr10dBgLczchnrIe1ODVzKrILkayXtZma0S8pxmDAy2h/GxAYQ9Yw+we6dvuWN7lfsNUpVsOLj X+H3a/dIL/FGhvz23KnRA+JXj1ZZBQguAxfvjy0tb/xDZ8VaEjLi4eIAs8b2gpWz+rcu8yKY2qAA sUxNrAtsUUn4hARPN8eHHstEVzKwMyDwKZ97TtyCnccyCFVFVjCMUTegr9/w5G1z8syqbrIn7fH0 N+ulMvWHYOUEbTKC3/kbVwyFKSNCQChwMDtQtABh4jf4wtBfLxbCO/Ep0CBRUrSNDI27m+NapKq2 kjqedEB1Mpv95Y7U7eImxYtdmY1d9FQErEZeEg4Mg+QDXuhihQy5r9uPpBGMaJkQTdmuufA2fffZ zHfIvhackm+D25ObOZWnlCrtyK5Od0SHeRJxBG4zokI9LLYVpo7HZ/PnFtXC+bQySLlRQtQwkO20 +LodeezjE0aGzd/33kTSM9ApO5sz3j0ecOVGSQq60EdSEsQTg3geKqa3F/EXESyEQG8B8TBOqmDg 03Gq6+QECOnIibiZVwOXM8stZkO7nCCHmRUd6Tv69Jez6ympOEsam7Dqx4ic/OokxJRIsAPB2RLk KyAeURHWQwjhPdzACxn8B7MoGYSnhk8uyCuph6KKRmIaDl6jUKjon62PmJER2y9g5vH/m15MOR6z tNHIZ3eHIXvys72AYi+CwEgLDhDOSt0x36JF+hZ7TDlHluVHhnk/w2Ez8/+G4YGaGhQdMNNSMKwC BJfk+Dm3hwzsMYbHY6f8zQz2cdxm/PzZ9BJrzmP1y4nvpCZKYiYsTtJodAKVWjvIGjXYHQWPM9xc eFvGDgtdlfTJVKvzfrQNHv6Sko+3/v5WY5PiPeQ+8v8KYOAROAr6/if+o+nbycYZXQZIi4x55XBU bkHNz5pH5BZ3oWeX2z/Sd9rZrXMKaAWZ7gtN+WZu1sjBwf1c+A6fMZlY/eMGBOpTDVJR7z07NTqG bjBswpAOrvHkRolyvUKpGdwVOTAbix4Z7guuAt6GrMSlNnNiusQAD3r+wLKScvEnWp3eu1uyAmNU 9O7ltTr1+/lJNncSuqpT/1iT5FZWKZ5XI5Ju1ln40M2uFhYLaxK6Oq4eGOX/06H1k7umctrVnRz5 4qHAKpF0nkSqWqjR6qLsUJXhKwFuODpwfkBxReKxz6Z16YNbHmnM8MRLCTOKyxpWK1XaOB2Ft4na hA1MTMLlsi5EhHp+eXbbnORHFtfYwy358he/C85dKRyjVuueb5IoJ3WVSkMgSAV8h+NcDvPQpDG9 z29ZOVL6yANNe9PbKzYlC89fKRqMWDNMp9f3RZFwf7lCE2QwGLhWBnFKngO7CJ3nFgIii8NhXX7y ibBrX78+psmuIv/uYFx3/ZYnOHMuf9Afl+4OEjhzQ1RqradCqXVBd7YAxQU8BBoBll6vV+r1BiUC U4xc1CYOmyWSyFTFU8ZHXJo0Jjxt3rBgub339b8CDABGIcc4IY8t+AAAAABJRU5ErkJggg==") + + symbol#svg_allennlp(viewBox="0 0 124 22") + path(d="M19.3,0.4h3.8v16c0,0.6,0.1,1,0.4,1.3c0.3,0.3,0.6,0.5,1.1,0.5c0.2,0,0.5,0,0.8-0.1c0.3-0.1,0.5-0.2,0.8-0.3l0.5,2.9 
c-0.5,0.2-1.1,0.4-1.8,0.6c-0.7,0.1-1.3,0.2-1.9,0.2c-1.2,0-2.1-0.3-2.8-1c-0.7-0.6-1-1.5-1-2.7V0.4z") + path(d="M27.6,0.4h3.8v16c0,0.6,0.1,1,0.4,1.3c0.3,0.3,0.6,0.5,1.1,0.5c0.2,0,0.5,0,0.8-0.1c0.3-0.1,0.5-0.2,0.8-0.3l0.5,2.9 c-0.5,0.2-1.1,0.4-1.8,0.6c-0.7,0.1-1.3,0.2-1.9,0.2c-1.2,0-2.1-0.3-2.8-1c-0.7-0.6-1-1.5-1-2.7V0.4z") + path(d="M42.9,21.6c-1.2,0-2.3-0.2-3.3-0.6c-1-0.4-1.8-1-2.5-1.7c-0.7-0.7-1.2-1.5-1.6-2.5c-0.4-0.9-0.6-1.9-0.6-2.9 c0-1.1,0.2-2.1,0.5-3c0.4-0.9,0.9-1.8,1.6-2.5c0.7-0.7,1.5-1.3,2.5-1.7c1-0.4,2.1-0.6,3.3-0.6c1.2,0,2.3,0.2,3.3,0.6 c1,0.4,1.8,1,2.5,1.7c0.7,0.7,1.2,1.5,1.5,2.5c0.4,0.9,0.5,1.9,0.5,2.9c0,0.2,0,0.5,0,0.7c0,0.2,0,0.4-0.1,0.6H39.1 c0.1,0.6,0.2,1.1,0.4,1.6c0.2,0.5,0.5,0.8,0.9,1.2c0.4,0.3,0.8,0.6,1.2,0.7c0.5,0.2,0.9,0.3,1.4,0.3c0.8,0,1.5-0.2,2.2-0.6 c0.7-0.4,1.1-0.9,1.4-1.5l3.3,0.9c-0.6,1.1-1.4,2.1-2.6,2.8C46,21.2,44.6,21.6,42.9,21.6z M46.8,12.5c-0.1-1.1-0.5-2-1.2-2.7 c-0.7-0.7-1.6-1-2.7-1c-0.5,0-1,0.1-1.4,0.3c-0.4,0.2-0.8,0.4-1.2,0.8c-0.3,0.3-0.6,0.7-0.8,1.2c-0.2,0.5-0.3,1-0.4,1.5H46.8z") + path(d="M66.3,21.3h-3.8v-8.4c0-1.2-0.2-2.1-0.6-2.6c-0.4-0.6-1-0.8-1.7-0.8c-0.4,0-0.8,0.1-1.2,0.2c-0.4,0.2-0.8,0.4-1.1,0.6 c-0.4,0.3-0.7,0.6-1,1c-0.3,0.4-0.5,0.8-0.6,1.3v8.7h-3.8v-15h3.5v2.8c0.6-1,1.4-1.7,2.4-2.2c1-0.5,2.2-0.8,3.5-0.8 c0.9,0,1.7,0.2,2.3,0.5c0.6,0.3,1,0.8,1.4,1.3c0.3,0.6,0.5,1.2,0.7,1.9c0.1,0.7,0.2,1.4,0.2,2.1V21.3z") + path(d="M72,2.9v18.4h-1V1h0.7l14.9,18.7V1h1v20.3h-1L72,2.9z") + path(d="M92.4,21.3V1h1v19.4h12.4v0.9H92.4z") + path(d="M109.9,21.3V1h8.3c0.8,0,1.6,0.2,2.3,0.5c0.7,0.4,1.3,0.8,1.8,1.4c0.5,0.6,0.9,1.2,1.2,2c0.3,0.7,0.4,1.5,0.4,2.3 c0,0.8-0.1,1.6-0.4,2.3c-0.3,0.8-0.7,1.4-1.2,2c-0.5,0.6-1.1,1-1.8,1.4c-0.7,0.4-1.4,0.5-2.3,0.5h-7.5v7.8H109.9z M110.9,12.5h7.5 c0.7,0,1.4-0.2,1.9-0.5c0.6-0.3,1.1-0.7,1.5-1.2c0.4-0.5,0.7-1.1,0.9-1.7c0.2-0.6,0.3-1.3,0.3-2c0-0.7-0.1-1.3-0.4-2 c-0.3-0.6-0.6-1.2-1-1.7c-0.4-0.5-0.9-0.9-1.5-1.2c-0.6-0.3-1.2-0.4-1.9-0.4h-7.3V12.5z") + path(d="M18.4,4.6c-1.7,0.9-3.3,2-4.9,3.1c0,0,0,0,0,0v0c0,0,0,0,0,0L11,0.4H7.3L0.2,20.8L0,21.5h4.1c0,0,0,0,0.1-0.1 c2-3.2,4.4-6.1,7-8.8l2.9,8.9h4.1L14.2,9.8c1.3-1.2,2.7-2.3,4.2-3.3V4.6z M6.8,13.2l2.3-7.1l1.3,3.9C9.2,11,8,12,6.8,13.2z") + + + //- spaCy users + + symbol#svg_airbnb(viewBox="0 0 320 100") + path(fill="#FF5A5F" d="M168.7 25.1c0 3.6-2.9 6.5-6.5 6.5s-6.5-2.9-6.5-6.5 2.8-6.5 6.5-6.5c3.7.1 6.5 3 6.5 6.5zm-26.8 13.1v1.6s-3.1-4-9.7-4c-10.9 0-19.4 8.3-19.4 19.8 0 11.4 8.4 19.8 19.4 19.8 6.7 0 9.7-4.1 9.7-4.1V73c0 .8.6 1.4 1.4 1.4h8.1V36.8h-8.1c-.8 0-1.4.7-1.4 1.4zm0 24.1c-1.5 2.2-4.5 4.1-8.1 4.1-6.4 0-11.3-4-11.3-10.8s4.9-10.8 11.3-10.8c3.5 0 6.7 2 8.1 4.1v13.4zm15.5-25.5h9.6v37.6h-9.6V36.8zm143.4-1c-6.6 0-9.7 4-9.7 4V18.7h-9.6v55.7h8.1c.8 0 1.4-.7 1.4-1.4v-1.7s3.1 4.1 9.7 4.1c10.9 0 19.4-8.4 19.4-19.8s-8.5-19.8-19.3-19.8zm-1.6 30.5c-3.7 0-6.6-1.9-8.1-4.1V48.8c1.5-2 4.7-4.1 8.1-4.1 6.4 0 11.3 4 11.3 10.8s-4.9 10.8-11.3 10.8zm-22.7-14.2v22.4h-9.6V53.2c0-6.2-2-8.7-7.4-8.7-2.9 0-5.9 1.5-7.8 3.7v26.2h-9.6V36.8h7.6c.8 0 1.4.7 1.4 1.4v1.6c2.8-2.9 6.5-4 10.2-4 4.2 0 7.7 1.2 10.5 3.6 3.4 2.8 4.7 6.4 4.7 12.7zm-57.7-16.3c-6.6 0-9.7 4-9.7 4V18.7h-9.6v55.7h8.1c.8 0 1.4-.7 1.4-1.4v-1.7s3.1 4.1 9.7 4.1c10.9 0 19.4-8.4 19.4-19.8.1-11.4-8.4-19.8-19.3-19.8zm-1.6 30.5c-3.7 0-6.6-1.9-8.1-4.1V48.8c1.5-2 4.7-4.1 8.1-4.1 6.4 0 11.3 4 11.3 10.8s-4.9 10.8-11.3 10.8zm-26-30.5c2.9 0 4.4.5 4.4.5v8.9s-8-2.7-13 3v26.3H173V36.8h8.1c.8 0 1.4.7 1.4 1.4v1.6c1.8-2.1 5.7-4 8.7-4zM91.5 71c-.5-1.2-1-2.5-1.5-3.6-.8-1.8-1.6-3.5-2.3-5.1l-.1-.1C80.7 47.2 73.3 32 65.5 17l-.3-.6c-.8-1.5-1.6-3.1-2.4-4.7-1-1.8-2-3.7-3.6-5.5C56 2.2 51.4 0 46.5 
0c-5 0-9.5 2.2-12.8 6-1.5 1.8-2.6 3.7-3.6 5.5-.8 1.6-1.6 3.2-2.4 4.7l-.3.6C19.7 31.8 12.2 47 5.3 62l-.1.2c-.7 1.6-1.5 3.3-2.3 5.1-.5 1.1-1 2.3-1.5 3.6C.1 74.6-.3 78.1.2 81.7c1.1 7.5 6.1 13.8 13 16.6 2.6 1.1 5.3 1.6 8.1 1.6.8 0 1.8-.1 2.6-.2 3.3-.4 6.7-1.5 10-3.4 4.1-2.3 8-5.6 12.4-10.4 4.4 4.8 8.4 8.1 12.4 10.4 3.3 1.9 6.7 3 10 3.4.8.1 1.8.2 2.6.2 2.8 0 5.6-.5 8.1-1.6 7-2.8 11.9-9.2 13-16.6.8-3.5.4-7-.9-10.7zm-45.1 5.2C41 69.4 37.5 63 36.3 57.6c-.5-2.3-.6-4.3-.3-6.1.2-1.6.8-3 1.6-4.2 1.9-2.7 5.1-4.4 8.8-4.4 3.7 0 7 1.6 8.8 4.4.8 1.2 1.4 2.6 1.6 4.2.3 1.8.2 3.9-.3 6.1-1.2 5.3-4.7 11.7-10.1 18.6zm39.9 4.7c-.7 5.2-4.2 9.7-9.1 11.7-2.4 1-5 1.3-7.6 1-2.5-.3-5-1.1-7.6-2.6-3.6-2-7.2-5.1-11.4-9.7 6.6-8.1 10.6-15.5 12.1-22.1.7-3.1.8-5.9.5-8.5-.4-2.5-1.3-4.8-2.7-6.8-3.1-4.5-8.3-7.1-14.1-7.1s-11 2.7-14.1 7.1c-1.4 2-2.3 4.3-2.7 6.8-.4 2.6-.3 5.5.5 8.5 1.5 6.6 5.6 14.1 12.1 22.2-4.1 4.6-7.8 7.7-11.4 9.7-2.6 1.5-5.1 2.3-7.6 2.6-2.7.3-5.3-.1-7.6-1-4.9-2-8.4-6.5-9.1-11.7-.3-2.5-.1-5 .9-7.8.3-1 .8-2 1.3-3.2.7-1.6 1.5-3.3 2.3-5l.1-.2c6.9-14.9 14.3-30.1 22-44.9l.3-.6c.8-1.5 1.6-3.1 2.4-4.6.8-1.6 1.7-3.1 2.8-4.4 2.1-2.4 4.9-3.7 8-3.7 3.1 0 5.9 1.3 8 3.7 1.1 1.3 2 2.8 2.8 4.4.8 1.5 1.6 3.1 2.4 4.6l.3.6c7.6 14.9 15 30.1 21.9 45v.1c.8 1.6 1.5 3.4 2.3 5 .5 1.2 1 2.2 1.3 3.2.8 2.6 1.1 5.1.7 7.7z") + + symbol#svg_retriever(viewBox="0 0 150 33") + image(overflow="visible" width="150" height="33" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAJYAAAAhCAIAAAC0rMV4AAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJ bWFnZVJlYWR5ccllPAAAE6xJREFUeNrsW1lwHNd17dfb9Oz7DAY7QBAgSJEUSYkUKcVarGiJLDuL EpWjxM5HqvKTisqpyo/1EX3kx6lKVcpJXEnZKadkKbGqbCk2QyuyrC2UZZqgCIIkSIDYMcDsW/dM T+/9crsbGAIgQIIUFTFVfGxs08u77y7nnntfE/3B9z/wkChIkx6a8nK0x0URFINJFHDRnQGuLeDp jwe9LoaiSOLuuCMHOn5mRtH0qqzxklaQ9bJm5FUt3dQ43YxTRLuHPdYXO9QbH05F7lrxDjUhxhh+ GCYMbJjYxFjVjGpTGc9UToxnck1VJ4iIh/m9fd1PDKY4F3NXZXfaoJ0fFAlj5Q83ywS9XDzgJjB+ 88KypJuCrJ+4nEl5Xff3J++q7E4bV7ERtQ74IjDkv0eHOx8bSCw3VT9F5nnpwxmAWN06Zwfu3XHH mXAdvFpmRB4Xc7AnzhGEZmIGoYIg803lrsr+f5jQiUowpN/NhFlSMYGiEqphyrp5V2V3aC7capiY sEETsBORCJEk2upKvIrG1/6Otn7+9c9ef+A1KeCWB8YbH4HWy397B16vopXpsJO8bu4h+MYmtIyH RUUVNDPopgFL3QzlZanra9OitZZWMEQwdR2D3w7tO55lGCbMA3Pdsgc4YsM/G5SQxevQZxIuV7UE YpsOnq1o6RacRtcNuBWkpbdWLtYMcyJTFU0iTiLZNDvC3iDHrmbKdbLpupmrNhaKwlSxPi/IYPuQ i+4Jefqivh1tYeC3LY3DvZKiTS6XS3XJKTTRGmvCahqK1h0LDLVHGNpyF1XXR6ZzsmrQlDUr1Dg9 Md/O9hhFoXxVHJ0vzFQaXoY+0pfY1RGFJcEU/zO+qBiYsR6OwTYcQx4d6trgTo4kiqbNF/ilSuNS TijJGigy4mYHot7+mH/gqgDGVLa6WBICHAuq100z6nPv6YptVSXLqn5pqVyuSy6aSoW9PfEAy9DO dFC3lYUmaGkyL8zzkmwYPobqC3s6gp57uuNXtWTrAhb+yUyuIeswEdgCXCzgYe/pSXAMXatLI7M5 UDVLUfd2RejWkjb6C8bLlfovF8pwk4aJdj/3QE8EUZQjzVpdiLJ2ejb3xnjmiiDHaPJI0odIarIi nlis+kj0ZF/sD+/vD/ncrVsERfvOxeWTi9VBL+tCBHhj3cAF3ZRMHGbITEH8i/u6diSCjgZZmrqU r/1gqhSkyABFXhTkF/e17UiGp7P86+cXLxcbhG7KJj5TaLz0Ra497APZGJJ45Vy6pJluEi1q5n4f fWSwgySotTKDlgq1xkdTuZ9eyc9I6gEPMxDxQtaYLAqvzpYOBLgnBuJf2t/DgfpJVG3K3xtLZzXc z9I5VesJe/82wEUjgU2zQLkmfvvswicF4fe7gr+7t5u0dWVpDJunp3NvT2TfKzTiJD4c94XdzDIv vbLMGwTx5FL1d/Z19SRCaFW5YLjlmnT8Sl7QTReJeM24L+kH35Xl5r+emhpZFgjDAI2NFgTaWZKJ 17moqGrTef610zOzvESSVNDnevHYwFBH1EHItRLn+Oa/nJz4aKES4pinuyMvHN6RCHkt59X0dy4t /fjc4uuTuY+LwktHdw53xZx7Y17ub35j6KOZ3M+v5OuSDmXKb/dHnxhKuVga3FzTzYDHxbJ0K1a+ emQQQO6D+bIL4xDGfhczU+C/dXIqJ0hhhmIg8hBKC1JVlNtCXrj+2O6epom+e3oOvKYdylxL6HXe CRacyFX+7oPLaV6KuZg/HWr74weHSBuK603lxNj8v51NvzamTFTEr9/X3xX1PzjYkQp6f3hm7myW T9Jkoya+P5V/7kgAbZbLR7O1XLH+pVTwzx7clYz4ndgA2V4dmTk+lcea+WR3+GtHd3ZFA444H15e +smF5fdmS2eXqn/+8NChnoQddmBC6isH+2iG+snFZWwpguAQytfqPzy/NJKuhmgEDJMhybKo0gA1 
laacrojMKtQAdID9Ti2UJ6pSkKP3JwPPDrfb9tsI14AMY+nyyflS2O3qDHBHdyQs+9lJFNz3UHcU zlayPF+T3p4p7EgGWdZq7oCIiaA3HHCbCAEuSQjB5x1RP+diN6ttCMjBvXG/sVCRdMvTQM73r+Qk w3i4M5wVlUJDAZ7cE/VGfG643rE6YL6bQk3d1Fdy8zoH1Qzj/cnlpZoc5SD4PI/t7gD7Obr2e1zH dqbem8wtycbFDP/xXPH5qB8eCNh+sCf2SaYGEC1pxpkc/2VFZa8RuCmr0xVRkLXBhD8Z8rQ+Pz1b eGu6qOu428N8YTDVGfa3HPTYjrZL+XqxqZYk9ePZYm/Unwh4CHshDEP3Rf1wkaqbdl1HfDRdmMgL h1NBSdMnyw1Ax64AZzl+plL/3skreVn3gOmhCrSMTtyTDDw1nEoGPINtQfc1sjrely43PpzOUzTt YaldbcE9bUHCdnlkWzrqdfcng2fzddIwMxVxplgf7oisqtE0DTA0tpsJVlfPNPEKP0Qb/RqwCCRw UaSkawmOPrdcM3TzG0f693fHeVEeXa4aprE3FU4GPa0b3SzDUqQIJRDahH8uVervTxchWwOGP7gj 0Rb02pwQOaaOeF27U8HzVwpempwpCKKkeuFxNNUX83eHPQuVJvy+LEjjy5UD/W0bEux8SRjP831B 7lBHmLB0aT1WlhXwcqQZEYqEh/SATzjRYjsowzIHU8GPFsqwwEuF+mS2FvVxNKR6+6zfRcP6QVU0 SRZEeVlUvjiQeO5AL8x1ejYvauZQMkBD1PbGgs/s64JpxrK8345iliRZmtzfGUmEfHgLJglUcKFS h7QMaAZaTvrdHtvSqGVlkvCytJsiIVfVFS0jNIfagiRFrUhPota1CK0E3KbcGk6RdooAHQMl/mCJ f2Fv+57OGCjW7WIeD3pgKidxrrG6deDNyyQ8vlguKEaXhwLG0R7y0tQK/qwCI3gMQ1tKIOpW998y IZyFBe5OBjK8DFdomnkuJxzoS66VWNb06aKQ5aXDnaH+eKD1+UK5XmrIwKdYGqX8nJ9jiHXTgfAW 2QJUVFS9IiqAHK3VkPbaTcJqrSwIyuHOMGQcP8fCp4/u7gS3B+Fpu35nn9zTdaQ38e7lpddGFz00 bRD41FK11Jz4+uF+YHobKIwzFM2AKKxrRoC0pvG4aEXTL8zml2tN8JGqpJQkfbbWFFRDMkyhqS7U IfoNF0XdUjd+xdKyYboRcaQ/6eNYG7ARvZ1yAl9loapmjKZLHpqEEGQoBAptqtqH42nAW0U3apKW a2hXqg03TZYVwxTVYkNqD3th7oCbubczejEnZHgJHjZbbhSEJmSE1iRFQYIshbD5QG/c523RNzxZ qANNBUoCoe51MQCRp69k8oJkYJOX9LKkLfJNXgaUJeqaulyXQAyOoVbXbMMUBAzGkkkc7o2lQj5k AxVUI5Tt3/SKmxMIcslz9w/0R7zf/O/xmJcFUL1YrP/Tycl/eP4BOH9t3lYNo9CQwEMQhRwGBboA kgJAB/QVW26FUhzV6fECUAKva3fRIKVrAxbfTCkErlJVjb0Rd7tFW1oBj26qMpNUPS3IgA14NV4h 1P5zbNHF0C6GBPxiKTTgYwf9LogGn4tmbM4MaqQoajgV3pMsz1VEP0MV6tJEnk8E3FBJrkRbrXmh KHSGvcNOQnE8RjUyogoZlLJMiEm7tHhnKjtXEIJulrT4Mxmi0OGE12EhSTeN8CYtNCgj/RzdHfGR lENyr5Z2dAvKnCUe6E+9+JDy/TMLBsIhhlpsqMdH55890GfD2Dqgg+wFeLKSQawtKh0Kpm88dbBV n+BWg2BVWbdQgG/owoua0RXx2U56iwPAA1IIY0sCosKfQFW+87WHV/ITJvDVns0KuW8pCyYGhHRN FyA5CbI+tlTZnQzFghZtAYCZLAhVSXt8IJkMuNcmSNPuGth7BzC7TjP0S88cskjWBgWtZg1qQ0fE 1jsmzHYP63e7rm2XkNdkHQIo2dGeiGo1LAgoxX46mR2Zzm7q65hYIew6xg3F0A1s2wk5B20dJIA1 Yx+fxn5rSzov51T5tz6cCgqkkQ0MlsB2RoFnUmhVbIpsHWszCPw+nAz2hz0lFe7CF3L8laKwEoIl 4XymGuSYPW0hlmHWRomJV4ibYUK1pmuG1Zdx5qJWVdQ6qGtMZFEt6wGE13p1gtxGmxshSDMP9cWB rarW5AQvaVASyKq2QW+kvVSHQALlrckq4MBnvkNtLf5TPQGciaNJw9IpkkzIRpDxzO1vniUC3M64 HxIqyFJqqvOVuhVnhmkRGUEaiHj7wt4NUWG10OwBk/KyJmu3qCVyi5xBXqsjMMy+rtjjg21NE+oq EyrnqWLj17MFjNdtU4D7WD0nmzUZBi7VpZqk3uG7ibA6yHkxNw1RBO4HAZGuNMAeiCDw9lIpRNgD PbGBANcwDIivyXy9IEjVpgJlNJStR7qjUIKtLWCgOgi6gDTaDUsTL9oXE/iW8sgW1GFzcHOxzIHu 6GDIA47DkAQs8ldzxUylsdZCQIVjAY60szRYFyhWhm8SW1UGmwqFCeIzM/dWD4ZCojfsBe+0nJVA s6WG43nENjzPWVh31L8rFRQVA0AvW5eytUZeaI6XGyEftzcZ4lhm7dVQRHVAEUCRUKNCOFabKhy3 d9nkpoICIHeE/X90Xy/NMQ7M/Gq59qOxRU3THYjHljtTg4lAp5dt6iZlVy0j6TKAKbppAdFt39jB q/RhA1Ow0h5NHtvZBssGwgmc+VShPjJXJG5G5qCXe3BHIu4igWALknpqpjAyX1xoKH3J4GAyQKy2 CFpL2tse9ntdDd0IsFRd0c8sFKu3deec3JrBoz2d0d/qixVlHZbKWvWNMFcSVuiyLSSQw11xX1nR OXAwTEzlhfmycFMeTdip2n756gY2uVl4vloFrbOiZcgdbeHhsFvUgZciN0KfpMumbpAIbT8FdIV9 wO9V3SQJ8mKOP7NY7nUzh2I+2g5BtB4JIkEf5EibmkKhRZ3P1BbX49n1UMTeuXNKQ7wFaF1v197F MI/u6mj3smXV4Cgq31BOXFwisNniWgEP99DOVNBFFVXdx1KLvPTGucVqvXltH2DD1EB8AywN5TO2 2DJRqsuiql2HhZqry7WL3BuHLF41OQiiY7RK31fO+dzsE3s6FIybhumjqctl8d/PTAuijK6xIt6S 1Hju7QgDsdIwLst6uqEOxvwH20Ob5goYjw4ku0KeuqYHWaoo6v8xMjuRqWyYYFMDmWtqDmf7Hd+E Ce1qJBZwP9gXMw2roUkjNJrhz80V1obZQDzw9FAKilhrT4QiR7P8K6en5wo1bQ071Q1TlNWmoq1R EGoPeDwcIxoYKp2FSnMyX9MNw6HgGyISbhIVix0DAaHtBoJ5o3CRNV23+wkQZ5JuOE9eSxEPdscO d4RkE+vY5BB6d6r45thCtlLXVq/EdqEtSvBtE3xws3RvzB/gGM16JQUB3R9a39feMPoTgaeGkmE3 
09AM8PgrxfqPzy2Mp8tNRb2qE3ApSWnK6lo3aqo6uDll7QwTTX1z7ky9/PLL1zMisnZBy6I0WW0G GRokuFxu/OZAkmYo53UBlrYyYpyjJvNCQdZgPTlBOZeuzhf5Ci+WeXF8qfzG+fTx6YLfzfSFvC2y 43ez4PijOV4xrCibKTcWCsJSURiZyfk9LJTvLSGWysJb48uzVeBKyCAQ3LU7GYj63Fu9FADme+tC +nRWoOyCJ68Yu8JcImj1QlvB4WHpobjfjfBYnq9pJkUQs5XmaLqyWOSFhpSv1k/PFl8fXRzPVfem gKHQm7YY58v1YkOuakbCz311f3c86NsqXzA01R8PhFh6mRchZMGHSqJyLludztaqgljiG1PZys8m Mj+4sAh0tifqd7byi7z4zmRmotAA34cYqqr6rrAbFr6hIXyDBGBvWhAXl8rf+sVFUcdeitQw8cK+ jq/cP7ChdwzzjUzn3p7MXq7JJQPPqGZD1eH+Pi/7fMr/2FDq6FC7z+3acNdMpvKjs3MjGb6kmXOq sSTpjwSYf/zywT09ceeVEkPT//q/zkKYBly0k9/gG1CDv3xkeG9PnNoEsfG3f/bJqWw9CkTMXhrE ImDC8/d0PnNoYANIgussFmsfT2WPT+QLqlHRzXnVqCmQF4k9Ie5PukIPDaXu39mONuPYiqr/4nL6 zQvLC9Xm40Ntf/XYbpKmb/gqkCgrI1PZE5cylyrNso7TmgEeZhhmn4d5LOp5ZlfbA4Pt4G3AdaEQ //t3L45l+RBDObsAYEXAhKd3p5492O+iqZZU28jhGAMPfvXX0+/N5D0UBXr1+dhvPrKrNxFqvbjj OCn8JSlqqd4EngY1JElaRVjY6wr63Js2o1feMNB0oSnXZWC7Joge8bmDPo5stTZMDKhoy4jXEEsM 0b/BGa/uVyvaekVaKYlCJMfSm76vBU/jRbkqyg1ZczpqbhcT9XGAEwiR10m3AGwgNSQZF+R1lt4+ wwLzl+vNmqjYHQar2xDxu4MeqEfWvlpASJp2dS8erXwIAcox9Fqn2hYNM0wM6e3VM7NjGR7wCNjq sZ7o8wf7okEPXk/ANq0o0I1eesO374Wo21U+blOSm31tadMlf8pVb6tvCcjcGfEf6Y5h+z1E8L6x TO3UXAE4C/oUcqD/c4N97jJ8FtPdRDFUqkv//OHEz+dK/X62aRAcQw7HfPd1Rr6wq93F3v3vMp/b ILcPAWGP69GdSR9JKAaUqNau/S+Xqt+9sCw05Lt6/BzH/wowALWbjrMDkCkwAAAAAElFTkSuQmCC") + + symbol#svg_quora(viewBox="0 0 201 56") + path(fill="#b92b27" d="M29 43.62c-1.93-3.77-4.18-7.6-8.57-7.6-.84 0-1.68.15-2.45.5l-1.5-2.98c1.83-1.56 4.77-2.8 8.54-2.8 5.87 0 8.88 2.83 11.27 6.44 1.42-3.08 2.1-7.24 2.1-12.4 0-12.88-4.04-19.5-13.45-19.5-9.27 0-13.28 6.62-13.28 19.5 0 12.82 4 19.36 13.28 19.36 1.47 0 2.8-.16 4.04-.52zm2.3 4.5c-2.05.54-4.2.85-6.35.85C12.6 48.96.5 39.1.5 24.76.5 10.32 12.6.48 24.96.48c12.56 0 24.53 9.77 24.53 24.3 0 8.1-3.77 14.67-9.26 18.9 1.78 2.67 3.6 4.43 6.14 4.43 2.77 0 3.9-2.14 4.08-3.82h3.6c.22 2.24-.9 11.53-11 11.53-6.1 0-9.33-3.53-11.76-7.68zm26.12-12.3V19.27c0-1.9-.7-2.73-2.86-2.73h-2.3v-4.4H67.3v23.5c0 3.95 2.15 5.7 5.4 5.7 2.7 0 5.37-1.2 6.8-3.9V19.26c0-1.9-.7-2.73-2.85-2.73h-2.45v-4.4h15.2v24.6c0 2.45.92 3.57 3.72 3.57h.5v4.54L80 47v-4.67h-.28c-2.63 3.2-6.34 5.38-11.62 5.38-5.95 0-10.7-3-10.7-11.87m56 7.48c5.36 0 7.4-4.66 7.5-14.04.1-9.2-2.14-13.63-7.5-13.63-4.68 0-7.62 4.45-7.62 13.63 0 9.38 2.9 14.04 7.62 14.04zm0 4.4c-9.7 0-18.43-7.4-18.43-18.44 0-10.84 8.52-18.04 18.42-18.04 10.32 0 18.6 7.34 18.6 18.04 0 11.04-8.28 18.45-18.6 18.45zm18.9-.7v-4.4h1.47c3.62 0 3.97-1.04 3.97-4.2V19.27c0-1.9-.98-2.72-3.2-2.72h-1.97v-4.4h13.82l.7 7.2h.27c1.53-5.18 5.66-7.9 9.52-7.9 3.2 0 5.7 1.8 5.7 5.5 0 2.55-1.25 5.28-4.7 5.28-3.1 0-3.7-2.1-6.26-2.1-2.3 0-4.06 2.17-4.06 5.36V38.4c0 3.16.77 4.2 4.34 4.2h2.02V47h-21.64m46-5.12c4.4 0 6.2-4.17 6.2-8.36v-5.6c-3.2 3.34-10.68 3.46-10.68 9.4 0 2.9 1.72 4.56 4.47 4.56zm6.42-.02c-1.82 3.5-5.55 5.85-10.76 5.85-6.06 0-9.97-3.2-9.97-8.87 0-11.4 15.87-8.36 20.53-15.9v-.83c0-5.8-2.28-6.7-4.8-6.7-7.06 0-3.84 7.6-10.34 7.6-3.14 0-4.35-1.9-4.35-4.02 0-4.3 5.13-7.76 14.75-7.76 9.1 0 14.7 2.52 14.7 11.58v14.47c0 2.24.82 3.45 2.77 3.45.84 0 1.54-.23 2.08-.6l1.16 2.83c-.94 1.47-3.48 4.06-8.3 4.06-4.2 0-6.83-1.95-7.18-5.14h-.28z") + + symbol#svg_stitchfix(viewBox="0 0 224.6 26.6") + path(fill="#2F3237" d="M9.7 11.7l-.3-.1c-4-1.1-5.9-1.9-5.9-5.1 0-2.7 1.6-4.2 4.3-4.2 0 0 5.7 0 7.3 4.7l.1.2H17V0h-1.8l-.7 2.6C12.8 1 10.6 0 8.1 0c-4.7 0-8 3-8 7.3 0 4.8 4 6.2 8.5 7.3 4.5 1.1 6.4 2 6.4 5.3 0 2.9-2.4 4.3-4.5 4.3-3.8 0-6.5-1.5-8.6-4.6l-.1-.2H0v7.2h1.8l.8-3.1c2.2 2.1 4.6 3.1 7.6 3.1 4.8 0 8.2-3.3 8.2-7.7 0-4.9-4-6-8.7-7.2 M22.1 5.5h1.7l.1-.5c.6-2.9.7-2.9 
2.7-2.9h4.6v22.3h-2.5v2.1h8.7v-2.1H35V2.1h4.5c2.1 0 2.3 0 2.9 2.9l.1.5h1.7V0H22.1v5.5z M63.2 5.5h1.7L65 5c.6-2.9.7-2.9 2.7-2.9h4.6v22.3h-2.4v2.1h8.7v-2.1h-2.4V2.1h4.5c2.1 0 2.3 0 2.9 2.9l.1.5h1.7V0H63.2v5.5zM49.3 0v2.1h2.5v22.4h-2.5v2.1H58v-2.1h-2.4V2.1H58V0zM132.4 0v2.1h2.5V13h-12.4V2.1h2.4V0h-8.7v2.1h2.6v22.4h-2.5v2.1h8.7v-2.1h-2.5v-9.4h12.4v9.4h-2.5v2.1h8.7v-2.1h-2.4V2.1h2.4V0zM186.4 0v2.1h2.5v22.4h-2.5v2.1h8.7v-2.1h-2.4V2.1h2.4V0zM215.9 0v2.1h2l-5.6 8.1-5.4-8.1h2V0h-8.7v2.1h2.5l7.6 11.3-7.5 11.1h-2.7v2.1h8.7v-2.1H206l5.8-8.7 5.8 8.7h-2.2v2.1h8.7v-2.1h-2.4l-7.8-11.9 7.2-10.5h3.5V0z M161.4 2.1h2.6v22.3h-2.6v2.1h8.7v-2.1h-2.3V15h4c1.5 0 1.6.2 2.1 1.7l.1.2v.2h1.7v-6.6H174v.2c0 .2-.1.3-.2.4-.5 1.2-.7 1.8-2.1 1.8h-4V2.1h9.1c1.7 0 2.1.5 3.1 3v.2h2V0h-20.6l.1 2.1z M108.2 16.1c-.8 5.2-3.6 8.1-7.9 8.1-5.1 0-8.2-4.3-8.2-11.1 0-6.5 3.1-10.7 7.9-10.7 4.6 0 7 2.9 8.1 5.4l.1.2h1.8V0h-2l-.7 2.8C105.4.9 103 0 100.1 0c-6.8 0-12 5.7-12 13.3 0 3.9 1.2 7.2 3.3 9.6 2.1 2.4 5.1 3.6 8.4 3.6 6 0 9.8-3.7 10.6-10.4v-.3h-2.3v.3z") + + symbol#svg_chartbeat(viewBox="0 0 915.7 130.1") + path(fill="#51a4da" d="M157.8 8.6c-8.1 6.8-15.5 13-23 19.1-.8.7-2.2.7-3.4.7H23.3c-.8 0-1.6-.3-2.6-.4V8.6h137.1z") + path(fill="#b3e4fa" d="M0 33.9c2-.1 4.1-.3 6.1-.3H129c-8.7 7.2-16.2 13.5-23.8 19.6-.8.6-2.3.7-3.4.7H2.4c-.8 0-1.7-.3-2.5-.5.1-6.5.1-13 .1-19.5z") + path(fill="#5bc4bf" d="M36 79.2V59.4h63.2C91 66.3 83.6 72.5 76.1 78.7c-.5.4-1.3.5-1.9.5H36z") + path(fill="#657d8c" d="M613.3 49.4c6.5-3.8 12.5-8.7 19.3-10.9 19.6-6.4 39.7 2.9 48 21.8 6.8 15.4 6.3 31-1.3 46-12.1 24.2-47.3 28-66 8-.1 2-.3 3.6-.3 5.2.1 4.3-2 6.4-6.2 6.3-4.2 0-6.2-2.2-6.2-6.4V8.2c0-4.3 2-6.1 6.3-6.1 4.4 0 6.2 2.1 6.2 6.3.2 13.9.2 27.7.2 41zm-.2 30.1c0 2.6-.1 5.1 0 7.7.5 19.7 19.2 33.3 38.2 27.7 13.9-4.1 22.5-18.7 21-35.8-1.2-13.7-6.6-24.9-20.5-29.7-9.8-3.4-19.3-1.6-27.7 4.4-8.7 6.3-13.1 14.7-11 25.7z M561.7 39h27.7c.3 8.4-2.3 11.2-9.9 11.2-5.3 0-10.6 0-15.9.1-.3 0-.6.2-1.3.3-.2 1-.5 2.1-.5 3.2v45.6c0 1.5.1 3.1.2 4.6 1.2 10.6 8.8 15.1 18.7 10.8 3.5-1.5 7-4.1 9.7.8 1.8 3.2-.7 6.9-5.8 9.4-16.6 8-34.1-1.7-34.8-20-.7-16-.2-32.1-.3-48.2v-6.1c-7.6 0-14.6-.6-21.5.2-7.7.9-13.5 5.7-16.1 12.8-2.4 6.6-3.9 13.8-4.4 20.8-.7 11.4-.1 22.9-.2 34.3-.1 5.5-2.7 7.7-7.6 7-4.5-.6-5.2-3.8-5.2-7.5V72.2 45.5c0-4.3 1.1-7.4 6.2-7.5 5.1-.1 6.4 3 6.4 7.3 0 3.1 0 6.1.4 9.9 9-17.9 25.3-17 41.9-16.2 0-5.1.1-9.5 0-13.9s1.6-6.8 6.2-6.7c4.8 0 6.2 2.7 6.1 6.9-.1 4.3 0 8.6 0 13.7z M267.1 127.1c-36.6 0-61.2-28.5-57.6-66.8 4.8-50.7 52.1-62.9 83-48.9 6.3 2.8 11.5 8.2 16.6 13.1 2.6 2.5 3.4 6.6-.4 9.5-3.6 2.8-6.5 1.2-9.3-1.8-12.7-13.6-28.3-17.6-45.8-12.3-17.9 5.4-27 18.9-30.1 36.7-2.4 13.4-.5 26.3 6.5 38.1 14.1 23.7 48.5 28.7 67.2 9.7 2-2 3.8-4.4 5.3-6.9 1.8-2.9 4.4-4 6.9-2.3 1.9 1.3 4 4.7 3.5 6.4-1.1 3.8-3.2 7.6-5.9 10.5-10.8 11-24.7 15-39.9 15z M704.6 85.7c1.5 13.7 7.5 23.6 20 28 13 4.5 25.6 3.4 36.1-6.9 5.2-5.1 7.6-5.9 10.5-3.1 2.8 2.8 1.9 7-2.4 11.7-.6.6-1.2 1.2-1.9 1.8-15.4 11.7-32.5 13.2-49.8 6-16.7-6.9-24.3-20.9-24.9-38.5-.9-24 11.8-42.2 33-46.7 28.6-6.1 50.8 11.2 50.4 43-.1 4.4-2.9 4.8-6.2 4.8h-58.9c-1.9-.1-3.7-.1-5.9-.1zm1.2-11.4h55.9c-.1-15.3-11-26.2-26-26.3-15.6-.2-29.6 12.1-29.9 26.3zM842.8 73.1c1.4-14-5.1-22.5-18.2-24.3-11.1-1.5-20.7 1.6-28 10.3-2.5 2.9-4.9 5.1-8.6 2.4-3.7-2.8-2.7-6.3-.5-9.5 1.5-2.2 3.3-4.5 5.4-6.1 14.4-10.7 30.2-12.1 46.5-5.1 9.5 4.1 15.2 12.3 15.6 22.5.9 18.9.7 37.9.8 56.9 0 3.9-2.3 5.9-6.3 5.8-4-.1-6-2-5.8-6.1.1-1.8 0-3.7 0-6.5-1.6 1.3-2.4 1.9-3.2 2.6-12.4 11.4-26.8 13.7-42.4 8.8-9.1-2.9-14.5-9.4-15.5-19.1-1-9.9 2.6-17.8 11.3-22.9 10.8-6.3 22.9-7.7 35-8.7 4.5-.5 9.1-.7 
13.9-1zm-.2 9.7c-9.8 1.2-19.4 2.1-28.9 3.6-3.8.6-7.5 2.2-10.9 4-5.9 3.1-8.4 8.4-7.4 14.4 1 6.2 5.3 9.5 11 10.7 17.7 3.9 40.5-6.1 36.2-32.7z M338.5 50.2c.7-1.1 1.3-2.4 2.2-3.3 10.5-10.7 23.3-12.4 36.9-8.2 13.3 4.1 20 14.6 20.9 27.7 1.2 18 .8 36.2.9 54.3 0 4-2.5 5.4-6.1 5.2-3.9-.1-5.8-2.1-5.8-6.2.1-13.7.1-27.3 0-41 0-3.2-.2-6.5-.7-9.7-1.9-11.5-8.4-18.5-18.2-20-12.1-1.8-23.5 3.1-28.1 13.2-2.1 4.7-3.2 10.1-3.4 15.3-.5 13.7-.1 27.3-.2 41 0 6-3.2 8.7-8.8 7.1-1.8-.5-3.2-2.9-4.5-4.6-.5-.7-.2-2-.2-3V9.9c0-6.2 1.2-7.5 6.3-7.6 5.3-.1 7.1 1.4 7.1 6.9.1 11.8 0 23.6 0 35.4 0 1.6.1 3.3.2 4.9.7.2 1.1.5 1.5.7z M469 73.1c1.3-13.6-5.3-22.3-17.9-24.2-11.3-1.7-21 1.4-28.5 10.2-2.5 2.9-5 5.1-8.6 2.4-3.7-2.8-2.7-6.4-.5-9.6 6.2-9.2 15.4-13.3 25.9-14.6 5.2-.7 10.6-.7 15.8.1 16.6 2.7 26.4 14.3 26.5 31.3.2 16.6.1 33.1 0 49.7 0 5.6-1.6 7.5-6 7.5-5 0-6.4-3.1-6.1-7.5.1-1.4 0-2.7 0-4.8-1.3 1-2.3 1.5-3 2.2-12.1 11.4-26.4 13.7-41.8 9.1-9.8-2.9-15.5-9.9-16.2-20.2-.9-10.1 3.4-17.8 12.4-22.7 10.6-5.7 22.3-7.1 34.1-8.1 4.6-.3 9.2-.5 13.9-.8zm0 9.9c-8.8.9-17.4 1.5-25.9 2.9-4.8.8-9.6 2.4-14 4.6-6.3 3.1-8.8 8.6-7.7 14.7.9 5.3 5.2 9.5 11.7 10.7 18.7 3.1 39.3-7.4 35.9-32.9z M63.9 127.4c-5.1-1.2-8.2-3.2-9.7-7.3-1.7-4.6-.3-8.3 3.2-11.5C68 98.9 78.6 89.2 89.1 79.5c24.2-22.1 48.4-44.3 72.7-66.4.5-.5.9-1.2 1.5-1.3 2-.6 4.1-1 6.1-1.5-.6 2.1-.5 4.7-1.8 6.1-31.8 35.3-63.8 70.4-95.8 105.5-2 2.3-5.2 3.7-7.9 5.5z M873.9 49.4h-8.8c-3.2 0-5.1-2-4.4-4.9.5-2 2.3-4.5 4.2-5.3 2.4-.9 5.3-.2 9-.2 0-4.6-.1-8.8 0-12.9.1-5.9 1.7-7.6 6.5-7.7 5.3-.1 6.1 3.3 6.1 7.4v12.9h27.8c-.2 8.1-2.7 10.6-9.7 10.7h-18.3v12.9l.3 35.9c0 1.5 0 3.1.2 4.6.9 12 8.5 16.6 19.5 11.6 3.3-1.5 6.6-3.2 8.8 1.1 2.1 4-.9 6.4-3.8 8.4-14.4 9.7-34.8 1-36.3-16.2-1.3-14.2-.8-28.7-1-43-.1-4.8-.1-9.6-.1-15.3z") + + symbol#svg_allenai(viewBox="0 0 610 103") + path(fill="#FFE266" d="M87.446 21.648c-.8 0-1.6-.014-2.397.01-.177.006-.42.11-.5.247-.062.104.047.363.152.497.63.803 1.292 1.58 1.917 2.388.115.15.21.452.135.574-.08.13-.38.194-.57.174-1.09-.113-2.173-.263-3.26-.395-.607-.074-.776.146-.548.71.447 1.11.9 2.218 1.35 3.327.09.222.218.49-.05.626-.15.078-.43.04-.593-.052-1.16-.65-2.303-1.333-3.458-1.993-.39-.222-.52-.504-.41-.955.3-1.227.998-2.204 1.893-3.046 1.4-1.317 2.212-2.923 2.533-4.812.756-4.46-2.55-8.847-6.994-9.385-4.702-.57-9.366 3.225-9.05 8.6.128 2.157.943 4.006 2.517 5.508.923.882 1.675 1.868 1.98 3.143.1.42.002.71-.38.93-1.166.67-2.32 1.36-3.495 2.018-.16.09-.398.043-.6.06-.014-.204-.096-.432-.03-.605.422-1.104.877-2.194 1.32-3.29.26-.648.097-.86-.586-.774-1.07.137-2.14.28-3.215.39-.187.02-.49-.062-.56-.19-.068-.13.04-.42.156-.573.607-.783 1.252-1.537 1.86-2.318.12-.155.24-.434.177-.57-.062-.136-.355-.228-.548-.23-1.54-.013-3.078 0-4.617.005-.255 0-.583.026-.59-.296-.006-.18.212-.436.397-.538 1.243-.684 2.508-1.326 3.753-2.007.177-.097.282-.33.42-.498-.18-.127-.342-.323-.54-.373-1.52-.377-3.048-.716-4.567-1.097-.185-.046-.327-.266-.49-.405.163-.14.306-.36.49-.402 1.698-.395 3.405-.752 5.105-1.142.188-.043.345-.224.517-.34-.12-.17-.2-.393-.36-.5-1.32-.874-2.654-1.726-3.983-2.584-.063-.04-.152-.064-.184-.12-.09-.167-.158-.345-.235-.518.174-.076.358-.234.518-.212 1.084.145 2.163.328 3.244.5.467.074.932.172 1.402.212.177.015.43-.053.528-.178.073-.093.003-.37-.087-.51-.944-1.484-1.906-2.958-2.863-4.435-.04-.062-.108-.12-.117-.186-.026-.2-.03-.4-.04-.602.2.002.444-.07.596.018 1.178.683 2.336 1.396 3.5 2.1.494.297.98.613 1.486.885.156.084.44.132.55.047.112-.088.146-.372.105-.545-.45-1.943-.92-3.88-1.392-5.82-.063-.26-.12-.51.176-.65.315-.15.455.108.598.31 1.164 1.642 2.32 3.29 3.494 
4.92.12.17.354.26.535.387.12-.2.316-.39.347-.602.45-2.39 1.082-4.377 1.354-5.93.04-.268.08-.543.4-.555.36-.013.39.28.43.557.336 1.762 1.266 3.96 1.33 5.71.043.28.156.69.352.777.387.172.517-.284.694-.53 1.093-1.53 2.174-3.065 3.26-4.6.05-.07.087-.172.157-.21.174-.094.364-.16.547-.237.063.19.214.402.176.57-.366 1.614-.763 3.22-1.15 4.83-.087.36-.197.715-.247 1.08-.026.178 0 .455.11.535.116.083.398.037.548-.052 1.628-.964 3.245-1.95 4.865-2.928.228-.138.5-.353.69-.047.093.15.026.484-.086.66-.93 1.478-1.893 2.932-2.827 4.406-.12.19-.13.45-.19.676.245.045.498.16.73.126 1.448-.21 2.89-.454 4.336-.686.25-.04.522-.102.633.225.107.32-.144.45-.354.587-1.34.868-2.69 1.725-4.02 2.607-.168.11-.25.353-.373.533.174.11.334.28.523.322 1.628.37 3.26.717 4.893 1.073.115.025.25.024.34.088.13.095.323.245.313.355-.013.146-.17.348-.312.4-.372.133-.77.2-1.158.29-1.18.275-2.363.536-3.537.835-.21.053-.378.262-.566.4.146.163.262.383.445.482 1.206.66 2.426 1.292 3.64 1.935.066.035.154.054.192.108.115.164.21.342.313.514-.166.09-.33.252-.5.26-.797.026-1.597.01-2.396.01.002-.003.002-.008.002-.012z") + path(fill="url(#gradient_allenai1)" d="M71.603 33.652c-.09-.64-.137-1.244-.264-1.832-.712-3.286-1.683-6.49-3.134-9.532-.502-1.054-1.07-2.076-1.304-3.233-.475-2.357-.127-4.583 1.133-6.643 1.34-2.192 3.3-3.552 5.795-4.066 2.904-.598 5.526.072 7.798 2.007 1.602 1.365 2.556 3.113 2.95 5.163.406 2.103.066 4.105-.988 5.968-.82 1.452-1.652 2.897-2.143 4.503-.34 1.117-.653 2.24-.99 3.36-.034.11-.138.253-.236.28-2.556.677-4.873 1.887-7.142 3.203-.476.276-.963.537-1.475.822z") + path(fill="#3D3D3D" d="M610.03 87.8v2.818h-13.405v-17.93h13.277v2.817h-10.13v4.663h8.98v2.818h-8.98V87.8zM584.123 75.272c-3.404 0-5.912 2.818-5.912 6.328v.05c0 3.51 2.484 6.354 5.913 6.354 2.2 0 3.607-.87 5.14-2.307l2.024 2.05c-1.866 1.946-3.914 3.177-7.27 3.177-5.217.004-9.104-4.07-9.104-9.22v-.053c0-5.097 3.81-9.27 9.233-9.27 3.303 0 5.298 1.15 7.013 2.792l-2.022 2.327c-1.435-1.332-2.967-2.228-5.015-2.228zM566.425 90.618L556.55 77.86v12.758h-3.1v-17.93h2.919l9.6 12.4v-12.4h3.09v17.93zM547.68 87.8v2.818h-13.406v-17.93h13.28v2.817H537.42v4.663h8.98v2.818h-8.98V87.8zM520.52 83.345V80.6h7.627v7.535c-1.766 1.51-4.298 2.794-7.472 2.794-5.608 0-9.292-4-9.292-9.226v-.052c0-5.02 3.838-9.27 9.236-9.27 3.12 0 5.04.87 6.885 2.433l-1.996 2.382c-1.385-1.178-2.737-1.922-5.015-1.922-3.303 0-5.81 2.896-5.81 6.328v.05c0 3.688 2.434 6.406 6.094 6.406 1.685 0 3.22-.54 4.322-1.36v-3.355h-4.58zM502.203 72.687h3.146V90.62h-3.147zM484.148 90.618v-17.93h3.147v15.06h9.418v2.87zM466.272 90.618v-17.93h3.15v15.06h9.413v2.87zM460.495 87.8v2.818h-13.402v-17.93h13.274v2.817h-10.13v4.663h8.982v2.818h-8.984V87.8zM436.045 75.606v15.012h-3.17V75.606h-5.685v-2.92h14.533v2.92zM419.19 90.618l-9.874-12.758v12.758h-3.097v-17.93h2.915l9.595 12.4v-12.4h3.095v17.93zM396.25 72.687h3.145V90.62h-3.146zM368.475 90.618v-17.93h3.146v15.06h9.415v2.87zM355.488 72.56h-2.92l-7.878 18.06h3.225l1.842-4.33h8.47l1.814 4.33h3.327l-7.88-18.06zm-4.58 10.936l3.068-7.172 3.097 7.172h-6.166zM336.25 72.687h3.145V90.62h-3.144zM323.568 75.272c-3.4 0-5.91 2.818-5.91 6.328v.05c0 3.51 2.484 6.354 5.91 6.354 2.202 0 3.61-.87 5.143-2.307l2.024 2.05c-1.867 1.946-3.916 3.177-7.27 3.177-5.218.004-9.104-4.07-9.104-9.22v-.053c0-5.097 3.81-9.27 9.234-9.27 3.302 0 5.298 1.15 7.01 2.792l-2.02 2.327c-1.434-1.332-2.967-2.228-5.016-2.228zM305.18 72.687h3.143V90.62h-3.144zM289.323 75.557v4.89h9.032v2.87h-9.032v7.3h-3.146v-17.93h13.33v2.87zM276.203 72.687h3.146V90.62h-3.147zM264.98 
75.606v15.012h-3.173V75.606h-5.682v-2.92h14.534v2.92zM247.33 83.83c2.483-.718 4.25-2.483 4.25-5.43v-.052c0-1.562-.537-2.896-1.485-3.868-1.148-1.128-2.916-1.794-5.168-1.794h-7.982v17.932h3.146v-6.25h3.94l4.403 6.25h3.734l-4.836-6.788zm-7.237-2.254v-6.02h4.58c2.328 0 3.71 1.05 3.71 2.97v.054c0 1.816-1.434 2.996-3.686 2.996h-4.604zM223.96 72.56h-2.92l-7.883 18.06h3.226l1.842-4.33h8.47l1.816 4.33h3.327l-7.878-18.06zm-4.583 10.936l3.068-7.172 3.098 7.172h-6.166zM199.913 79.702c-.742 0-.972-.485-.972-.947 0-.486.13-.616.13-.744 0-.178-.103-.253-.333-.253-1.202 0-3.326 2.74-3.684 3.945l-1.866 6.353h-1.994l2.966-10.09h-1.767l.102-.412h.948c1.306 0 2.354-.053 3.044-.562l-1.05 3.175.05.053c.694-1.254 2.508-3.074 4.12-3.074 1.126 0 1.636.616 1.636 1.28 0 .845-.69 1.278-1.33 1.278zM184.16 76.99c-3.785 0-6.65 3.69-6.65 6.89 0 2.486 1.33 4.484 4.145 4.484 3.07 0 6.422-3.098 6.422-7.25 0-2.588-2.126-4.124-3.916-4.124zm.025 9.428c-.23.36-1.02 1.538-2.608 1.538-1.46 0-2.226-1-2.226-2.23 0-1.205 1.383-5.635 1.92-6.558.563-.974 1.435-1.768 2.737-1.77 1.458 0 2.226.794 2.226 2.052 0 1.487-1.382 5.842-2.048 6.968zM179.1 71.816c-.743 0-1.23-.538-1.23-1.1 0-.69.41-.82.41-1.05 0-.21-.207-.21-.36-.21-1.535 0-2.915 3.947-3.25 5.18l-.793 2.92h2.688l-.076.41h-2.687c-1.51 5.226-2.662 10.144-3.814 12.474-1.74 3.536-2.765 4.228-4.375 4.228-.87 0-1.538-.77-1.538-1.485 0-.77.566-1.278 1.152-1.278.69 0 1.15.433 1.15 1.05 0 .742-.487.742-.487 1.05 0 .18.156.255.384.255.436 0 1.434-.385 2.124-2.51.897-2.767 2-9.017 3.305-13.782h-2.254l.105-.41h2.252c.742-2.92 1.968-5.204 3.352-6.64 1.253-1.278 2.33-1.866 3.453-1.866.92 0 1.712.64 1.712 1.588 0 .69-.56 1.176-1.226 1.176zM517.48 56.98v2.395h-16.987v-23.3h16.818v2.398h-14.19v7.955h12.698v2.398H503.12v8.154zM484.43 38.508v20.87h-2.66v-20.87h-7.81v-2.434h18.285v2.434zM456.373 59.742c-5.752 0-9.676-3.525-9.676-10.117v-13.55h2.628v13.384c0 5.024 2.69 7.854 7.11 7.854 4.293 0 7.02-2.594 7.02-7.69v-13.55h2.624v13.35c0 6.793-3.892 10.318-9.707 10.318zM431.003 38.508v20.87h-2.656v-20.87h-7.814v-2.434h18.28v2.434zM409.43 36.074h2.623v23.303h-2.624zM393.133 38.508v20.87h-2.656v-20.87h-7.814v-2.434h18.284v2.434zM368.566 46.467c5.185 1.132 7.578 3.027 7.578 6.588v.066c0 3.996-3.32 6.593-7.945 6.593-3.693-.002-6.716-1.232-9.508-3.732l1.63-1.93c2.427 2.196 4.753 3.3 7.98 3.3 3.123 0 5.185-1.67 5.185-3.963v-.068c0-2.163-1.163-3.396-6.052-4.427-5.352-1.164-7.81-2.896-7.81-6.723v-.065c0-3.662 3.224-6.358 7.646-6.358 3.39 0 5.82.967 8.18 2.862l-1.528 2.03c-2.162-1.765-4.324-2.53-6.718-2.53-3.024 0-4.954 1.665-4.954 3.762v.067c0 2.195 1.197 3.426 6.316 4.527zM348.22 59.377L333.19 40.27v19.107h-2.56V36.074h2.46l14.66 18.676V36.074h2.56v23.303zM317.865 36.074h2.626v23.303h-2.625zM292.33 59.377L277.306 40.27v19.107h-2.56V36.074h2.46l14.66 18.676V36.074h2.56v23.303zM266.13 56.98v2.395h-16.987v-23.3h16.822v2.398H251.77v7.955h12.7v2.398h-12.7v8.154zM225.24 59.377V36.074h2.625v20.874h13.1v2.43zM201.335 59.377V36.074h2.626v20.874h13.1v2.43zM182.815 35.908h-2.46l-10.604 23.47h2.693l2.76-6.16h12.664l2.726 6.16h2.828l-10.605-23.47zm-6.582 14.914l5.32-11.848 5.284 11.848h-10.604z") + path(fill="url(#gradient_allenai2)" d="M108.15 90.71l15.184-23.13c5.5-8.402 8.25-15.42 8.25-21.055 0-6.14-2.144-10.94-6.428-14.4-4-3.227-10.812-4.947-17.815-5.163v-.023c-.167 0-.335.008-.503.01-.16-.003-.322-.01-.483-.01l.003.022C86.763 27.352 66 39.38 48.067 53.756L39.15 28.02H25.99L0 103.018h14.627S25.14 88.742 40.887 73.58l9.625 29.437h14.626l-13.546-39.09c5.52-4.648 11.43-9.11 17.588-12.973v52.063h13.16V43.88c8.4-3.722 
17.02-5.946 25.504-5.757 6.715 0 10.073 2.802 10.073 8.402 0 4.725-2.346 10.544-7.035 17.46l-23.484 34.57v3.846h48.59V90.71h-27.84zM23.33 76.395l9.237-28.252 4.81 14.715c-5.06 4.56-9.782 9.158-14.048 13.537z") + path(fill="#3181AF" d="M71.614 33.688c.03.13.08.244.14.35 2.87-1.623 5.767-3.1 8.676-4.432.037-.115.08-.228.113-.343.037-.13.072-.26.11-.39H70.58c.25.998.478 2.003.658 3.02.107.602.24 1.2.376 1.795z") + + + //- spaCy features + + symbol#svg_thoughtworks(viewBox="35 20 150 25") + path(fill="#001e2b" d="M175.1 28.47a2 2 0 1 1 2 2 2 2 0 0 1-2-2m3.46 0A1.47 1.47 0 1 0 177.1 30a1.45 1.45 0 0 0 1.46-1.53m-.48 1.1h-.52l-.48-.92h-.36v.92h-.46V27.4h1.05c.57 0 .84.16.84.67 0 .4-.2.56-.58.58zm-.77-1.24c.26 0 .42-.05.42-.32 0-.3-.3-.26-.5-.26h-.5v.6zM49.68 24.5h-4.6v14.74h-3.56V24.5h-4.6v-3h12.76v3M54.4 27.8a6.14 6.14 0 0 1 3.8-1.6c1.4 0 3.36.84 3.36 3.98v9.05h-3.38v-7.68c0-1.02.08-2.64-1.38-2.64a3.5 3.5 0 0 0-2.4 1.4v8.94h-3.37V21.5h3.37v6.3M62.95 32.85c0-3.4 1.87-6.65 5.6-6.65s5.62 3.26 5.62 6.65c0 3.38-1.9 6.66-5.6 6.66s-5.62-3.27-5.62-6.65m7.76 0c0-1.6-.32-4.17-2.14-4.17s-2.14 2.58-2.14 4.17.32 4.17 2.14 4.17 2.14-2.57 2.14-4.17M82.8 37.92a6.16 6.16 0 0 1-3.8 1.6c-1.4 0-3.35-.86-3.35-4v-9.05h3.37v7.68c0 1.03-.06 2.65 1.4 2.65a3.5 3.5 0 0 0 2.4-1.4v-8.93h3.37v12.77h-3.4v-1.32M91.7 35.04a.87.87 0 0 0-.96.8c0 1.66 8.85-1.23 8.85 3.76 0 1.9-1.3 3.9-6.35 3.9-4.4 0-5.9-1.55-5.9-3.2a2.2 2.2 0 0 1 1.4-1.95 2.03 2.03 0 0 1-1-1.62 2.9 2.9 0 0 1 1.65-2.55 4.8 4.8 0 0 1-1.6-3.5c0-2.84 2.4-4.48 5.15-4.48a6.13 6.13 0 0 1 3.15.9 3.13 3.13 0 0 1 2.3-1.06 4.97 4.97 0 0 1 .98.1v2.36a3 3 0 0 0-1.07-.27 1.86 1.86 0 0 0-.9.27 5.1 5.1 0 0 1 .68 2.17c0 2.82-2.37 4.46-5.13 4.46l-1.25-.1m-1.07 4.23a.88.88 0 0 0-.53.8c0 1.18 2.4 1.25 3.2 1.25.57 0 3.53-.06 3.53-1.08 0-.66-.45-.6-2.46-.7zm2.32-6.36c1.42 0 1.83-1.16 1.83-2.37 0-1.17-.55-2.15-1.83-2.15-1.4 0-1.84 1.2-1.84 2.4 0 1.16.55 2.12 1.85 2.12M104.2 27.8a6.14 6.14 0 0 1 3.82-1.6c1.4 0 3.35.84 3.35 3.98v9.05H108v-7.68c0-1.02.07-2.64-1.4-2.64a3.5 3.5 0 0 0-2.4 1.4v8.94h-3.36V21.5h3.37v6.3M117.5 26.47h2.63v2.8h-2.62v6.5c0 .62.2 1.25 1.28 1.25a2.4 2.4 0 0 0 1.35-.47v2.57a7.83 7.83 0 0 1-2.17.4c-2.1 0-3.83-1.2-3.83-3.98v-6.27h-1.57v-2.8h1.57v-3.2h3.38v3.2M134.17 39.24h-1.82l-3-14.5h-.06l-3.18 14.5h-1.82l-3.72-17.74h1.78l2.92 14.32h.05l3.2-14.32h1.74l3.04 14.32h.05l3.12-14.32h1.75l-4.05 17.74M137.46 32.94c0-4.7 2.23-6.65 5.03-6.65s5.03 1.93 5.03 6.64-2.24 6.66-5.04 6.66-5.04-1.94-5.04-6.66m8.3 0c0-2.14-.53-5.15-3.27-5.15s-3.26 3-3.26 5.14.52 5.16 3.25 5.16 3.26-3 3.26-5.16M150.7 29.36h.05c.7-1.5 1.6-2.9 3.48-2.9h.7v1.63c-2.34-.33-3.55 2.18-4.23 4v7.14h-1.78v-12.6h1.78v2.72M158.13 32.5l4.68-5.85h2l-3.48 4.3 4 8.3h-1.92l-3.22-6.85-2.05 2.53v4.3h-1.78V21.5h1.78v11M173.02 29.46a3.26 3.26 0 0 0-2.88-1.67c-1.16 0-2.25.58-2.25 1.86 0 3.08 6.62 1.73 6.62 5.97a3.84 3.84 0 0 1-4.12 3.97 5.17 5.17 0 0 1-4.7-2.64l1.4-.9a3.5 3.5 0 0 0 3.3 2.04 2.2 2.2 0 0 0 2.46-2.14c0-3.17-6.63-1.66-6.63-6.18a3.56 3.56 0 0 1 3.85-3.5 4.83 4.83 0 0 1 4.2 2.2l-1.26.98") + + symbol#svg_wapo(viewBox="0 0 268 206") + path(fill="#111" d="M118.13 17.98v25.04c3.4-1.83 5.76-5 6.9-8.6l.28.14c-.74 8.93-6.83 17.86-16.3 17.86-9.54 0-16.58-7.1-16.58-18.47 0-8.53 5.55-13.88 12.65-18.68-1-.27-2.1-.4-3.18-.4-4.88 0-7.72 3.3-7.72 6.7h-.4c-.08-.48-.08-.95-.08-1.43 0-6.02 3.2-13.33 10.9-13.33 4.94 0 8.73 4.54 14.48 4.54 2.3 0 5.15-1.08 6.3-3.58h.2c-.07 4.4-1.42 8.93-7.45 10.22zm3.66 25.64c-3.4 3.6-7.05 6.64-11.8 6.64-8.78 0-15.55-6.9-15.55-16.92 0-5.9 2.44-9.07 5.14-12.8h-.28c-1.7 1.7-6.16 5.96-6.16 13.2 0 11.1 
6.97 17.94 16.1 17.94 6.37 0 10.7-4.4 12.66-8l-.13-.05zm2.7-32.81c-1.16 2.17-3.6 4.94-8.53 4.94-4.94 0-9.48-3.38-13.47-3.38-3.65 0-6.16 2.44-7.1 4.26l.06.14c1.3-1.3 3.25-2.57 6.43-2.57 5.27 0 8.65 3.24 14 3.24 5.7 0 8.26-3.73 8.73-6.57l-.14-.06zm-18.82 4.67c-3.45 3.6-6.63 7.52-6.63 15.16 0 4.2 1.42 8.53 4.6 11.24l1.96-1.02V22.7l-1.8.96-.35-.68 9.47-5.14c-2.57-.54-4.87-1.7-7.24-2.37zm11.7 2.64c-.47.08-.94.08-1.48.08-.95 0-1.83-.07-2.7-.27v19.77l-8.87 4.8c1.83 1.36 4.06 2.17 6.9 2.17 2.3 0 4.33-.48 6.16-1.3V18.12zm-9.74 3.46l-1.28.75v18.14l1.28-.67v-18.2zm38.9 2.58v23.4c0 6.5-5.74 10.9-12.17 12.2l-.13-.28c3.18-1.56 5-4.74 5-8.12V27.1l-3.65-3.3-1.43 1.55v18.88l2.03 1.83v.14l-5.48 5.95-6.3-5.75v-.27l2.44-2.57v-29.9L134.04 6l.13.07V24.2l5.9-6.36 6.15 5.55 1.15-1.23.54.48-1.35 1.5zm-18.93 22.12v-.13l1.43-1.42V12.5L127.6 14v30.03l-2.16 2.17v.13l5.28 4.74.95-1.08-4.06-3.74zm13.8-20.23l-4.4-3.8-.93 1.02 3.92 3.52v24.37c0 1.62-.35 3.3-.96 4.26l.07.07c1.7-1.22 2.3-3.05 2.3-5.42V26.03zm15.9 11.7v3.46l6.58 5.13 3.8-4.06.53.54-8.8 9.42-8.92-7.18-1.36 1.43-.54-.55 1.56-1.62V27.18l12.8-9.34 6.7 10.83-12.33 9.07zm-5 5.96V26.5l-1.42 1.02v16.85l8.4 6.83 1.08-1.14-8.05-6.37zm5.15-20.85l-.14.07V36.8l6.04-4.4-5.9-9.55zm1.76-1.35l-1.2.88 5.95 9.54 1.22-.88-5.96-9.54zM31.74 115.16l-9.4-8.13-7.18 8.13-9.75-8.4V93.5H3.53c-1.7 0-2.57 1.08-2.98 2.37h-.2C.2 95.33 0 94.37 0 93.3c0-1.76.47-6.36 5.4-6.36V75.3c0-2.24-2.63-3.12-2.63-6.1 0-3.85 3.66-7.64 10.36-10l.27.2c-2.44 1.4-3.72 2.77-3.72 5.68 0 4.46 4.33 3.3 4.33 10.35v2.7l7.93-8.32 8.32 8.2 7.85-8.2 7.58 7.46v25.04l-13.94 12.86zm-24.16-9.48V91.4h-3.8c-1.95 0-2.83 1.3-2.97 2.44l.08.07c.68-.74 1.35-1.14 2.9-1.14h2.38v13.67l8.93 7.7 1-1.2-8.52-7.25zm0-31.06c0-2.78-2.44-3.72-2.44-5.48 0-2.44.95-4.47 2.44-6.36l-.14-.07c-2.03 1.57-3.92 4.07-3.92 6.5 0 2.44 2.64 3.6 2.64 5.76v11.98h1.42V74.62zm14.28 6.77l-5.14-5.02-2.7 2.77v23.96l6.15 5.22 1.7-1.9V81.4zm2.16 24.15v-25.1l-5.75-5.7-1.08 1.1 5.4 5.2v25.24l9.14 7.9 1.08-1-8.8-7.65zM37.15 80.5l-4.26-4.05-2.45 2.5v24.1l6.64 5.8.05-.06V80.5zm2.17-.94l-4.94-4.8-1.02 1.08 4.54 4.33v28.3l1.42-1.37V79.56zM68.07 115.16l-6.43-5.82-5.75 5.8-7.05-6.2-.94 1-.53-.54 1.2-1.28V89.98l12.74-9.2 7.3 5.68 1.3-1.35.54.55-1.5 1.56v18.75l3.6 3.32 1.15-1.22.53.47-6.16 6.64zm-17.32-7.52V89.37l-1.42 1.02v17.92l6.5 5.82 1.08-1.08-6.15-5.42zm10.9-17.8l-5.56-4.33-.2.15v19.7l4.6 4 1.14-1.16V89.84zm2.16 18.4v-19.4l-5.74-4.68-1.36.95 5.7 4.48v19.35l5.67 5.14L69.1 113l-5.3-4.75zM87.77 93.1h6.97v14.47l-10.76 7.58c-1.42-1.62-3.45-2.77-5.95-2.77-2.03 0-3.66.68-5.82 2.5l-.34-.13 9.2-13.4h-5.68v-12.6l11.3-7.84c1.35 1.1 2.37 1.7 4.13 1.7 1.36 0 3.32-.27 4.8-1.48l.28.13-8.12 11.84zM80.2 110.4c-1.98 0-3.53.68-4.9 1.62v.14c1.1-.48 2.18-.6 3.2-.6 1.55 0 4 .74 5.6 2.63l1.57-1.14c-1.3-1.5-2.85-2.64-5.48-2.64zm7.22-11.37h-3.8l-7.9 11.44.06.12c1.97-1.42 3.86-2.37 6.16-2.37 2.03 0 4.13.95 5.42 2.57l.06-.06v-11.7zm-9.87 0v-10.9l-1.43 1.02v11.37h5.48l1.1-1.5h-5.16zM91.7 85.78c-1.43 1.02-2.24 1.56-4.6 1.56-1.37 0-3-.6-4.28-1.96l-.13.07v9.88h2.56l6.5-9.48-.07-.06zm-3.26-.6c-1.22 0-2.77-.6-3.8-1.97l-1.68 1.16c1.28 1.56 2.5 2.24 4.4 2.24 1.2 0 2.97-.34 4.8-1.9l.07-.2c-1.15.47-2.57.67-3.8.67zm1.15 11.64h-4.4l-1.02 1.42h4v13.13l1.4-1.02V96.82zm28.48-9.68v23.4c0 6.5-5.75 10.9-12.18 12.2l-.14-.27c3.18-1.56 5-4.74 5-8.13V90.13l-3.64-3.32-1.42 1.56v18.88l2.03 1.82v.14l-5.48 5.96-6.3-5.76v-.27l2.44-2.57v-29.9l7.16-7.66.14.07V87.2l5.9-6.36 6.14 5.55 1.15-1.22.53.47-1.35 1.5zm-18.95 22.13v-.14l1.42-1.42V75.5L99.13 77v30.03l-2.16 2.17v.14l5.27 
4.73.95-1.08-4.07-3.73zm13.8-20.24l-4.4-3.8-.94 1.03 3.92 3.52v24.36c0 1.62-.34 3.32-.95 4.26l.07.07c1.7-1.22 2.3-3.05 2.3-5.4V89.02zm17.47 26.13l-5.82-5.35-1.36 1.43-.54-.54 1.56-1.7V89.44l-2.84-2.7-1.1 1.2-.53-.53 6.1-6.62 5.4 4.87 1.36-1.5.6.48-1.7 1.83v19.76l3.26 3.05 1.48-1.56.54.55-6.42 6.9zm-4-6.64V88.36l-3.52-3.25-.94 1.1 3.04 2.9v20.1l5.42 4.87 1-1.08-5-4.48zm2.03-28.28l-5.55-5.08 5.08-5.4 5.55 5.06-5.07 5.42zm-3.45-6.16l-1 1 4.4 4.14 1-1.15-4.4-4zM159 115.16l-5.88-5.35V90.06l-3.6-3.25-1.88 2.03v18.27l2.1 1.96v.27l-5.55 5.82-6.3-5.75v-.2l2.43-2.57V89.57l-3.18-2.9-1.3 1.4-.53-.53 6.16-6.7 6.16 5.5v1.34l6.43-6.84 6.02 5.42 1.34-1.42.6.54-1.6 1.7v19.34l3.03 2.85 1.5-1.63.53.54-6.5 6.97zm-17.86-5.9v-.13c0 .14 1.36-1.35 1.36-1.35V88.56l-3.86-3.52-.95 1 3.38 3.13v17.86l-2.1 2.17v.13l5.2 4.74 1.1-1.15-4.14-3.65zm14.15-.53V88.9l-4.2-3.8-1.1 1.16 3.87 3.45v19.7l5.07 4.74 1.08-1.15-4.73-4.27zm22.8 18.6c-2.24-2.43-3.8-3.45-5.9-3.45-2.23 0-4.8.9-7.44 2.3l-.2-.2 9.34-10.82-6.63-5.28-1.3 1.35-.53-.47 1.5-1.63V89.98l12.84-9.2 7.18 5.68 1.22-1.35.54.55-1.42 1.56v23.15c0 3.52 1.96 3.4 1.96 6.02 0 3.05-4.4 5.9-11.17 10.97zm-9-18.74V89.43l-1.43 1.02v18.74l6.7 5.4 1.02-1.2-6.3-4.8zm4.73 13.53c-2.23 0-4.26.68-6.16 1.96v.13c1.22-.48 2.57-1.1 4.88-1.1 2.1 0 3.85 1.16 5.6 3.2l1.37-1.1c-1.7-1.95-3.73-3.1-5.7-3.1zm8.26-6.5c-1.7-1.08-2.1-2.23-2.1-5.15v-1.22l-11.25 13 .07.07c2.17-1.3 4.13-2.24 7.3-2.24 2.72 0 4.54 1.08 6.64 2.84 1.3-1.1 2.17-2.17 2.17-3.86 0-1.35-1.15-2.37-2.84-3.45zM180 89.85l-5.63-4.4-.13.13v20.78l4.33 3.38 1.42-1.62V89.85zm4.12 24.83c-1.76-1.35-1.97-2.5-1.97-5.4V88.7l-5.68-4.6-1.3.87 5.56 4.4v20.84c0 2.92.14 3.74 1.96 4.95 1.62 1.08 2.9 2.23 2.9 3.8 0 .4-.06 1.07-.06 1.07l.06.07c.34-.35.75-.9.75-1.77 0-1.48-.75-2.5-2.23-3.65zm16.24.47l-6.23-4.87-1.3 1.43-.53-.54 1.5-1.62v-23.7h-3.93l-.14-.12 2.37-3.52h1.7v-4.53l7.16-7.65.14.13V82.2h5.4l.15.13-2.37 3.52h-3.18v20.5l3.72 2.85 1.42-1.56.54.55-6.42 6.95zm-4.4-38.64l-1.42 1.57v4.13h1.42v-5.7zm0 32.36v-23h-1.42v23.8l5.75 4.48 1-1.08-5.34-4.2zm32.96-21.05v18.96l-11.44 8.46-8.6-6.5-1 1.16-.56-.54 1.22-1.36V89.44l11.98-8.6 7.98 6.23 1.3-1.35.53.54-1.4 1.55v.02zm-18.2 19.5V88.84l-1.43 1.02v18.2l8.1 6.23 1.3-.95-8-6.02zm10.9-17.25l-5.7-4.47-.07.07v20.17l5.76 4.33v-20.1zm2.15-1.22l-6.3-4.87-1.28.95 6.15 4.74v21.05l1.42-1.1V88.84zm31 26.33l-5.9-5.35V90.06l-3.58-3.25-1.9 2.03v18.27l2.1 1.96v.27l-5.56 5.82-6.3-5.75v-.2l2.45-2.57V89.57l-3.2-2.9-1.28 1.4-.54-.53 6.16-6.7 6.15 5.5v1.34l6.42-6.84 6.03 5.42 1.35-1.42.6.54-1.62 1.7v19.34l3.05 2.85 1.5-1.63.53.54-6.5 6.97zm-17.87-5.9v-.13c0 .14 1.35-1.35 1.35-1.35V88.56l-3.85-3.52-.95 1 3.4 3.13v17.86l-2.1 2.17v.13l5.2 4.74 1.08-1.15-4.13-3.65zm14.14-.53V88.9l-4.2-3.8-1.07 1.16 3.85 3.45v19.7l5.08 4.74 1.08-1.15-4.74-4.27zM105.56 182.32c-1.9-2.03-2.57-2.7-4.8-3.32v13.27l-.14.06-2.7-2.44-7.86 7.24-.14-.07v-18.2c-2.36.54-4.87 1.76-7.3 4.2l-.14-.07c.47-5.22 3.04-9.07 7.44-10.56v-13.2H88.5c-1.96 0-3.05 1.02-4 2.85h-.2c-.2-.62-.33-1.57-.33-3.12 0-2.85 2.1-5.82 5.55-5.82h.4v-7.32l-2.77-2.5-1.36 1.35-.48-.48 6.83-7.24 6.36 5.82v6.16l1.5-1.63v-7.78h.74v7.1l9-9.67 7.65 6.97v29.44l-11.84 8.92zm-15.63-5.96c-2.78.54-4.74 2.24-6.02 4.47l.07.14c1.9-1.62 3.66-2.37 5.96-2.9v-1.7zm2.16-19.42h-3.73c-2.57 0-3.6 1.63-3.65 3.52l.06.07c.82-1.35 1.63-1.83 3.25-2.03h2.64v36.88l1.42-1.35v-37.1zm0-12.18l-3.4-3.1-1.07 1.14 3.04 2.7v7.65h1.42v-8.4zm7.9 3.86l-1.48 1.55v39.2l1.5 1.34v-42.08zm8.88-1.56l-4-3.65-4.12 4.48v23.82c3.52.34 5.95 1.35 8.05 3.65l.07-.07v-28.22zm-8.12 29.3v1.77c2.3.54 3.24 1.22 4.87 
3.1l1.55-1.2c-1.62-2.04-3.24-3.26-6.42-3.67zm10.28-30.38l-4.6-4.27-1.1 1.16 4.28 3.93v31.46l1.42-1.08v-31.2zm29.98 9v18.95l-11.44 8.46-8.6-6.5-1 1.14-.55-.54 1.22-1.36V156.6l11.97-8.6 8 6.24 1.27-1.36.53.54-1.4 1.56zm-18.2 19.5v-18.5l-1.43 1.03v18.2l8.1 6.24 1.3-.94-8-6.03zm10.9-17.27l-5.7-4.46-.06.07V173l5.75 4.33v-20.1zm2.15-1.2l-6.3-4.88-1.27.95 6.15 4.74v21.05l1.42-1.08V156zm20.84 4.26h7v14.48l-10.77 7.58c-1.43-1.63-3.46-2.78-5.96-2.78-2.03 0-3.65.68-5.82 2.5l-.34-.13 9.22-13.38h-5.7v-12.6l11.3-7.85c1.37 1.1 2.38 1.7 4.14 1.7 1.35 0 3.3-.28 4.8-1.5l.28.14-8.13 11.86zm-7.56 17.32c-1.97 0-3.52.68-4.88 1.63v.15c1.1-.48 2.17-.6 3.18-.6 1.56 0 4 .73 5.62 2.63l1.56-1.15c-1.3-1.5-2.85-2.65-5.48-2.65zm7.24-11.37h-3.8l-7.9 11.45.05.14c1.97-1.44 3.86-2.38 6.16-2.38 2.02 0 4.12.94 5.4 2.57l.07-.08v-11.7zm-9.88 0v-10.88l-1.43 1v11.38h5.48l1.1-1.5h-5.15zm14.14-13.25c-1.42 1.02-2.23 1.56-4.6 1.56-1.36 0-2.98-.6-4.27-1.96l-.14.07v9.9h2.57l6.5-9.48-.06-.07zm-3.25-.6c-1.23 0-2.78-.62-3.8-1.97l-1.7 1.15c1.3 1.55 2.52 2.23 4.4 2.23 1.23 0 3-.34 4.8-1.9l.08-.2c-1.15.48-2.57.68-3.8.68zm1.14 11.63h-4.4l-1 1.42h3.98v13.13l1.42-1v-13.55zm16.38 18.34l-6.23-4.87-1.3 1.42-.54-.54 1.5-1.63v-23.68h-3.93l-.14-.14 2.36-3.52h1.7v-4.53l7.16-7.65.14.14v12.04h5.4l.15.14-2.36 3.52h-3.18v20.5l3.72 2.85 1.42-1.56.54.55-6.42 6.97zm-4.4-38.64l-1.42 1.55v4.13h1.42v-5.68zm0 32.35v-23h-1.42v23.8l5.75 4.47 1.02-1.08-5.35-4.2z") + + symbol#svg_venturebeat(viewBox="0 0 1743 222.2") + path(d="M208 0v44.4c-3.5 0-6.5.4-9.4-.1-4.1-.8-5.5.9-6.6 4.5-13.9 45-28 89.9-42 134.8-3.2 10.3-6.3 20.7-9.8 30.9-.5 1.4-2.5 3.3-3.8 3.3-22.5.2-45 .1-67.8.1-.5-1.4-1.1-2.7-1.6-4.1-17.4-55-34.8-110.1-52-165.1-1.2-3.7-2.7-5.1-6.7-4.5-2.6.5-5.5-.1-8.3-.2V0h94v44.3H74.9c10.5 41.1 20.9 81.7 31.3 122.3.3 0 .6.1 1 .1 11.2-40.6 22.4-81.3 33.8-122.5h-18.9V0H208z M356 58.3h63.2c.6 7.4 1.2 14.7 1.9 22.2 3.8-4.4 7-8.9 11-12.4 17.9-15.4 38.5-18.4 60.2-10.4 16.4 6.1 23.4 19.6 23.7 36.5.4 24.1.2 48.3.2 72.5v6.6l12.9.6v43.7h-70.8V212v-92.5c0-8.4-2.9-12.7-9.3-14.8-6.7-2.2-13.6 0-18.2 6-1.1 1.4-1.9 3.1-2.7 4.8-.5 1.2-1 2.6-1 3.8-.1 17.9 0 35.8 0 54.2h9.7v44.1H356v-43.9h12.3v-70.8h-12.2c-.1-15.2-.1-29.7-.1-44.6zM741.9 102.4h-10.8v-44c.8-.1 1.6-.3 2.4-.3h66.6v115.6H813v43.9h-65.5v-16.5c-2.9 3.1-4.9 5.4-7.2 7.5-15.9 14.1-43.9 17.9-62.9 8.3-14.5-7.3-21.7-19.7-22.3-35.4-.9-24.3-.7-48.6-1-72.9v-6.3h-12.7v-44H712v5.6c0 29.3-.1 58.6.1 88 0 4.1.7 8.3 2 12.2 2 5.9 7 8.9 13.2 8.7 6.1-.2 10.5-3.1 12.6-8.8.8-2.2 1.7-4.5 1.7-6.7.4-18.1.3-36.3.3-54.9z M345.7 149h-98.5c-.2 9.1.1 17.6 4.5 25.4 3.6 6.5 9.6 8.9 16.8 8.6 7.2-.3 12.9-3.3 15.9-10.1 1.3-3 2.1-6.2 3.3-9.6h54.6c-2.2 17.5-8.9 32.3-22.9 43.3-9.9 7.8-21.4 12-33.8 13.8-16.7 2.5-33.2 1.8-49.4-3.4-21.7-7-36.3-21.4-43-43-7.3-23.3-7.6-47 .1-70.3 9.4-28.7 30.1-44.2 59.5-48.6 13.2-2 26.3-1.1 39.1 2.4 29.9 8.1 45.9 28.7 50.8 58.4 1.8 10.6 2 21.5 3 33.1zm-96.9-30.8H287c.5-8.5-.7-16.1-8.2-20.9-6.8-4.3-14.3-4.7-21.2-.4-7.7 4.9-8.7 12.8-8.8 21.3zM1114 148.9h-98.2c-.2 9-.2 17.6 4.3 25.4 3.8 6.7 9.9 9.1 17.3 8.7 7.4-.4 13.1-3.8 15.9-10.9 1.1-2.8 1.8-5.7 2.8-8.8h54.7c-3.5 32.1-26 53.9-59.4 57.6-15.6 1.7-30.9 1-46-3.7-22.3-7-37.2-21.7-44-44-6.9-23-7.2-46.3.3-69.3 9.5-28.9 30.3-44.5 59.9-48.8 13.9-2 27.6-.9 41 3.1 27.5 8.3 43 27.6 48.1 55.2 2.1 11.4 2.2 23.1 3.3 35.5zm-96.4-30.8h38c.1-16-7.7-24.9-20.6-23.9-11.9.9-19.2 11-17.4 23.9z M535.6 58h18c0-10.6.4-20.9-.1-31.2-.3-5.4 1.5-7.4 6.8-8.5 15.2-3.1 30.2-6.7 46-10.3v50h25.6v44.7h-25c-.2 1.8-.4 3.3-.4 4.7v51.5c0 1.8.2 3.7.4 5.5 1.3 9.8 8.2 14.9 18 13.3 1.6-.3 3.2-.6 5.6-1v27.7c0 12.9 0 12.9-12.7 
14.9-13.6 2.2-27.1 2.9-40.7-.3-19.1-4.6-27.8-15.5-27.9-35.3V103h-13.7c.1-15.3.1-29.8.1-45zM826.2 217.6v-43.9h12.7v-70.9h-12.6V58.3h62.1l1.9 25.3 2-4.4c5.1-12.9 14.4-20.7 28.3-22.2 6.7-.7 13.6-.1 20.3.3 1.2.1 3.4 2 3.4 3.1.2 15.8 0 31.6 0 47.5 0 .3-.3.6-.6 1.1-7.6 0-15.5-1-23.1.2-16.2 2.6-23.8 12-24.5 28.5-.2 5.8-.2 11.7-.3 17.5v18.2h18v44.3c-29.1-.1-58.1-.1-87.6-.1z") + path(fill="#ED1E25" d="M1237 .3c8.5 1.4 17.1 2.2 25.4 4.3 34.3 8.6 51.7 50.6 33.5 80.3-4.4 7.2-10.5 12.4-17.7 16.5-3.2 1.8-6.4 3.5-10.3 5.5 2 .8 3.4 1.6 4.9 2 23.7 6.9 34.2 24.4 35.9 47.6 2.4 31.9-17.7 55.7-49.6 59.6-9.9 1.2-19.9 1.9-29.9 1.9-31.7.2-63.3.1-95 .1h-5.8v-43.8h18.9V44.4H1128V.2c36.3.1 72.7.1 109 .1zm-32.3 128.8c0 14.9-.1 28.5.1 42.2 0 .9 2 2.7 3 2.7 8.3 0 16.7 0 24.9-.7 6.1-.5 11.7-2.8 15.1-8.4 8-13.2.4-31.6-14.7-34.2-9-1.6-18.4-1.1-28.4-1.6zm.2-40.5c8.7-.5 16.9-.2 24.8-1.6 9.6-1.7 16.2-11 16.3-21.2 0-10.2-5.9-19.7-14.7-21.3-8.5-1.5-17.4-1.4-26.4-2v46.1z M1743 103.3c-7.5-.1-15-.4-22.4-.2-1.1 0-3.2 1.9-3.2 3-.2 18.8-.6 37.7.1 56.5.4 12.3 7.9 17.4 20 15.2 1-.2 2-.2 3.2-.3.2 1.2.5 2.3.5 3.4 0 10.8 0 21.7.1 32.5 0 2.4-.3 4.2-3.1 4.7-16.5 2.7-32.9 5.1-49.6 1.2-18.7-4.4-27.7-14.3-28.1-33.4-.5-25.5-.2-51-.3-76.5V103h-6.4c-8.3-.1-7.3.9-7.4-7.6V58.5h18.4c0-10.1-.1-19.8 0-29.4.1-10.6-1.5-8.2 8.7-10.7 14.2-3.4 28.5-6.5 43.5-10v49.9h26v45z M1569.2 119.2c0-5.4.3-10-.1-14.6-.6-8.5-6.1-14.1-13.8-14.3-7.7-.2-14.1 5.5-15.3 13.7-.3 1.8-.3 3.6-.5 5.8h-53.3c-1.9-20.2 8.6-38.7 28.2-47.2 28.5-12.3 57.2-11.2 85.1 2.2 17.1 8.2 25.9 22.7 26.2 41.7.4 20.3.2 40.7.3 61v6.6h12.8v43.8h-66.2c-.5-5.4-1-11-1.6-17.4-1.5 1.7-2.5 2.7-3.4 3.8-17.3 21.3-50.3 21.2-67.2 11.3-13.4-7.9-19.2-20.5-20.1-35.4-2-32.6 15.1-53.7 48.1-58.7 11.6-1.8 23.5-1.6 35.3-2.3 1.6-.1 3.2 0 5.5 0zm.7 28.2c-5.4 0-9.7-.6-13.9.1-12.9 2.1-19.5 11.1-18.1 24.1 1.2 10.7 10.4 16.1 20.3 11.9 5.3-2.2 8.9-6.3 9.7-11.8 1.2-7.9 1.4-16 2-24.3z M1475.6 149.2h-98.5c0 9.7.1 18.9 5.6 27 4.2 6.2 10.6 7.7 17.6 7 6.8-.7 11.9-4.1 14.6-10.5 1.2-2.7 1.8-5.7 2.8-9h54.4c-2.2 17.5-8.9 32.5-23.3 43.3-17 12.8-36.8 15.8-57.3 14.4-8.4-.5-16.9-2-25-4.5-21.4-6.5-36-20.6-42.8-41.9-8-25-8.2-50.2 1.1-74.9 10.3-27.1 31.1-41 59.2-44.8 13.7-1.8 27.3-.7 40.5 3.4 28.2 8.7 43.2 28.8 47.9 57 2.1 10.8 2.3 21.8 3.2 33.5zm-58.1-30.5c.1-9-.9-17.2-9.5-21.8-7.3-3.9-14.9-4-21.6 1.2-6.6 5.1-7.8 12.5-7.3 20.6h38.4z") + + symbol#svg_microsoft(viewBox="0 0 609 130") + path(fill="#737373" d="M213.2 74.3l-3.6 10.2h-.3c-.6-2.3-1.7-5.8-3.5-10L186.5 26h-18.9v77.3h12.5V55.6c0-3 0-6.4-.1-10.6-.1-2.1-.3-3.7-.4-4.9h.3c.6 3 1.3 5.2 1.8 6.6l23.2 56.4h8.8l23-56.9c.5-1.3 1-3.9 1.5-6.1h.3c-.3 5.7-.5 10.8-.6 13.9v49h13.3V25.8H233l-19.8 48.5zm50.6-26.7h13V103h-13zm6.6-23.4c-2.2 0-4 .8-5.5 2.2-1.5 1.4-2.3 3.2-2.3 5.4 0 2.1.8 3.9 2.3 5.3 1.5 1.4 3.3 2.1 5.5 2.1s4.1-.8 5.5-2.1c1.5-1.4 2.3-3.2 2.3-5.3s-.8-3.9-2.3-5.4c-1.3-1.4-3.2-2.2-5.5-2.2m52.5 22.9c-2.4-.5-4.9-.8-7.3-.8-5.9 0-11.3 1.3-15.8 3.9-4.5 2.6-8.1 6.2-10.4 10.7-2.4 4.6-3.6 9.9-3.6 16 0 5.3 1.2 10 3.5 14.3 2.3 4.2 5.5 7.6 9.8 9.9 4.1 2.3 8.9 3.5 14.3 3.5 6.2 0 11.5-1.3 15.7-3.7l.1-.1v-12l-.5.4c-1.9 1.4-4.1 2.6-6.3 3.3-2.3.8-4.4 1.2-6.2 1.2-5.2 0-9.3-1.5-12.2-4.8-3-3.2-4.5-7.6-4.5-13.1 0-5.7 1.5-10.2 4.6-13.5 3.1-3.3 7.2-5 12.2-5 4.2 0 8.5 1.4 12.4 4.2l.5.4V49.2l-.1-.1c-1.7-.7-3.6-1.5-6.2-2m42.9-.4c-3.2 0-6.2 1-8.8 3.1-2.2 1.8-3.7 4.4-5 7.5h-.1v-9.7h-13V103h13V74.7c0-4.8 1-8.8 3.2-11.7 2.2-3 5-4.5 8.4-4.5 1.2 0 2.4.3 3.9.5 1.4.4 2.4.8 3.1 1.3l.5.4v-13l-.3-.1c-.9-.6-2.7-.9-4.9-.9m35.4-.3c-9.1 0-16.4 2.7-21.5 8-5.2 5.3-7.7 12.6-7.7 21.8 0 8.6 2.6 15.6 7.6 20.7 5 5 11.8 7.6 20.3 7.6 8.9 0 
16-2.7 21.1-8.1 5.2-5.4 7.7-12.6 7.7-21.5 0-8.8-2.4-15.8-7.3-20.9-4.7-5.1-11.6-7.6-20.2-7.6M411.6 89c-2.4 3.1-6.2 4.6-10.9 4.6s-8.5-1.5-11.2-4.8c-2.7-3.1-4-7.6-4-13.3 0-5.9 1.4-10.4 4-13.6 2.7-3.2 6.4-4.8 11.1-4.8 4.6 0 8.2 1.5 10.8 4.6 2.6 3.1 4 7.6 4 13.5-.2 6-1.3 10.7-3.8 13.8m46.1-18.4c-4.1-1.7-6.7-3-7.9-4.1-1-1-1.5-2.4-1.5-4.2 0-1.5.6-3 2.1-4s3.2-1.5 5.7-1.5c2.2 0 4.5.4 6.7 1s4.2 1.5 5.8 2.7l.5.4V48.7l-.3-.1c-1.5-.6-3.5-1.2-5.9-1.7-2.4-.4-4.6-.6-6.4-.6-6.2 0-11.3 1.5-15.3 4.8-4 3.1-5.9 7.3-5.9 12.2 0 2.6.4 4.9 1.3 6.8.9 1.9 2.2 3.7 4 5.2 1.8 1.4 4.4 3 8 4.5 3 1.3 5.3 2.3 6.7 3.1 1.4.8 2.3 1.7 3 2.4.5.8.8 1.8.8 3.1 0 3.7-2.8 5.5-8.5 5.5-2.2 0-4.5-.4-7.2-1.3s-5.2-2.2-7.3-3.7l-.5-.4v12.7l.3.1c1.9.9 4.2 1.5 7 2.2 2.8.5 5.3.9 7.5.9 6.7 0 12.2-1.5 16.1-4.8 4-3.2 6.1-7.3 6.1-12.6 0-3.7-1-7-3.2-9.5-2.9-2.4-6.5-4.9-11.7-6.9m49.2-24.2c-9.1 0-16.4 2.7-21.5 8s-7.7 12.6-7.7 21.8c0 8.6 2.6 15.6 7.6 20.7 5 5 11.8 7.6 20.3 7.6 8.9 0 16-2.7 21.1-8.1 5.2-5.4 7.7-12.6 7.7-21.5 0-8.8-2.4-15.8-7.3-20.9-4.7-5.1-11.6-7.6-20.2-7.6M517.2 89c-2.4 3.1-6.2 4.6-10.9 4.6-4.8 0-8.5-1.5-11.2-4.8-2.7-3.1-4-7.6-4-13.3 0-5.9 1.4-10.4 4-13.6 2.7-3.2 6.4-4.8 11.1-4.8 4.5 0 8.2 1.5 10.8 4.6 2.6 3.1 4 7.6 4 13.5 0 6-1.3 10.7-3.8 13.8M603.9 58.3V47.6h-13.1V31.2l-.4.1L578 35l-.3.1v12.5h-19.6v-7c0-3.2.8-5.7 2.2-7.3s3.5-2.4 6.1-2.4c1.8 0 3.7.4 5.8 1.3l.5.3V21.2l-.3-.1c-1.8-.6-4.2-1-7.3-1-3.9 0-7.3.9-10.4 2.4-3.1 1.7-5.4 4-7.1 7.1-1.7 3-2.6 6.4-2.6 10.3v7.7h-9.1v10.6h9.1V103h13.1V58.3h19.6v28.5c0 11.7 5.5 17.6 16.5 17.6 1.8 0 3.7-.3 5.5-.6 1.9-.4 3.3-.9 4.1-1.3l.1-.1V91.7l-.5.4c-.8.5-1.5.9-2.7 1.2-1 .3-1.9.4-2.6.4-2.6 0-4.4-.6-5.7-2.1-1.2-1.4-1.8-3.7-1.8-7.1V58.3h13.3z") + path(fill="#F25022" d="M0 0h61.3v61.3H0z") + path(fill="#7FBA00" d="M67.7 0H129v61.3H67.7z") + path(fill="#00A4EF" d="M0 67.7h61.3V129H0z") + path(fill="#FFB900" d="M67.7 67.7H129V129H67.7z") + + + //- Filters etc. + defs + radialGradient#gradient_allenai1(cx="75.721" cy="20.894" r="11.05" gradientUnits="userSpaceOnUse") + stop(offset=".3" stop-color="#FDEA65") + stop(offset="1" stop-color="#FCB431") + radialGradient#gradient_allenai2(cx="75.4" cy="42.297" r="82.993" gradientUnits="userSpaceOnUse") + stop(offset="0" stop-color="#3FA9D0") + stop(offset="1" stop-color="#183A74") diff --git a/website/_layout.jade b/website/_layout.jade index 482af35fa..49a025d96 100644 --- a/website/_layout.jade +++ b/website/_layout.jade @@ -2,11 +2,16 @@ include _includes/_mixins +- title = IS_MODELS ? LANGUAGES[current.source] || title : title +- social_title = (SECTION == "index") ? SITENAME + " - " + SLOGAN : title + " - " + SITENAME +- social_img = SITE_URL + "/assets/img/social/preview_" + (preview || ALPHA ? "alpha" : "default") + ".jpg" + doctype html html(lang="en") title - if SECTION == "docs" && SUBSECTION && SUBSECTION != "index" - | #{title} | #{SITENAME} #{SUBSECTION == "api" ? "API" : "Usage"} Documentation + if SECTION == "api" || SECTION == "usage" || SECTION == "models" + - var title_section = (SECTION == "api") ? 
"API" : SECTION.charAt(0).toUpperCase() + SECTION.slice(1) + | #{title} | #{SITENAME} #{title_section} Documentation else if SECTION != "index" | #{title} | #{SITENAME} @@ -22,32 +27,30 @@ html(lang="en") meta(property="og:type" content="website") meta(property="og:site_name" content=sitename) meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}") - meta(property="og:title" content="#{title} - spaCy") + meta(property="og:title" content=social_title) meta(property="og:description" content=description) - meta(property="og:image" content=getSocialImg()) + meta(property="og:image" content=social_img) meta(name="twitter:card" content="summary_large_image") meta(name="twitter:site" content="@" + SOCIAL.twitter) - meta(name="twitter:title" content="#{title} - spaCy") + meta(name="twitter:title" content=social_title) meta(name="twitter:description" content=description) - meta(name="twitter:image" content=getSocialImg()) + meta(name="twitter:image" content=social_img) link(rel="shortcut icon" href="/assets/img/favicon.ico") link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico") - if ALPHA && SECTION == "docs" + if SECTION == "api" link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet") - else if SUBSECTION == "usage" - link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet") - else link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet") body + include _includes/_svg include _includes/_navigation - if SECTION == "docs" + if !landing include _includes/_page-docs else diff --git a/website/api/_annotation/_biluo.jade b/website/api/_annotation/_biluo.jade new file mode 100644 index 000000000..34d93f768 --- /dev/null +++ b/website/api/_annotation/_biluo.jade @@ -0,0 +1,43 @@ +//- 💫 DOCS > API > ANNOTATION > BILUO + ++table(["Tag", "Description"]) + +row + +cell #[code #[span.u-color-theme B] EGIN] + +cell The first token of a multi-token entity. + + +row + +cell #[code #[span.u-color-theme I] N] + +cell An inner token of a multi-token entity. + + +row + +cell #[code #[span.u-color-theme L] AST] + +cell The final token of a multi-token entity. + + +row + +cell #[code #[span.u-color-theme U] NIT] + +cell A single-token entity. + + +row + +cell #[code #[span.u-color-theme O] UT] + +cell A non-entity token. + ++aside("Why BILUO, not IOB?") + | There are several coding schemes for encoding entity annotations as + | token tags. These coding schemes are equally expressive, but not + | necessarily equally learnable. + | #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth] + | showed that the minimal #[strong Begin], #[strong In], #[strong Out] + | scheme was more difficult to learn than the #[strong BILUO] scheme that + | we use, which explicitly marks boundary tokens. + +p + | spaCy translates the character offsets into this scheme, in order to + | decide the cost of each action given the current state of the entity + | recogniser. The costs are then used to calculate the gradient of the + | loss, to train the model. The exact algorithm is a pastiche of + | well-known methods, and is not currently described in any single + | publication. The model is a greedy transition-based parser guided by a + | linear model whose weights are learned using the averaged perceptron + | loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] + | imitation learning strategy. The transition system is equivalent to the + | BILOU tagging scheme. 
diff --git a/website/docs/api/_annotation/_dep-labels.jade b/website/api/_annotation/_dep-labels.jade similarity index 100% rename from website/docs/api/_annotation/_dep-labels.jade rename to website/api/_annotation/_dep-labels.jade diff --git a/website/docs/api/_annotation/_named-entities.jade b/website/api/_annotation/_named-entities.jade similarity index 57% rename from website/docs/api/_annotation/_named-entities.jade rename to website/api/_annotation/_named-entities.jade index 476659d4a..4cc8a707f 100644 --- a/website/docs/api/_annotation/_named-entities.jade +++ b/website/api/_annotation/_named-entities.jade @@ -1,6 +1,11 @@ //- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES -+table([ "Type", "Description" ]) +p + | Models trained on the + | #[+a("https://catalog.ldc.upenn.edu/ldc2013t19") OntoNotes 5] corpus + | support the following entity types: + ++table(["Type", "Description"]) +row +cell #[code PERSON] +cell People, including fictional. @@ -37,13 +42,14 @@ +cell #[code WORK_OF_ART] +cell Titles of books, songs, etc. + +row + +cell #[code LAW] + +cell Named documents made into laws. + +row +cell #[code LANGUAGE] +cell Any named language. -p The following values are also annotated in a style similar to names: - -+table([ "Type", "Description" ]) +row +cell #[code DATE] +cell Absolute or relative dates or periods. @@ -71,3 +77,33 @@ p The following values are also annotated in a style similar to names: +row +cell #[code CARDINAL] +cell Numerals that do not fall under another type. + ++h(4, "ner-wikipedia-scheme") Wikipedia scheme + +p + | Models trained on Wikipedia corpus + | (#[+a("http://www.sciencedirect.com/science/article/pii/S0004370212000276") Nothman et al., 2013]) + | use a less fine-grained NER annotation scheme and recognise the + | following entities: + ++table(["Type", "Description"]) + +row + +cell #[code PER] + +cell Named person or family. + + +row + +cell #[code LOC] + +cell + | Name of politically or geographically defined location (cities, + | provinces, countries, international regions, bodies of water, + | mountains). + + +row + +cell #[code ORG] + +cell Named corporate, governmental, or other organizational entity. + + +row + +cell #[code MISC] + +cell + | Miscellaneous entities, e.g. events, nationalities, products or + | works of art. diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/api/_annotation/_pos-tags.jade similarity index 100% rename from website/docs/api/_annotation/_pos-tags.jade rename to website/api/_annotation/_pos-tags.jade diff --git a/website/api/_annotation/_training.jade b/website/api/_annotation/_training.jade new file mode 100644 index 000000000..9bd59cdae --- /dev/null +++ b/website/api/_annotation/_training.jade @@ -0,0 +1,104 @@ +//- 💫 DOCS > API > ANNOTATION > TRAINING + ++h(3, "json-input") JSON input format for training + +p + | spaCy takes training data in JSON format. The built-in + | #[+api("cli#convert") #[code convert]] command helps you convert the + | #[code .conllu] format used by the + | #[+a("https://github.com/UniversalDependencies") Universal Dependencies corpora] + | to spaCy's training format. + ++aside("Annotating entities") + | Named entities are provided in the #[+a("/api/annotation#biluo") BILUO] + | notation. Tokens outside an entity are set to #[code "O"] and tokens + | that are part of an entity are set to the entity label, prefixed by the + | BILUO marker. 
For example #[code "B-ORG"] describes the first token of + | a multi-token #[code ORG] entity and #[code "U-PERSON"] a single + | token representing a #[code PERSON] entity. The + | #[+api("goldparse#biluo_tags_from_offsets") #[code biluo_tags_from_offsets]] + | function can help you convert entity offsets to the right format. + ++code("Example structure"). + [{ + "id": int, # ID of the document within the corpus + "paragraphs": [{ # list of paragraphs in the corpus + "raw": string, # raw text of the paragraph + "sentences": [{ # list of sentences in the paragraph + "tokens": [{ # list of tokens in the sentence + "id": int, # index of the token in the document + "dep": string, # dependency label + "head": int, # offset of token head relative to token index + "tag": string, # part-of-speech tag + "orth": string, # verbatim text of the token + "ner": string # BILUO label, e.g. "O" or "B-ORG" + }], + "brackets": [{ # phrase structure (NOT USED by current models) + "first": int, # index of first token + "last": int, # index of last token + "label": string # phrase label + }] + }] + }] + }] + +p + | Here's an example of dependencies, part-of-speech tags and named + | entities, taken from the English Wall Street Journal portion of the Penn + | Treebank: + ++github("spacy", "examples/training/training-data.json", false, false, "json") + ++h(3, "vocab-jsonl") Lexical data for vocabulary + +tag-new(2) + +p + | To populate a model's vocabulary, you can use the + | #[+api("cli#vocab") #[code spacy vocab]] command and load in a + | #[+a("https://jsonlines.readthedocs.io/en/latest/") newline-delimited JSON] + | (JSONL) file containing one lexical entry per line. The first line + | defines the language and vocabulary settings. All other lines are + | expected to be JSON objects describing an individual lexeme. The lexical + | attributes will then be set as attributes on spaCy's + | #[+api("lexeme#attributes") #[code Lexeme]] object. The #[code vocab] + | command outputs a ready-to-use spaCy model with a #[code Vocab] + | containing the lexical data. + ++code("First line"). + {"lang": "en", "settings": {"oov_prob": -20.502029418945312}} + ++code("Entry structure"). + { + "orth": string, + "id": int, + "lower": string, + "norm": string, + "shape": string, + "prefix": string, + "suffix": string, + "length": int, + "cluster": string, + "prob": float, + "is_alpha": bool, + "is_ascii": bool, + "is_digit": bool, + "is_lower": bool, + "is_punct": bool, + "is_space": bool, + "is_title": bool, + "is_upper": bool, + "like_url": bool, + "like_num": bool, + "like_email": bool, + "is_stop": bool, + "is_oov": bool, + "is_quote": bool, + "is_left_punct": bool, + "is_right_punct": bool + } + +p + | Here's an example of the 20 most frequent lexemes in the English + | training data: + ++github("spacy", "examples/training/vocab-data.jsonl", false, false, "json") diff --git a/website/api/_architecture/_cython.jade b/website/api/_architecture/_cython.jade new file mode 100644 index 000000000..84b98b824 --- /dev/null +++ b/website/api/_architecture/_cython.jade @@ -0,0 +1,115 @@ +//- 💫 DOCS > API > ARCHITECTURE > CYTHON + ++aside("What's Cython?") + | #[+a("http://cython.org/") Cython] is a language for writing + | C extensions for Python. Most Python code is also valid Cython, but + | you can add type declarations to get efficient memory-managed code + | just like C or C++. + +p + | spaCy's core data structures are implemented as + | #[+a("http://cython.org/") Cython] #[code cdef] classes. 
Memory is + | managed through the #[+a(gh("cymem")) #[code cymem]] + | #[code cymem.Pool] class, which allows you + | to allocate memory which will be freed when the #[code Pool] object + | is garbage collected. This means you usually don't have to worry + | about freeing memory. You just have to decide which Python object + | owns the memory, and make it own the #[code Pool]. When that object + | goes out of scope, the memory will be freed. You do have to take + | care that no pointers outlive the object that owns them — but this + | is generally quite easy. + +p + | All Cython modules should have the #[code # cython: infer_types=True] + | compiler directive at the top of the file. This makes the code much + | cleaner, as it avoids the need for many type declarations. If + | possible, you should prefer to declare your functions #[code nogil], + | even if you don't especially care about multi-threading. The reason + | is that #[code nogil] functions help the Cython compiler reason about + | your code quite a lot — you're telling the compiler that no Python + | dynamics are possible. This lets many errors be raised, and ensures + | your function will run at C speed. + + +p + | Cython gives you many choices of sequences: you could have a Python + | list, a numpy array, a memory view, a C++ vector, or a pointer. + | Pointers are preferred, because they are fastest, have the most + | explicit semantics, and let the compiler check your code more + | strictly. C++ vectors are also great — but you should only use them + | internally in functions. It's less friendly to accept a vector as an + | argument, because that asks the user to do much more work. Here's + | how to get a pointer from a numpy array, memory view or vector: + ++code. + cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil: + pointer1 = <int*>numpy_array.data + pointer2 = cpp_vector.data() + pointer3 = &memory_view[0] + +p + | Both C arrays and C++ vectors reassure the compiler that no Python + | operations are possible on your variable. This is a big advantage: + | it lets the Cython compiler raise many more errors for you. + +p + | When getting a pointer from a numpy array or memoryview, take care + | that the data is actually stored in C-contiguous order — otherwise + | you'll get a pointer to nonsense. The type-declarations in the code + | above should generate runtime errors if buffers with incorrect + | memory layouts are passed in. To iterate over the array, the + | following style is preferred: + ++code. + cdef int c_total(const int* int_array, int length) nogil: + total = 0 + for item in int_array[:length]: + total += item + return total + +p + | If this is confusing, consider that the compiler couldn't deal with + | #[code for item in int_array:] — there's no length attached to a raw + | pointer, so how could we figure out where to stop? The length is + | provided in the slice notation as a solution to this. Note that we + | don't have to declare the type of #[code item] in the code above — + | the compiler can easily infer it. This gives us tidy code that looks + | quite like Python, but is exactly as fast as C — because we've made + | sure the compilation to C is trivial. + +p + | Your functions cannot be declared #[code nogil] if they need to + | create Python objects or call Python functions. This is perfectly + | okay — you shouldn't torture your code just to get #[code nogil] + | functions. 
However, if your function isn't #[code nogil], you should + | compile your module with #[code cython -a --cplus my_module.pyx] and + | open the resulting #[code my_module.html] file in a browser. This + | will let you see how Cython is compiling your code. Calls into the + | Python run-time will be in bright yellow. This lets you easily see + | whether Cython is able to correctly type your code, or whether there + | are unexpected problems. + +p + | Working in Cython is very rewarding once you're over the initial + | learning curve. As with C and C++, the first way you write something + | in Cython will often be the performance-optimal approach. In + | contrast, Python optimisation generally requires a lot of + | experimentation. Is it faster to have an #[code if item in my_dict] + | check, or to use #[code .get()]? What about + | #[code try]/#[code except]? Does this numpy operation create a copy? + | There's no way to guess the answers to these questions, and you'll + | usually be dissatisfied with your results — so there's no way to + | know when to stop this process. In the worst case, you'll make a + | mess that invites the next reader to try their luck too. This is + | like one of those + | #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps], + | where the rescuers keep passing out from low oxygen, causing + | another rescuer to follow — only to succumb themselves. In short, + | just say no to optimizing your Python. If it's not fast enough the + | first time, just switch to Cython. + ++infobox("Resources") + +list.o-no-block + +item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org) + +item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai) + +item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCy’s parser and named entity recogniser] (explosion.ai) diff --git a/website/api/_architecture/_nn-model.jade b/website/api/_architecture/_nn-model.jade new file mode 100644 index 000000000..8080af2ec --- /dev/null +++ b/website/api/_architecture/_nn-model.jade @@ -0,0 +1,141 @@ +//- 💫 DOCS > API > ARCHITECTURE > NN MODEL ARCHITECTURE + +p + | The parsing model is a blend of recent results. The two recent + | inspirations have been the work of Eliyahu Kiperwasser and Yoav Goldberg at + | Bar-Ilan#[+fn(1)], and the SyntaxNet team from Google. The foundation of + | the parser is still based on the work of Joakim Nivre#[+fn(2)], who + | introduced the transition-based framework#[+fn(3)], the arc-eager + | transition system, and the imitation learning objective. The model is + | implemented using #[+a(gh("thinc")) Thinc], spaCy's machine learning + | library. We first predict context-sensitive vectors for each word in the + | input: + ++code. + (embed_lower | embed_prefix | embed_suffix | embed_shape) + >> Maxout(token_width) + >> convolution ** 4 + +p + | This convolutional layer is shared between the tagger, parser and NER, + | and will also be shared by the future neural lemmatizer. Because the + | parser shares these layers with the tagger, the parser does not require + | tag features. I got this trick from David Weiss's "Stack Combination" + | paper#[+fn(4)]. + +p + | To boost the representation, the tagger actually predicts a "super tag" + | with POS, morphology and dependency label#[+fn(5)]. 
The tagger predicts + | these supertags by adding a softmax layer onto the convolutional layer – + | so, we're teaching the convolutional layer to give us a representation + | that's one affine transform from this informative lexical information. + | This is obviously good for the parser (which backprops to the + | convolutions too). The parser model makes a state vector by concatenating + | the vector representations for its context tokens. The current context + | tokens: + ++table + +row + +cell #[code S0], #[code S1], #[code S2] + +cell Top three words on the stack. + + +row + +cell #[code B0], #[code B1] + +cell First two words of the buffer. + + +row + +cell.u-nowrap + | #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1], + | #[code B1L1]#[br] + | #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2], + | #[code B1L2] + +cell + | Leftmost and second leftmost children of #[code S0], #[code S1], + | #[code S2], #[code B0] and #[code B1]. + + +row + +cell.u-nowrap + | #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1], + | #[code B1R1]#[br] + | #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2], + | #[code B1R2] + +cell + | Rightmost and second rightmost children of #[code S0], #[code S1], + | #[code S2], #[code B0] and #[code B1]. + +p + | This makes the state vector quite long: #[code 13*T], where #[code T] is + | the token vector width (128 is working well). Fortunately, there's a way + | to structure the computation to save some expense (and make it more + | GPU-friendly). + +p + | The parser typically visits #[code 2*N] states for a sentence of length + | #[code N] (although it may visit more, if it back-tracks with a + | non-monotonic transition#[+fn(4)]). A naive implementation would require + | #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of + | size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)] + | multiplication, to pre-compute the hidden weights for each positional + | feature with respect to the words in the batch. (Note that our token + | vectors come from the CNN — so we can't play this trick over the + | vocabulary. That's how Stanford's NN parser#[+fn(3)] works — and why its + | model is so big.) + +p + | This pre-computation strategy allows a nice compromise between + | GPU-friendliness and implementation simplicity. The CNN and the wide + | lower layer are computed on the GPU, and then the precomputed hidden + | weights are moved to the CPU, before we start the transition-based + | parsing process. This makes a lot of things much easier. We don't have to + | worry about variable-length batch sizes, and we don't have to implement + | the dynamic oracle in CUDA to train. + +p + | Currently the parser's loss function is multilabel log loss#[+fn(6)], as + | the dynamic oracle allows multiple states to be 0 cost. This is defined + | as follows, where #[code gZ] is the sum of the scores assigned to gold + | classes: + ++code. + (exp(score) / Z) - (exp(score) / gZ) + ++bibliography + +item + | #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations] + br + | Eliyahu Kiperwasser, Yoav Goldberg. 
(2016) + + +item + | #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing] + br + | Yoav Goldberg, Joakim Nivre (2012) + + +item + | #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python] + br + | Matthew Honnibal (2013) + + +item + | #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax] + br + | Yuan Zhang, David Weiss (2016) + + +item + | #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers] + br + | Anders Søgaard, Yoav Goldberg (2016) + + +item + | #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing] + br + | Matthew Honnibal, Mark Johnson (2015) + + +item + | #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks] + br + | Danqi Cheng, Christopher D. Manning (2014) + + +item + | #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques] + br + | Stefan Riezler et al. (2002) diff --git a/website/docs/api/_data.json b/website/api/_data.json similarity index 52% rename from website/docs/api/_data.json rename to website/api/_data.json index e413f200c..886404c99 100644 --- a/website/docs/api/_data.json +++ b/website/api/_data.json @@ -1,85 +1,79 @@ { "sidebar": { - "Introduction": { - "Facts & Figures": "./", - "Languages": "language-models", - "Annotation Specs": "annotation" + "Overview": { + "Architecture": "./", + "Annotation Specs": "annotation", + "Command Line": "cli", + "Functions": "top-level" }, - "Top-level": { - "spacy": "spacy", - "displacy": "displacy", - "Utility Functions": "util", - "Command line": "cli" - }, - "Classes": { + + "Containers": { "Doc": "doc", "Token": "token", "Span": "span", + "Lexeme": "lexeme" + }, + + "Pipeline": { "Language": "language", - "Tokenizer": "tokenizer", + "Pipe": "pipe", "Tensorizer": "tensorizer", "Tagger": "tagger", "DependencyParser": "dependencyparser", "EntityRecognizer": "entityrecognizer", "TextCategorizer": "textcategorizer", + "Tokenizer": "tokenizer", + "Lemmatizer": "lemmatizer", "Matcher": "matcher", - "Lexeme": "lexeme", + "PhraseMatcher": "phrasematcher" + }, + + "Other": { "Vocab": "vocab", "StringStore": "stringstore", "Vectors": "vectors", "GoldParse": "goldparse", - "GoldCorpus": "goldcorpus", - "Binder": "binder" + "GoldCorpus": "goldcorpus" } }, "index": { - "title": "Facts & Figures", - "next": "language-models" - }, - - "language-models": { - "title": "Languages", - "next": "philosophy" - }, - - "philosophy": { - "title": "Philosophy" - }, - - "spacy": { - "title": "spaCy top-level functions", - "source": "spacy/__init__.py", - "next": "displacy" - }, - - "displacy": { - "title": "displaCy", - "tag": "module", - "source": "spacy/displacy", - "next": "util" - }, - 
- "util": { - "title": "Utility Functions", - "source": "spacy/util.py", - "next": "cli" + "title": "Architecture", + "next": "annotation", + "menu": { + "Basics": "basics", + "Neural Network Model": "nn-model", + "Cython Conventions": "cython" + } }, "cli": { "title": "Command Line Interface", + "teaser": "Download, train and package models, and debug spaCy.", "source": "spacy/cli" }, + "top-level": { + "title": "Top-level Functions", + "menu": { + "spacy": "spacy", + "displacy": "displacy", + "Utility Functions": "util", + "Compatibility": "compat" + } + }, + "language": { "title": "Language", "tag": "class", + "teaser": "A text-processing pipeline.", "source": "spacy/language.py" }, "doc": { "title": "Doc", "tag": "class", + "teaser": "A container for accessing linguistic annotations.", "source": "spacy/tokens/doc.pyx" }, @@ -103,6 +97,7 @@ "vocab": { "title": "Vocab", + "teaser": "A storage class for vocabulary and other data shared across a language.", "tag": "class", "source": "spacy/vocab.pyx" }, @@ -115,10 +110,27 @@ "matcher": { "title": "Matcher", + "teaser": "Match sequences of tokens, based on pattern rules.", "tag": "class", "source": "spacy/matcher.pyx" }, + "phrasematcher": { + "title": "PhraseMatcher", + "teaser": "Match sequences of tokens, based on documents.", + "tag": "class", + "tag_new": 2, + "source": "spacy/matcher.pyx" + }, + + "pipe": { + "title": "Pipe", + "teaser": "Abstract base class defining the API for pipeline components.", + "tag": "class", + "tag_new": 2, + "source": "spacy/pipeline.pyx" + }, + "dependenyparser": { "title": "DependencyParser", "tag": "class", @@ -127,37 +139,52 @@ "entityrecognizer": { "title": "EntityRecognizer", + "teaser": "Annotate named entities on documents.", "tag": "class", "source": "spacy/pipeline.pyx" }, "textcategorizer": { "title": "TextCategorizer", + "teaser": "Add text categorization models to spaCy pipelines.", "tag": "class", + "tag_new": 2, "source": "spacy/pipeline.pyx" }, "dependencyparser": { "title": "DependencyParser", + "teaser": "Annotate syntactic dependencies on documents.", "tag": "class", "source": "spacy/pipeline.pyx" }, "tokenizer": { "title": "Tokenizer", + "teaser": "Segment text into words, punctuations marks etc.", "tag": "class", "source": "spacy/tokenizer.pyx" }, + "lemmatizer": { + "title": "Lemmatizer", + "teaser": "Assign the base forms of words.", + "tag": "class", + "source": "spacy/lemmatizer.py" + }, + "tagger": { "title": "Tagger", + "teaser": "Annotate part-of-speech tags on documents.", "tag": "class", "source": "spacy/pipeline.pyx" }, "tensorizer": { "title": "Tensorizer", + "teaser": "Add a tensor with position-sensitive meaning representations to a document.", "tag": "class", + "tag_new": 2, "source": "spacy/pipeline.pyx" }, @@ -169,23 +196,31 @@ "goldcorpus": { "title": "GoldCorpus", + "teaser": "An annotated corpus, using the JSON file format.", "tag": "class", + "tag_new": 2, "source": "spacy/gold.pyx" }, - "binder": { - "title": "Binder", - "tag": "class", - "source": "spacy/tokens/binder.pyx" - }, - "vectors": { "title": "Vectors", + "teaser": "Store, save and load word vectors.", "tag": "class", + "tag_new": 2, "source": "spacy/vectors.pyx" }, "annotation": { - "title": "Annotation Specifications" + "title": "Annotation Specifications", + "teaser": "Schemes used for labels, tags and training data.", + "menu": { + "Tokenization": "tokenization", + "Sentence Boundaries": "sbd", + "POS Tagging": "pos-tagging", + "Lemmatization": "lemmatization", + "Dependencies": "dependency-parsing", 
+ "Named Entities": "named-entities", + "Models & Training": "training" + } } } diff --git a/website/api/_top-level/_compat.jade b/website/api/_top-level/_compat.jade new file mode 100644 index 000000000..dfd42c55f --- /dev/null +++ b/website/api/_top-level/_compat.jade @@ -0,0 +1,91 @@ +//- 💫 DOCS > API > TOP-LEVEL > COMPATIBILITY + +p + | All Python code is written in an + | #[strong intersection of Python 2 and Python 3]. This is easy in Cython, + | but somewhat ugly in Python. Logic that deals with Python or platform + | compatibility only lives in #[code spacy.compat]. To distinguish them from + | the built-in functions, replacement functions are suffixed with an + | underscore, e.g. #[code unicode_]. For specific checks, spaCy uses the + | #[code six] and #[code ftfy] packages. + ++aside-code("Example"). + from spacy.compat import unicode_, json_dumps + + compatible_unicode = unicode_('hello world') + compatible_json = json_dumps({'key': 'value'}) + ++table(["Name", "Python 2", "Python 3"]) + +row + +cell #[code compat.bytes_] + +cell #[code str] + +cell #[code bytes] + + +row + +cell #[code compat.unicode_] + +cell #[code unicode] + +cell #[code str] + + +row + +cell #[code compat.basestring_] + +cell #[code basestring] + +cell #[code str] + + +row + +cell #[code compat.input_] + +cell #[code raw_input] + +cell #[code input] + + +row + +cell #[code compat.json_dumps] + +cell #[code ujson.dumps] with #[code .decode('utf8')] + +cell #[code ujson.dumps] + + +row + +cell #[code compat.path2str] + +cell #[code str(path)] with #[code .decode('utf8')] + +cell #[code str(path)] + ++h(3, "is_config") compat.is_config + +tag function + +p + | Check if a specific configuration of Python version and operating system + | matches the user's setup. Mostly used to display targeted error messages. + ++aside-code("Example"). + from spacy.compat import is_config + + if is_config(python2=True, windows=True): + print("You are using Python 2 on Windows.") + ++table(["Name", "Type", "Description"]) + +row + +cell #[code python2] + +cell bool + +cell spaCy is executed with Python 2.x. + + +row + +cell #[code python3] + +cell bool + +cell spaCy is executed with Python 3.x. + + +row + +cell #[code windows] + +cell bool + +cell spaCy is executed on Windows. + + +row + +cell #[code linux] + +cell bool + +cell spaCy is executed on Linux. + + +row + +cell #[code osx] + +cell bool + +cell spaCy is executed on OS X or macOS. + + +row("foot") + +cell returns + +cell bool + +cell Whether the specified configuration matches the user's platform. diff --git a/website/docs/api/displacy.jade b/website/api/_top-level/_displacy.jade similarity index 91% rename from website/docs/api/displacy.jade rename to website/api/_top-level/_displacy.jade index 59fcca3ca..a3d7240d6 100644 --- a/website/docs/api/displacy.jade +++ b/website/api/_top-level/_displacy.jade @@ -1,14 +1,12 @@ -//- 💫 DOCS > API > DISPLACY - -include ../../_includes/_mixins +//- 💫 DOCS > API > TOP-LEVEL > DISPLACY p | As of v2.0, spaCy comes with a built-in visualization suite. For more | info and examples, see the usage guide on - | #[+a("/docs/usage/visualizers") visualizing spaCy]. + | #[+a("/usage/visualizers") visualizing spaCy]. -+h(2, "serve") displacy.serve ++h(3, "displacy.serve") displacy.serve +tag method +tag-new(2) @@ -60,7 +58,7 @@ p +cell bool +cell | Don't parse #[code Doc] and instead, expect a dict or list of - | dicts. 
#[+a("/usage/visualizers#manual-usage") See here] | for formats and examples. +cell #[code False] @@ -70,7 +68,7 @@ p +cell Port to serve visualization. +cell #[code 5000] -+h(2, "render") displacy.render ++h(3, "displacy.render") displacy.render +tag method +tag-new(2) @@ -127,24 +125,24 @@ p Render a dependency parse tree or named entity visualization. +cell bool +cell | Don't parse #[code Doc] and instead, expect a dict or list of - | dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] + | dicts. #[+a("/usage/visualizers#manual-usage") See here] | for formats and examples. +cell #[code False] - +footrow + +row("foot") +cell returns +cell unicode +cell Rendered HTML markup. +cell -+h(2, "options") Visualizer options ++h(3, "displacy_options") Visualizer options p | The #[code options] argument lets you specify additional settings for | each visualizer. If a setting is not present in the options, the default | value will be used. -+h(3, "options-dep") Dependency Visualizer options ++h(4, "options-dep") Dependency Visualizer options +aside-code("Example"). options = {'compact': True, 'color': 'blue'} @@ -219,7 +217,7 @@ p +cell Distance between words in px. +cell #[code 175] / #[code 85] (compact) -+h(3, "options-ent") Named Entity Visualizer options ++h(4, "displacy_options-ent") Named Entity Visualizer options +aside-code("Example"). options = {'ents': ['PERSON', 'ORG', 'PRODUCT'], @@ -244,6 +242,6 @@ p p | By default, displaCy comes with colours for all - | #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy]. + | #[+a("/api/annotation#named-entities") entity types supported by spaCy]. | If you're using custom entity types, you can use the #[code colors] | setting to add your own colours for them. diff --git a/website/docs/api/spacy.jade b/website/api/_top-level/_spacy.jade similarity index 61% rename from website/docs/api/spacy.jade rename to website/api/_top-level/_spacy.jade index a45307378..c6b342011 100644 --- a/website/docs/api/spacy.jade +++ b/website/api/_top-level/_spacy.jade @@ -1,15 +1,13 @@ -//- 💫 DOCS > API > SPACY +//- 💫 DOCS > API > TOP-LEVEL > SPACY -include ../../_includes/_mixins - -+h(2, "load") spacy.load ++h(3, "spacy.load") spacy.load +tag function +tag-model p - | Load a model via its #[+a("/docs/usage/models#usage") shortcut link], + | Load a model via its #[+a("/usage/models#usage") shortcut link], | the name of an installed - | #[+a("/docs/usage/saving-loading#generating") model package], a unicode + | #[+a("/usage/training#models-generating") model package], a unicode | path or a #[code Path]-like object. spaCy will try resolving the load | argument in this order. If a model is loaded from a shortcut link or | package name, spaCy will assume it's a Python package and import it and @@ -38,25 +36,73 @@ p +cell list +cell | Names of pipeline components to - | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. + | #[+a("/usage/processing-pipelines#disabling") disable]. - +footrow + +row("foot") +cell returns +cell #[code Language] +cell A #[code Language] object with the loaded model. -+infobox("⚠️ Deprecation note") +p + | Essentially, #[code spacy.load()] is a convenience wrapper that reads + | the language ID and pipeline components from a model's #[code meta.json], + | initialises the #[code Language] class, loads in the model data and + | returns it. + ++code("Abstract example"). + cls = util.get_lang_class(lang) # get language for ID, e.g. 
'en' + nlp = cls() # initialise the language + for name in pipeline: + component = nlp.create_pipe(name) # create each pipeline component + nlp.add_pipe(component) # add component to pipeline + nlp.from_disk(model_data_path) # load in model data + ++infobox("Deprecation note", "⚠️") .o-block | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy | will also raise an error if no model could be loaded and never just | return an empty #[code Language] object. If you need a blank language, - | you need to import it explicitly (#[code from spacy.lang.en import English]) - | or use #[+api("util#get_lang_class") #[code util.get_lang_class]]. + | you can use the new function #[+api("spacy#blank") #[code spacy.blank()]] + | or import the class explicitly, e.g. + | #[code from spacy.lang.en import English]. +code-new nlp = spacy.load('/model') +code-old nlp = spacy.load('en', path='/model') -+h(2, "info") spacy.info ++h(3, "spacy.blank") spacy.blank + +tag function + +tag-new(2) + +p + | Create a blank model of a given language class. This function is the + | twin of #[code spacy.load()]. + ++aside-code("Example"). + nlp_en = spacy.blank('en') + nlp_de = spacy.blank('de') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code] + | of the language class to load. + + +row + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/usage/processing-pipelines#disabling") disable]. + + +row("foot") + +cell returns + +cell #[code Language] + +cell An empty #[code Language] object of the appropriate subclass. + + ++h(4, "spacy.info") spacy.info +tag function p @@ -83,16 +129,16 @@ p +cell Print information as Markdown. -+h(2, "explain") spacy.explain ++h(3, "spacy.explain") spacy.explain +tag function p | Get a description for a given POS tag, dependency label or entity type. | For a list of available terms, see - | #[+src(gh("spacy", "spacy/glossary.py")) glossary.py]. + | #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]]. +aside-code("Example"). - spacy.explain('NORP') + spacy.explain(u'NORP') # Nationalities or religious or political groups doc = nlp(u'Hello world') @@ -107,41 +153,7 @@ p +cell unicode +cell Term to explain. - +footrow + +row("foot") +cell returns +cell unicode +cell The explanation, or #[code None] if not found in the glossary. - -+h(2, "set_factory") spacy.set_factory - +tag function - +tag-new(2) - -p - | Set a factory that returns a custom - | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] - | component. Factories are useful for creating stateful components, especially ones which depend on shared data. - -+aside-code("Example"). - def my_factory(vocab): - def my_component(doc): - return doc - return my_component - - spacy.set_factory('my_factory', my_factory) - nlp = Language(pipeline=['my_factory']) - -+table(["Name", "Type", "Description"]) - +row - +cell #[code factory_id] - +cell unicode - +cell - | Unique name of factory. If added to a new pipeline, spaCy will - | look up the factory for this ID and use it to create the - | component. - - +row - +cell #[code factory] - +cell callable - +cell - | Callable that takes a #[code Vocab] object and returns a pipeline - | component. 
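A rough end-to-end sketch of the top-level functions documented above (spacy.load, spacy.blank and spacy.explain); it assumes an English model is installed and linked under the shortcut 'en':

    import spacy

    # full pipeline with model weights (assumes the 'en' shortcut link exists)
    nlp = spacy.load('en', disable=['parser'])

    # blank pipeline of the same language class, without any model data
    nlp_blank = spacy.blank('en')

    # human-readable description of an annotation label
    print(spacy.explain(u'NORP'))  # nationalities or religious or political groups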
diff --git a/website/docs/api/util.jade b/website/api/_top-level/_util.jade similarity index 87% rename from website/docs/api/util.jade rename to website/api/_top-level/_util.jade index 2127446df..1770a111e 100644 --- a/website/docs/api/util.jade +++ b/website/api/_top-level/_util.jade @@ -1,10 +1,8 @@ -//- 💫 DOCS > API > UTIL - -include ../../_includes/_mixins +//- 💫 DOCS > API > TOP-LEVEL > UTIL p | spaCy comes with a small collection of utility functions located in - | #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. + | #[+src(gh("spaCy", "spacy/util.py")) #[code spacy/util.py]]. | Because utility functions are mostly intended for | #[strong internal use within spaCy], their behaviour may change with | future releases. The functions documented on this page should be safe @@ -12,7 +10,7 @@ p | recommend having additional tests in place if your application depends on | any of spaCy's utilities. -+h(2, "get_data_path") util.get_data_path ++h(3, "util.get_data_path") util.get_data_path +tag function p @@ -25,12 +23,12 @@ p +cell bool +cell Only return path if it exists, otherwise return #[code None]. - +footrow + +row("foot") +cell returns +cell #[code Path] / #[code None] +cell Data path or #[code None]. -+h(2, "set_data_path") util.set_data_path ++h(3, "util.set_data_path") util.set_data_path +tag function p @@ -47,12 +45,12 @@ p +cell unicode or #[code Path] +cell Path to new data directory. -+h(2, "get_lang_class") util.get_lang_class ++h(3, "util.get_lang_class") util.get_lang_class +tag function p | Import and load a #[code Language] class. Allows lazy-loading - | #[+a("/docs/usage/adding-languages") language data] and importing + | #[+a("/usage/adding-languages") language data] and importing | languages using the two-letter language code. +aside-code("Example"). @@ -67,12 +65,12 @@ p +cell unicode +cell Two-letter language code, e.g. #[code 'en']. - +footrow + +row("foot") +cell returns +cell #[code Language] +cell Language class. -+h(2, "load_model") util.load_model ++h(3, "util.load_model") util.load_model +tag function +tag-new(2) @@ -101,12 +99,12 @@ p +cell - +cell Specific overrides, like pipeline components to disable. - +footrow + +row("foot") +cell returns +cell #[code Language] +cell #[code Language] class with the loaded model. -+h(2, "load_model_from_path") util.load_model_from_path ++h(3, "util.load_model_from_path") util.load_model_from_path +tag function +tag-new(2) @@ -139,18 +137,18 @@ p +cell - +cell Specific overrides, like pipeline components to disable. - +footrow + +row("foot") +cell returns +cell #[code Language] +cell #[code Language] class with the loaded model. -+h(2, "load_model_from_init_py") util.load_model_from_init_py ++h(3, "util.load_model_from_init_py") util.load_model_from_init_py +tag function +tag-new(2) p | A helper function to use in the #[code load()] method of a model package's - | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]. +aside-code("Example"). from spacy.util import load_model_from_init_py @@ -169,12 +167,12 @@ p +cell - +cell Specific overrides, like pipeline components to disable. - +footrow + +row("foot") +cell returns +cell #[code Language] +cell #[code Language] class with the loaded model. 
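A minimal sketch of how the utilities above fit together, assuming the 'en' shortcut link or package is installed; util.load_model resolves shortcut links, package names and paths, in that order:

    from spacy import util

    # look up the Language subclass for a two-letter code and create a blank pipeline
    cls = util.get_lang_class('en')
    nlp = cls()

    # load a full model; keyword overrides such as disable are passed through
    nlp = util.load_model('en', disable=['parser'])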
-+h(2, "get_model_meta") util.get_model_meta ++h(3, "util.get_model_meta") util.get_model_meta +tag function +tag-new(2) @@ -190,17 +188,17 @@ p +cell unicode or #[code Path] +cell Path to model directory. - +footrow + +row("foot") +cell returns +cell dict +cell The model's meta data. -+h(2, "is_package") util.is_package ++h(3, "util.is_package") util.is_package +tag function p | Check if string maps to a package installed via pip. Mainly used to - | validate #[+a("/docs/usage/models") model packages]. + | validate #[+a("/usage/models") model packages]. +aside-code("Example"). util.is_package('en_core_web_sm') # True @@ -212,18 +210,18 @@ p +cell unicode +cell Name of package. - +footrow + +row("foot") +cell returns +cell #[code bool] +cell #[code True] if installed package, #[code False] if not. -+h(2, "get_package_path") util.get_package_path ++h(3, "util.get_package_path") util.get_package_path +tag function +tag-new(2) p | Get path to an installed package. Mainly used to resolve the location of - | #[+a("/docs/usage/models") model packages]. Currently imports the package + | #[+a("/usage/models") model packages]. Currently imports the package | to find its path. +aside-code("Example"). @@ -236,12 +234,12 @@ p +cell unicode +cell Name of installed package. - +footrow + +row("foot") +cell returns +cell #[code Path] +cell Path to model package directory. -+h(2, "is_in_jupyter") util.is_in_jupyter ++h(3, "util.is_in_jupyter") util.is_in_jupyter +tag function +tag-new(2) @@ -257,17 +255,17 @@ p return display(HTML(html)) +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell bool +cell #[code True] if in Jupyter, #[code False] if not. -+h(2, "update_exc") util.update_exc ++h(3, "util.update_exc") util.update_exc +tag function p | Update, validate and overwrite - | #[+a("/docs/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions]. + | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions]. | Used to combine global exceptions with custom, language-specific | exceptions. Will raise an error if key doesn't match #[code ORTH] values. @@ -288,20 +286,20 @@ p +cell dicts +cell Exception dictionaries to add to the base exceptions, in order. - +footrow + +row("foot") +cell returns +cell dict +cell Combined tokenizer exceptions. -+h(2, "prints") util.prints ++h(3, "util.prints") util.prints +tag function +tag-new(2) p | Print a formatted, text-wrapped message with optional title. If a text | argument is a #[code Path], it's converted to a string. Should only - | be used for interactive components like the #[+api("cli") cli]. + | be used for interactive components like the command-line interface. +aside-code("Example"). data_path = Path('/some/path') diff --git a/website/api/annotation.jade b/website/api/annotation.jade new file mode 100644 index 000000000..16598371d --- /dev/null +++ b/website/api/annotation.jade @@ -0,0 +1,104 @@ +//- 💫 DOCS > API > ANNOTATION SPECS + +include ../_includes/_mixins + +p This document describes the target annotations spaCy is trained to predict. + + ++section("tokenization") + +h(2, "tokenization") Tokenization + + p + | Tokenization standards are based on the + | #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus. + | The tokenizer differs from most by including tokens for significant + | whitespace. Any sequence of whitespace characters beyond a single space + | (#[code ' ']) is included as a token. + + +aside-code("Example"). 
+ from spacy.lang.en import English + nlp = English() + tokens = nlp('Some\nspaces and\ttab characters') + tokens_text = [t.text for t in tokens] + assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and', + '\t', 'tab', 'characters'] + + p + | The whitespace tokens are useful for much the same reason punctuation is + | – it's often an important delimiter in the text. By preserving it in the + | token output, we are able to maintain a simple alignment between the + | tokens and the original string, and we ensure that no information is + | lost during processing. + ++section("sbd") + +h(2, "sentence-boundary") Sentence boundary detection + + p + | Sentence boundaries are calculated from the syntactic parse tree, so + | features such as punctuation and capitalisation play an important but + | non-decisive role in determining the sentence boundaries. Usually this + | means that the sentence boundaries will at least coincide with clause + | boundaries, even given poorly punctuated text. + ++section("pos-tagging") + +h(2, "pos-tagging") Part-of-speech Tagging + + +aside("Tip: Understanding tags") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of a tag. For example, + | #[code spacy.explain("RB")] will return "adverb". + + include _annotation/_pos-tags + ++section("lemmatization") + +h(2, "lemmatization") Lemmatization + + p A "lemma" is the uninflected form of a word. In English, this means: + + +list + +item #[strong Adjectives]: The form like "happy", not "happier" or "happiest" + +item #[strong Adverbs]: The form like "badly", not "worse" or "worst" + +item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children" + +item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written" + + p + | The lemmatization data is taken from + | #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a + | special case for pronouns: all pronouns are lemmatized to the special + | token #[code -PRON-]. + + +infobox("About spaCy's custom pronoun lemma") + | Unlike verbs and common nouns, there's no clear base form of a personal + | pronoun. Should the lemma of "me" be "I", or should we normalize person + | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a + | novel symbol, #[code -PRON-], which is used as the lemma for + | all personal pronouns. + ++section("dependency-parsing") + +h(2, "dependency-parsing") Syntactic Dependency Parsing + + +aside("Tip: Understanding labels") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of a label. For example, + | #[code spacy.explain("prt")] will return "particle". + + include _annotation/_dep-labels + ++section("named-entities") + +h(2, "named-entities") Named Entity Recognition + + +aside("Tip: Understanding entity types") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". 
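A short illustration of the annotation scheme described above, assuming an installed English model; the exact output depends on the model, so treat the values as indicative:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'She told me a story.')

    # personal pronouns are lemmatized to the special -PRON- symbol
    print([t.lemma_ for t in doc])

    # spacy.explain() resolves tag, dependency label and entity type descriptions
    print(spacy.explain(u'RB'))   # 'adverb'
    print(spacy.explain(u'prt'))  # 'particle'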
+ + include _annotation/_named-entities + + +h(3, "biluo") BILUO Scheme + + include _annotation/_biluo + ++section("training") + +h(2, "training") Models and training data + + include _annotation/_training diff --git a/website/api/cli.jade b/website/api/cli.jade new file mode 100644 index 000000000..cd1cb22fb --- /dev/null +++ b/website/api/cli.jade @@ -0,0 +1,598 @@ +//- 💫 DOCS > API > COMMAND LINE INTERFACE + +include ../_includes/_mixins + +p + | As of v1.7.0, spaCy comes with new command line helpers to download and + | link models and show useful debugging information. For a list of available + | commands, type #[code spacy --help]. + ++h(3, "download") Download + +p + | Download #[+a("/usage/models") models] for spaCy. The downloader finds the + | best-matching compatible version, uses pip to download the model as a + | package and automatically creates a + | #[+a("/usage/models#usage") shortcut link] to load the model by name. + | Direct downloads don't perform any compatibility checks and require the + | model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]). + ++code(false, "bash", "$"). + spacy download [model] [--direct] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code model] + +cell positional + +cell Model name or shortcut (#[code en], #[code de], #[code vectors]). + + +row + +cell #[code --direct], #[code -d] + +cell flag + +cell Force direct download of exact model version. + + +row + +cell #[code --help], #[code -h] + +cell flag + +cell Show help message and available arguments. + + +row("foot") + +cell creates + +cell directory, symlink + +cell + | The installed model package in your #[code site-packages] + | directory and a shortcut link as a symlink in #[code spacy/data]. + ++aside("Downloading best practices") + | The #[code download] command is mostly intended as a convenient, + | interactive wrapper – it performs compatibility checks and prints + | detailed messages in case things go wrong. It's #[strong not recommended] + | to use this command as part of an automated process. If you know which + | model your project needs, you should consider a + | #[+a("/usage/models#download-pip") direct download via pip], or + | uploading the model to a local PyPi installation and fetching it straight + | from there. This will also allow you to add it as a versioned package + | dependency to your project. + ++h(3, "link") Link + +p + | Create a #[+a("/usage/models#usage") shortcut link] for a model, + | either a Python package or a local directory. This will let you load + | models from any location using a custom name via + | #[+api("spacy#load") #[code spacy.load()]]. + ++infobox("Important note") + | In spaCy v1.x, you had to use the model data directory to set up a shortcut + | link for a local path. As of v2.0, spaCy expects all shortcut links to + | be #[strong loadable model packages]. If you want to load a data directory, + | call #[+api("spacy#load") #[code spacy.load()]] or + | #[+api("language#from_disk") #[code Language.from_disk()]] with the path, + | or use the #[+api("cli#package") #[code package]] command to create a + | model package. + ++code(false, "bash", "$"). + spacy link [origin] [link_name] [--force] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code origin] + +cell positional + +cell Model name if package, or path to local directory. + + +row + +cell #[code link_name] + +cell positional + +cell Name of the shortcut link to create. 
+
+ +row
+ +cell #[code --force], #[code -f]
+ +cell flag
+ +cell Force overwriting of existing link.
+
+ +row
+ +cell #[code --help], #[code -h]
+ +cell flag
+ +cell Show help message and available arguments.
+
+ +row("foot")
+ +cell creates
+ +cell symlink
+ +cell
+ | A shortcut link of the given name as a symlink in
+ | #[code spacy/data].
+
++h(3, "info") Info
+
+p
+ | Print information about your spaCy installation, models and local setup,
+ | and generate #[+a("https://en.wikipedia.org/wiki/Markdown") Markdown]-formatted
+ | markup to copy-paste into #[+a(gh("spacy") + "/issues") GitHub issues].
+
++code(false, "bash").
+ spacy info [--markdown]
+ spacy info [model] [--markdown]
+
++table(["Argument", "Type", "Description"])
+ +row
+ +cell #[code model]
+ +cell positional
+ +cell A model, i.e. shortcut link, package name or path (optional).
+
+ +row
+ +cell #[code --markdown], #[code -md]
+ +cell flag
+ +cell Print information as Markdown.
+
+ +row
+ +cell #[code --help], #[code -h]
+ +cell flag
+ +cell Show help message and available arguments.
+
+ +row("foot")
+ +cell prints
+ +cell #[code stdout]
+ +cell Information about your spaCy installation.
+
++h(3, "validate") Validate
+ +tag-new(2)
+
+p
+ | Find all models installed in the current environment (both packages and
+ | shortcut links) and check whether they are compatible with the currently
+ | installed version of spaCy. Should be run after upgrading spaCy via
+ | #[code pip install -U spacy] to ensure that all installed models
+ | can be used with the new version. The command is also useful to detect
+ | out-of-sync model links resulting from links created in different virtual
+ | environments. Prints a list of models, the installed versions, the latest
+ | compatible version (if out of date) and the commands for updating.
+
++code(false, "bash", "$").
+ spacy validate
+
++table(["Argument", "Type", "Description"])
+ +row("foot")
+ +cell prints
+ +cell #[code stdout]
+ +cell Details about the compatibility of your installed models.
+
++h(3, "convert") Convert
+
+p
+ | Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format]
+ | for use with the #[code train] command and other experiment management
+ | functions. The converter can be specified on the command line, or
+ | chosen based on the file extension of the input file.
+
++code(false, "bash", "$", false, false, true).
+ spacy convert [input_file] [output_dir] [--converter] [--n-sents]
+ [--morphology]
+
++table(["Argument", "Type", "Description"])
+ +row
+ +cell #[code input_file]
+ +cell positional
+ +cell Input file.
+
+ +row
+ +cell #[code output_dir]
+ +cell positional
+ +cell Output directory for converted JSON file.
+
+ +row
+ +cell #[code --converter], #[code -c]
+ +cell option
+ +cell #[+tag-new(2)] Name of converter to use (see below).
+
+ +row
+ +cell #[code --n-sents], #[code -n]
+ +cell option
+ +cell Number of sentences per document.
+
+ +row
+ +cell #[code --morphology], #[code -m]
+ +cell option
+ +cell Enable appending morphology to tags.
+
+ +row
+ +cell #[code --help], #[code -h]
+ +cell flag
+ +cell Show help message and available arguments.
+
+ +row("foot")
+ +cell creates
+ +cell JSON
+ +cell Data in spaCy's #[+a("/api/annotation#json-input") JSON format].
+
+p The following converters are available:
+
++table(["ID", "Description"])
+ +row
+ +cell #[code auto]
+ +cell Automatically pick converter based on file extension (default).
+ + +row + +cell #[code conllu], #[code conll] + +cell Universal Dependencies #[code .conllu] or #[code .conll] format. + + +row + +cell #[code ner] + +cell Tab-based named entity recognition format. + + +row + +cell #[code iob] + +cell IOB named entity recognition format. + ++h(3, "train") Train + +p + | Train a model. Expects data in spaCy's + | #[+a("/api/annotation#json-input") JSON format]. On each epoch, a model + | will be saved out to the directory. Accuracy scores and model details + | will be added to a #[+a("/usage/training#models-generating") #[code meta.json]] + | to allow packaging the model using the + | #[+api("cli#package") #[code package]] command. + ++code(false, "bash", "$", false, false, true). + spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--no-entities] [--gold-preproc] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code lang] + +cell positional + +cell Model language. + + +row + +cell #[code output_dir] + +cell positional + +cell Directory to store model in. + + +row + +cell #[code train_data] + +cell positional + +cell Location of JSON-formatted training data. + + +row + +cell #[code dev_data] + +cell positional + +cell Location of JSON-formatted dev data (optional). + + +row + +cell #[code --n-iter], #[code -n] + +cell option + +cell Number of iterations (default: #[code 20]). + + +row + +cell #[code --n-sents], #[code -ns] + +cell option + +cell Number of sentences (default: #[code 0]). + + +row + +cell #[code --use-gpu], #[code -g] + +cell option + +cell Use GPU. + + +row + +cell #[code --vectors], #[code -v] + +cell option + +cell Model to load vectors from. + + +row + +cell #[code --meta-path], #[code -m] + +cell option + +cell + | #[+tag-new(2)] Optional path to model + | #[+a("/usage/training#models-generating") #[code meta.json]]. + | All relevant properties like #[code lang], #[code pipeline] and + | #[code spacy_version] will be overwritten. + + +row + +cell #[code --version], #[code -V] + +cell option + +cell + | Model version. Will be written out to the model's + | #[code meta.json] after training. + + +row + +cell #[code --no-tagger], #[code -T] + +cell flag + +cell Don't train tagger. + + +row + +cell #[code --no-parser], #[code -P] + +cell flag + +cell Don't train parser. + + +row + +cell #[code --no-entities], #[code -N] + +cell flag + +cell Don't train NER. + + +row + +cell #[code --gold-preproc], #[code -G] + +cell flag + +cell Use gold preprocessing. + + +row + +cell #[code --help], #[code -h] + +cell flag + +cell Show help message and available arguments. + + +row("foot") + +cell creates + +cell model, pickle + +cell A spaCy model on each epoch, and a final #[code .pickle] file. + ++h(4, "train-hyperparams") Environment variables for hyperparameters + +tag-new(2) + +p + | spaCy lets you set hyperparameters for training via environment variables. + | This is useful, because it keeps the command simple and allows you to + | #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") create an alias] + | for your custom #[code train] command while still being able to easily + | tweak the hyperparameters. For example: + ++code(false, "bash"). + parser_hidden_depth=2 parser_maxout_pieces=1 train-parser + ++table(["Name", "Description", "Default"]) + +row + +cell #[code dropout_from] + +cell Initial dropout rate. + +cell #[code 0.2] + + +row + +cell #[code dropout_to] + +cell Final dropout rate. 
+ +cell #[code 0.2] + + +row + +cell #[code dropout_decay] + +cell Rate of dropout change. + +cell #[code 0.0] + + +row + +cell #[code batch_from] + +cell Initial batch size. + +cell #[code 1] + + +row + +cell #[code batch_to] + +cell Final batch size. + +cell #[code 64] + + +row + +cell #[code batch_compound] + +cell Rate of batch size acceleration. + +cell #[code 1.001] + + +row + +cell #[code token_vector_width] + +cell Width of embedding tables and convolutional layers. + +cell #[code 128] + + +row + +cell #[code embed_size] + +cell Number of rows in embedding tables. + +cell #[code 7500] + + //- +row + //- +cell #[code parser_maxout_pieces] + //- +cell Number of pieces in the parser's and NER's first maxout layer. + //- +cell #[code 2] + + //- +row + //- +cell #[code parser_hidden_depth] + //- +cell Number of hidden layers in the parser and NER. + //- +cell #[code 1] + + +row + +cell #[code hidden_width] + +cell Size of the parser's and NER's hidden layers. + +cell #[code 128] + + //- +row + //- +cell #[code history_feats] + //- +cell Number of previous action ID features for parser and NER. + //- +cell #[code 128] + + //- +row + //- +cell #[code history_width] + //- +cell Number of embedding dimensions for each action ID. + //- +cell #[code 128] + + +row + +cell #[code learn_rate] + +cell Learning rate. + +cell #[code 0.001] + + +row + +cell #[code optimizer_B1] + +cell Momentum for the Adam solver. + +cell #[code 0.9] + + +row + +cell #[code optimizer_B2] + +cell Adagrad-momentum for the Adam solver. + +cell #[code 0.999] + + +row + +cell #[code optimizer_eps] + +cell Epsylon value for the Adam solver. + +cell #[code 1e-08] + + +row + +cell #[code L2_penalty] + +cell L2 regularisation penalty. + +cell #[code 1e-06] + + +row + +cell #[code grad_norm_clip] + +cell Gradient L2 norm constraint. + +cell #[code 1.0] + ++h(3, "vocab") Vocab + +tag-new(2) + +p + | Compile a vocabulary from a + | #[+a("/api/annotation#vocab-jsonl") lexicon JSONL] file and optional + | word vectors. Will save out a valid spaCy model that you can load via + | #[+api("spacy#load") #[code spacy.load]] or package using the + | #[+api("cli#package") #[code package]] command. + ++code(false, "bash", "$"). + spacy vocab [lang] [output_dir] [lexemes_loc] [vectors_loc] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code lang] + +cell positional + +cell + | Model language + | #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code], + | e.g. #[code en]. + + +row + +cell #[code output_dir] + +cell positional + +cell Model output directory. Will be created if it doesn't exist. + + +row + +cell #[code lexemes_loc] + +cell positional + +cell + | Location of lexical data in spaCy's + | #[+a("/api/annotation#vocab-jsonl") JSONL format]. + + +row + +cell #[code vectors_loc] + +cell positional + +cell Optional location of vectors data as numpy #[code .npz] file. + + +row("foot") + +cell creates + +cell model + +cell A spaCy model containing the vocab and vectors. + ++h(3, "evaluate") Evaluate + +tag-new(2) + +p + | Evaluate a model's accuracy and speed on JSON-formatted annotated data. + | Will print the results and optionally export + | #[+a("/usage/visualizers") displaCy visualizations] of a sample set of + | parses to #[code .html] files. Visualizations for the dependency parse + | and NER will be exported as separate files if the respective component + | is present in the model's pipeline. + ++code(false, "bash", "$", false, false, true). 
+ spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] [--gpu-id] [--gold-preproc] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code model] + +cell positional + +cell + | Model to evaluate. Can be a package or shortcut link name, or a + | path to a model data directory. + + +row + +cell #[code data_path] + +cell positional + +cell Location of JSON-formatted evaluation data. + + +row + +cell #[code --displacy-path], #[code -dp] + +cell option + +cell + | Directory to output rendered parses as HTML. If not set, no + | visualizations will be generated. + + +row + +cell #[code --displacy-limit], #[code -dl] + +cell option + +cell + | Number of parses to generate per file. Defaults to #[code 25]. + | Keep in mind that a significantly higher number might cause the + | #[code .html] files to render slowly. + + +row + +cell #[code --gpu-id], #[code -g] + +cell option + +cell GPU to use, if any. Defaults to #[code -1] for CPU. + + +row + +cell #[code --gold-preproc], #[code -G] + +cell flag + +cell Use gold preprocessing. + + +row("foot") + +cell prints / creates + +cell #[code stdout], HTML + +cell Training results and optional displaCy visualizations. + + ++h(3, "package") Package + +p + | Generate a #[+a("/usage/training#models-generating") model Python package] + | from an existing model data directory. All data files are copied over. + | If the path to a #[code meta.json] is supplied, or a #[code meta.json] is + | found in the input directory, this file is used. Otherwise, the data can + | be entered directly from the command line. The required file templates + | are downloaded from + | #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make + | sure you're always using the latest versions. This means you need to be + | connected to the internet to use this command. After packaging, you + | can run #[code python setup.py sdist] from the newly created directory + | to turn your model into an installable archive file. + ++code(false, "bash", "$", false, false, true). + spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] + ++aside-code("Example", "bash"). + spacy package /input /output + cd /output/en_model-0.0.0 + python setup.py sdist + pip install dist/en_model-0.0.0.tar.gz + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code input_dir] + +cell positional + +cell Path to directory containing model data. + + +row + +cell #[code output_dir] + +cell positional + +cell Directory to create package folder in. + + +row + +cell #[code --meta-path], #[code -m] + +cell option + +cell #[+tag-new(2)] Path to #[code meta.json] file (optional). + + +row + +cell #[code --create-meta], #[code -c] + +cell flag + +cell + | #[+tag-new(2)] Create a #[code meta.json] file on the command + | line, even if one already exists in the directory. If an + | existing file is found, its entries will be shown as the defaults + | in the command line prompt. + +row + +cell #[code --force], #[code -f] + +cell flag + +cell Force overwriting of existing folder in output directory. + + +row + +cell #[code --help], #[code -h] + +cell flag + +cell Show help message and available arguments. + + +row("foot") + +cell creates + +cell directory + +cell A Python package containing the spaCy model. 
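As a sketch of where the package command leads, using the example package name en_model from the aside above: once the archive has been built with python setup.py sdist and installed with pip, the packaged model loads like any other model package:

    import spacy

    # en_model is the illustrative package name used in the packaging example
    nlp = spacy.load('en_model')
    doc = nlp(u'This is a sentence.')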
diff --git a/website/api/dependencyparser.jade b/website/api/dependencyparser.jade new file mode 100644 index 000000000..e557ef9da --- /dev/null +++ b/website/api/dependencyparser.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > API > DEPENDENCYPARSER + +include ../_includes/_mixins + +//- This class inherits from Pipe, so this page uses the template in pipe.jade. +!=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" }) diff --git a/website/docs/api/doc.jade b/website/api/doc.jade similarity index 76% rename from website/docs/api/doc.jade rename to website/api/doc.jade index 7fbbcce97..ac91ad427 100644 --- a/website/docs/api/doc.jade +++ b/website/api/doc.jade @@ -1,8 +1,6 @@ //- 💫 DOCS > API > DOC -include ../../_includes/_mixins - -p A container for accessing linguistic annotations. +include ../_includes/_mixins p | A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects. @@ -47,7 +45,7 @@ p | subsequent space. Must have the same length as #[code words], if | specified. Defaults to a sequence of #[code True]. - +footrow + +row("foot") +cell returns +cell #[code Doc] +cell The newly constructed object. @@ -73,7 +71,7 @@ p +cell int +cell The index of the token. - +footrow + +row("foot") +cell returns +cell #[code Token] +cell The token at #[code doc[i]]. @@ -96,7 +94,7 @@ p +cell tuple +cell The slice of the document to get. - +footrow + +row("foot") +cell returns +cell #[code Span] +cell The span at #[code doc[start : end]]. @@ -120,7 +118,7 @@ p | from Cython. +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Token] +cell A #[code Token] object. @@ -135,11 +133,114 @@ p Get the number of tokens in the document. assert len(doc) == 7 +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell int +cell The number of tokens in the document. ++h(2, "set_extension") Doc.set_extension + +tag classmethod + +tag-new(2) + +p + | Define a custom attribute on the #[code Doc] which becomes available via + | #[code Doc._]. For details, see the documentation on + | #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. + ++aside-code("Example"). + from spacy.tokens import Doc + city_getter = lambda doc: doc.text in ('New York', 'Paris', 'Berlin') + Doc.set_extension('has_city', getter=city_getter) + doc = nlp(u'I like New York') + assert doc._.has_city + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Name of the attribute to set by the extension. For example, + | #[code 'my_attr'] will be available as #[code doc._.my_attr]. + + +row + +cell #[code default] + +cell - + +cell + | Optional default value of the attribute if no getter or method + | is defined. + + +row + +cell #[code method] + +cell callable + +cell + | Set a custom method on the object, for example + | #[code doc._.compare(other_doc)]. + + +row + +cell #[code getter] + +cell callable + +cell + | Getter function that takes the object and returns an attribute + | value. Is called when the user accesses the #[code ._] attribute. + + +row + +cell #[code setter] + +cell callable + +cell + | Setter function that takes the #[code Doc] and a value, and + | modifies the object. Is called when the user writes to the + | #[code Doc._] attribute. + ++h(2, "get_extension") Doc.get_extension + +tag classmethod + +tag-new(2) + +p + | Look up a previously registered extension by name. 
Returns a 4-tuple + | #[code.u-break (default, method, getter, setter)] if the extension is + | registered. Raises a #[code KeyError] otherwise. + ++aside-code("Example"). + from spacy.tokens import Doc + Doc.set_extension('is_city', default=False) + extension = Doc.get_extension('is_city') + assert extension == (False, None, None, None) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension. + + +row("foot") + +cell returns + +cell tuple + +cell + | A #[code.u-break (default, method, getter, setter)] tuple of the + | extension. + ++h(2, "has_extension") Doc.has_extension + +tag classmethod + +tag-new(2) + +p Check whether an extension has been registered on the #[code Doc] class. + ++aside-code("Example"). + from spacy.tokens import Doc + Doc.set_extension('is_city', default=False) + assert Doc.has_extension('is_city') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension to check. + + +row("foot") + +cell returns + +cell bool + +cell Whether the extension has been registered. + +h(2, "char_span") Doc.char_span +tag method +tag-new(2) @@ -172,7 +273,7 @@ p Create a #[code Span] object from the slice #[code doc.text[start : end]]. +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A meaning representation of the span. - +footrow + +row("foot") +cell returns +cell #[code Span] +cell The newly constructed object. @@ -200,7 +301,7 @@ p | The object to compare with. By default, accepts #[code Doc], | #[code Span], #[code Token] and #[code Lexeme] objects. - +footrow + +row("foot") +cell returns +cell float +cell A scalar similarity score. Higher is more similar. @@ -226,37 +327,69 @@ p +cell int +cell The attribute ID - +footrow + +row("foot") +cell returns +cell dict +cell A dictionary mapping attributes to integer counts. ++h(2, "get_lca_matrix") Doc.get_lca_matrix + +tag method + +p + | Calculates the lowest common ancestor matrix for a given #[code Doc]. + | Returns LCA matrix containing the integer index of the ancestor, or + | #[code -1] if no common ancestor is found, e.g. if span excludes a + | necessary ancestor. + ++aside-code("Example"). + doc = nlp(u"This is a test") + matrix = doc.get_lca_matrix() + # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32) + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] + +cell The lowest common ancestor matrix of the #[code Doc]. + +h(2, "to_array") Doc.to_array +tag method p - | Export the document annotations to a numpy array of shape #[code N*M] - | where #[code N] is the length of the document and #[code M] is the number - | of attribute IDs to export. The values will be 32-bit integers. + | Export given token attributes to a numpy #[code ndarray]. + | If #[code attr_ids] is a sequence of #[code M] attributes, + | the output array will be of shape #[code (N, M)], where #[code N] + | is the length of the #[code Doc] (in tokens). If #[code attr_ids] is + | a single attribute, the output shape will be #[code (N,)]. You can + | specify attributes by integer ID (e.g. #[code spacy.attrs.LEMMA]) + | or string name (e.g. 'LEMMA' or 'lemma'). The values will be 64-bit + | integers. +aside-code("Example"). 
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA doc = nlp(text) # All strings mapped to integers, for easy export to numpy np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) + np_array = doc.to_array("POS") +table(["Name", "Type", "Description"]) +row +cell #[code attr_ids] - +cell list - +cell A list of attribute ID ints. + +cell list or int or string + +cell + | A list of attributes (int IDs or string names) or + | a single attribute (int ID or string name) - +footrow + +row("foot") +cell returns - +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] + +cell + | #[code.u-break numpy.ndarray[ndim=2, dtype='uint64']] or + | #[code.u-break numpy.ndarray[ndim=1, dtype='uint64']] or +cell | The exported attributes as a 2D numpy array, with one row per - | token and one column per attribute. + | token and one column per attribute (when #[code attr_ids] is a + | list), or as a 1D numpy array, with one item per attribute (when + | #[code attr_ids] is a single value). +h(2, "from_array") Doc.from_array +tag method @@ -285,7 +418,7 @@ p +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] +cell The attribute values to load. - +footrow + +row("foot") +cell returns +cell #[code Doc] +cell Itself. @@ -326,7 +459,7 @@ p Loads state from a directory. Modifies the object in place and returns it. | A path to a directory. Paths may be either strings or | #[code Path]-like objects. - +footrow + +row("foot") +cell returns +cell #[code Doc] +cell The modified #[code Doc] object. @@ -341,7 +474,7 @@ p Serialize, i.e. export the document contents to a binary string. doc_bytes = doc.to_bytes() +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell bytes +cell @@ -367,7 +500,7 @@ p Deserialize, i.e. import the document contents from a binary string. +cell bytes +cell The string to load from. - +footrow + +row("foot") +cell returns +cell #[code Doc] +cell The #[code Doc] object. @@ -378,7 +511,7 @@ p Deserialize, i.e. import the document contents from a binary string. p | Retokenize the document, such that the span at | #[code doc.text[start_idx : end_idx]] is merged into a single token. If - | #[code start_idx] and #[end_idx] do not mark start and end token + | #[code start_idx] and #[code end_idx] do not mark start and end token | boundaries, the document remains unchanged. +aside-code("Example"). @@ -405,7 +538,7 @@ p | attributes are inherited from the syntactic root token of | the span. - +footrow + +row("foot") +cell returns +cell #[code Token] +cell @@ -440,7 +573,7 @@ p +cell bool +cell Don't include arcs or modifiers. - +footrow + +row("foot") +cell returns +cell dict +cell Parse tree as dict. @@ -462,7 +595,7 @@ p assert ents[0].text == 'Mr. Best' +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Span] +cell Entities in the document. @@ -485,7 +618,7 @@ p assert chunks[1].text == "another phrase" +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Span] +cell Noun chunks in the document. @@ -507,7 +640,7 @@ p assert [s.root.text for s in sents] == ["is", "'s"] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Span] +cell Sentences in the document. @@ -525,7 +658,7 @@ p assert doc.has_vector +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell bool +cell Whether the document has a vector data attached. 
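A small sketch of the serialization round trip documented above (Doc.to_bytes and Doc.from_bytes), assuming an installed English model:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')
    doc = nlp(u'Give it back! He pleaded.')
    data = doc.to_bytes()

    # restore the annotations into a fresh Doc that shares the same vocab
    doc2 = Doc(doc.vocab).from_bytes(data)
    assert doc.text == doc2.text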
@@ -544,7 +677,7 @@ p assert doc.vector.shape == (300,) +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the document's semantics. @@ -564,7 +697,7 @@ p assert doc1.vector_norm != doc2.vector_norm +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell float +cell The L2 norm of the vector representation. @@ -651,3 +784,10 @@ p +cell | A dictionary that allows customisation of properties of | #[code Span] children. + + +row + +cell #[code _] + +cell #[code Underscore] + +cell + | User space for adding custom + | #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions]. diff --git a/website/api/entityrecognizer.jade b/website/api/entityrecognizer.jade new file mode 100644 index 000000000..a8b68e453 --- /dev/null +++ b/website/api/entityrecognizer.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > API > ENTITYRECOGNIZER + +include ../_includes/_mixins + +//- This class inherits from Pipe, so this page uses the template in pipe.jade. +!=partial("pipe", { subclass: "EntityRecognizer", short: "ner", pipeline_id: "ner" }) diff --git a/website/docs/api/goldcorpus.jade b/website/api/goldcorpus.jade similarity index 71% rename from website/docs/api/goldcorpus.jade rename to website/api/goldcorpus.jade index 3b3d92823..0f7105f65 100644 --- a/website/docs/api/goldcorpus.jade +++ b/website/api/goldcorpus.jade @@ -1,14 +1,12 @@ //- 💫 DOCS > API > GOLDCORPUS -include ../../_includes/_mixins +include ../_includes/_mixins p - | An annotated corpus, using the JSON file format. Manages annotations for - | tagging, dependency parsing and NER. + | This class manages annotations for tagging, dependency parsing and NER. +h(2, "init") GoldCorpus.__init__ +tag method - +tag-new(2) p Create a #[code GoldCorpus]. diff --git a/website/docs/api/goldparse.jade b/website/api/goldparse.jade similarity index 95% rename from website/docs/api/goldparse.jade rename to website/api/goldparse.jade index 03118343d..c27badee9 100644 --- a/website/docs/api/goldparse.jade +++ b/website/api/goldparse.jade @@ -1,6 +1,6 @@ //- 💫 DOCS > API > GOLDPARSE -include ../../_includes/_mixins +include ../_includes/_mixins p Collection for training annotations. @@ -40,7 +40,7 @@ p Create a #[code GoldParse]. +cell iterable +cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions. - +footrow + +row("foot") +cell returns +cell #[code GoldParse] +cell The newly constructed object. @@ -51,7 +51,7 @@ p Create a #[code GoldParse]. p Get the number of gold-standard tokens. +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell int +cell The number of gold-standard tokens. @@ -64,7 +64,7 @@ p | tree. +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell bool +cell Whether annotations form projective tree. @@ -119,7 +119,7 @@ p p | Encode labelled spans into per-token tags, using the - | #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out). + | #[+a("/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out). p | Returns a list of unicode strings, describing the tags. Each tag string @@ -157,11 +157,11 @@ p | and #[code end] should be character-offset integers denoting the | slice into the original string. 
- +footrow + +row("foot") +cell returns +cell list +cell | Unicode strings, describing the - | #[+a("/docs/api/annotation#biluo") BILUO] tags. + | #[+a("/api/annotation#biluo") BILUO] tags. diff --git a/website/api/index.jade b/website/api/index.jade new file mode 100644 index 000000000..8035c9ff5 --- /dev/null +++ b/website/api/index.jade @@ -0,0 +1,14 @@ +//- 💫 DOCS > API > ARCHITECTURE + +include ../_includes/_mixins + ++section("basics") + include ../usage/_spacy-101/_architecture + ++section("nn-model") + +h(2, "nn-model") Neural network model architecture + include _architecture/_nn-model + ++section("cython") + +h(2, "cython") Cython conventions + include _architecture/_cython diff --git a/website/docs/api/language.jade b/website/api/language.jade similarity index 55% rename from website/docs/api/language.jade rename to website/api/language.jade index 69665ee9d..f86257f38 100644 --- a/website/docs/api/language.jade +++ b/website/api/language.jade @@ -1,10 +1,17 @@ //- 💫 DOCS > API > LANGUAGE -include ../../_includes/_mixins +include ../_includes/_mixins p - | A text-processing pipeline. Usually you'll load this once per process, - | and pass the instance around your application. + | Usually you'll load this once per process as #[code nlp] and pass the + | instance around your application. The #[code Language] class is created + | when you call #[+api("spacy#load") #[code spacy.load()]] and contains + | the shared vocabulary and #[+a("/usage/adding-languages") language data], + | optional model data loaded from a #[+a("/models") model package] or + | a path, and a #[+a("/usage/processing-pipelines") processing pipeline] + | containing components like the tagger or parser that are called on a + | document in order. You can also add your own processing pipeline + | components that take a #[code Doc] object, modify it and return it. +h(2, "init") Language.__init__ +tag method @@ -12,9 +19,9 @@ p p Initialise a #[code Language] object. +aside-code("Example"). + from spacy.vocab import Vocab from spacy.language import Language - nlp = Language(pipeline=['token_vectors', 'tags', - 'dependencies']) + nlp = Language(Vocab()) from spacy.lang.en import English nlp = English() @@ -34,14 +41,6 @@ p Initialise a #[code Language] object. | A function that takes text and returns a #[code Doc] object. | Usually a #[code Tokenizer]. - +row - +cell #[code pipeline] - +cell list - +cell - | A list of annotation processes or IDs of annotation, processes, - | e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked - | up in #[code Language.Defaults.factories]. - +row +cell #[code meta] +cell dict @@ -49,7 +48,7 @@ p Initialise a #[code Language] object. | Custom meta data for the #[code Language] class. Is written to by | models to add model meta data. - +footrow + +row("foot") +cell returns +cell #[code Language] +cell The newly constructed object. @@ -77,14 +76,14 @@ p +cell list +cell | Names of pipeline components to - | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. + | #[+a("/usage/processing-pipelines#disabling") disable]. - +footrow + +row("foot") +cell returns +cell #[code Doc] +cell A container for accessing the annotations. 
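A brief sketch of calling the pipeline with the disable argument documented above, assuming an installed English model with the default components:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'An example sentence.')

    # skip the dependency parser for this call only
    doc = nlp(u'No parse needed here.', disable=['parser'])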
-+infobox("⚠️ Deprecation note") ++infobox("Deprecation note", "⚠️") .o-block | Pipeline components to prevent from being loaded can now be added as | a list to #[code disable], instead of specifying one keyword argument @@ -136,9 +135,9 @@ p +cell list +cell | Names of pipeline components to - | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. + | #[+a("/usage/processing-pipelines#disabling") disable]. - +footrow + +row("foot") +cell yields +cell #[code Doc] +cell Documents in the order of the original text. @@ -175,7 +174,7 @@ p Update the models in the pipeline. +cell callable +cell An optimizer. - +footrow + +row("foot") +cell returns +cell dict +cell Results from the update. @@ -200,7 +199,7 @@ p +cell - +cell Config parameters. - +footrow + +row("foot") +cell yields +cell tuple +cell An optimizer. @@ -230,23 +229,249 @@ p +cell Config parameters. +h(2, "preprocess_gold") Language.preprocess_gold + +tag method p | Can be called before training to pre-process gold data. By default, it | handles nonprojectivity and adds missing tags to the tag map. - +table(["Name", "Type", "Description"]) +row +cell #[code docs_golds] +cell iterable +cell Tuples of #[code Doc] and #[code GoldParse] objects. - +footrow + +row("foot") +cell yields +cell tuple +cell Tuples of #[code Doc] and #[code GoldParse] objects. ++h(2, "create_pipe") Language.create_pipe + +tag method + +tag-new(2) + +p Create a pipeline component from a factory. + ++aside-code("Example"). + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Factory name to look up in + | #[+api("language#class-attributes") #[code Language.factories]]. + + +row + +cell #[code config] + +cell dict + +cell Configuration parameters to initialise component. + + +row("foot") + +cell returns + +cell callable + +cell The pipeline component. + ++h(2, "add_pipe") Language.add_pipe + +tag method + +tag-new(2) + +p + | Add a component to the processing pipeline. Valid components are + | callables that take a #[code Doc] object, modify it and return it. Only + | one of #[code before], #[code after], #[code first] or #[code last] can + | be set. Default behaviour is #[code last=True]. + ++aside-code("Example"). + def component(doc): + # modify Doc and return it + return doc + + nlp.add_pipe(component, before='ner') + nlp.add_pipe(component, name='custom_name', last=True) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code component] + +cell callable + +cell The pipeline component. + + +row + +cell #[code name] + +cell unicode + +cell + | Name of pipeline component. Overwrites existing + | #[code component.name] attribute if available. If no #[code name] + | is set and the component exposes no name attribute, + | #[code component.__name__] is used. An error is raised if the + | name already exists in the pipeline. + + +row + +cell #[code before] + +cell unicode + +cell Component name to insert component directly before. + + +row + +cell #[code after] + +cell unicode + +cell Component name to insert component directly after: + + +row + +cell #[code first] + +cell bool + +cell Insert component first / not first in the pipeline. + + +row + +cell #[code last] + +cell bool + +cell Insert component last / not last in the pipeline. + ++h(2, "has_pipe") Language.has_pipe + +tag method + +tag-new(2) + +p + | Check whether a component is present in the pipeline. Equivalent to + | #[code name in nlp.pipe_names]. 
+ ++aside-code("Example"). + nlp.add_pipe(lambda doc: doc, name='component') + assert 'component' in nlp.pipe_names + assert nlp.has_pipe('component') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the pipeline component to check. + + +row("foot") + +cell returns + +cell bool + +cell Whether a component of that name exists in the pipeline. + ++h(2, "get_pipe") Language.get_pipe + +tag method + +tag-new(2) + +p Get a pipeline component for a given component name. + ++aside-code("Example"). + parser = nlp.get_pipe('parser') + custom_component = nlp.get_pipe('custom_component') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the pipeline component to get. + + +row("foot") + +cell returns + +cell callable + +cell The pipeline component. + ++h(2, "replace_pipe") Language.replace_pipe + +tag method + +tag-new(2) + +p Replace a component in the pipeline. + ++aside-code("Example"). + nlp.replace_pipe('parser', my_custom_parser) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the component to replace. + + +row + +cell #[code component] + +cell callable + +cell The pipeline component to inser. + + ++h(2, "rename_pipe") Language.rename_pipe + +tag method + +tag-new(2) + +p + | Rename a component in the pipeline. Useful to create custom names for + | pre-defined and pre-loaded components. To change the default name of + | a component added to the pipeline, you can also use the #[code name] + | argument on #[+api("language#add_pipe") #[code add_pipe]]. + ++aside-code("Example"). + nlp.rename_pipe('parser', 'spacy_parser') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code old_name] + +cell unicode + +cell Name of the component to rename. + + +row + +cell #[code new_name] + +cell unicode + +cell New name of the component. + ++h(2, "remove_pipe") Language.remove_pipe + +tag method + +tag-new(2) + +p + | Remove a component from the pipeline. Returns the removed component name + | and component function. + ++aside-code("Example"). + name, component = nlp.remove_pipe('parser') + assert name == 'parser' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the component to remove. + + +row("foot") + +cell returns + +cell tuple + +cell A #[code (name, component)] tuple of the removed component. + ++h(2, "disable_pipes") Language.disable_pipes + +tag contextmanager + +tag-new(2) + +p + | Disable one or more pipeline components. If used as a context manager, + | the pipeline will be restored to the initial state at the end of the + | block. Otherwise, a #[code DisabledPipes] object is returned, that has a + | #[code .restore()] method you can use to undo your changes. + ++aside-code("Example"). + with nlp.disable_pipes('tagger', 'parser'): + optimizer = nlp.begin_training(gold_tuples) + + disabled = nlp.disable_pipes('tagger', 'parser') + optimizer = nlp.begin_training(gold_tuples) + disabled.restore() + ++table(["Name", "Type", "Description"]) + +row + +cell #[code *disabled] + +cell unicode + +cell Names of pipeline components to disable. + + +row("foot") + +cell returns + +cell #[code DisabledPipes] + +cell + | The disabled pipes that can be restored by calling the object's + | #[code .restore()] method. 
+ +h(2, "to_disk") Language.to_disk +tag method +tag-new(2) @@ -271,7 +496,7 @@ p +cell list +cell | Names of pipeline components to - | #[+a("/docs/usage/language-processing-pipeline#disabling") disable] + | #[+a("/usage/processing-pipelines#disabling") disable] | and prevent from being saved. +h(2, "from_disk") Language.from_disk @@ -300,14 +525,14 @@ p +cell list +cell | Names of pipeline components to - | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. + | #[+a("/usage/processing-pipelines#disabling") disable]. - +footrow + +row("foot") +cell returns +cell #[code Language] +cell The modified #[code Language] object. -+infobox("⚠️ Deprecation note") ++infobox("Deprecation note", "⚠️") .o-block | As of spaCy v2.0, the #[code save_to_directory] method has been | renamed to #[code to_disk], to improve consistency across classes. @@ -332,10 +557,10 @@ p Serialize the current state to a binary string. +cell list +cell | Names of pipeline components to - | #[+a("/docs/usage/language-processing-pipeline#disabling") disable] + | #[+a("/usage/processing-pipelines#disabling") disable] | and prevent from being serialized. - +footrow + +row("foot") +cell returns +cell bytes +cell The serialized form of the #[code Language] object. @@ -362,14 +587,14 @@ p Load state from a binary string. +cell list +cell | Names of pipeline components to - | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. + | #[+a("/usage/processing-pipelines#disabling") disable]. - +footrow + +row("foot") +cell returns +cell #[code Language] +cell The #[code Language] object. -+infobox("⚠️ Deprecation note") ++infobox("Deprecation note", "⚠️") .o-block | Pipeline components to prevent from being loaded can now be added as | a list to #[code disable], instead of specifying one keyword argument @@ -399,7 +624,15 @@ p Load state from a binary string. +row +cell #[code pipeline] +cell list - +cell Sequence of annotation functions. + +cell + | List of #[code (name, component)] tuples describing the current + | processing pipeline, in order. + + +row + +cell #[code pipe_names] + +tag-new(2) + +cell list + +cell List of pipeline component names, in order. +row +cell #[code meta] @@ -408,6 +641,14 @@ p Load state from a binary string. | Custom meta data for the Language class. If a model is loaded, | contains meta data of the model. + +row + +cell #[code path] + +tag-new(2) + +cell #[code Path] + +cell + | Path to the model data directory, if a model is loaded. Otherwise + | #[code None]. + +h(2, "class-attributes") Class attributes +table(["Name", "Type", "Description"]) @@ -424,3 +665,12 @@ p Load state from a binary string. +cell | Two-letter language ID, i.e. | #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. + + +row + +cell #[code factories] + +tag-new(2) + +cell dict + +cell + | Factories that create pre-defined pipeline components, e.g. the + | tagger, parser or entity recognizer, keyed by their component + | name. diff --git a/website/api/lemmatizer.jade b/website/api/lemmatizer.jade new file mode 100644 index 000000000..eb061f10a --- /dev/null +++ b/website/api/lemmatizer.jade @@ -0,0 +1,160 @@ +//- 💫 DOCS > API > LEMMATIZER + +include ../_includes/_mixins + +p + | The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix + | rules and lookup tables. + ++h(2, "init") Lemmatizer.__init__ + +tag method + +p Create a #[code Lemmatizer]. + ++aside-code("Example"). 
+ from spacy.lemmatizer import Lemmatizer + lemmatizer = Lemmatizer() + ++table(["Name", "Type", "Description"]) + +row + +cell #[code index] + +cell dict / #[code None] + +cell Inventory of lemmas in the language. + + +row + +cell #[code exceptions] + +cell dict / #[code None] + +cell Mapping of string forms to lemmas that bypass the #[code rules]. + + +row + +cell #[code rules] + +cell dict / #[code None] + +cell List of suffix rewrite rules. + + +row + +cell #[code lookup] + +cell dict / #[code None] + +cell Lookup table mapping string to their lemmas. + + +row("foot") + +cell returns + +cell #[code Lemmatizer] + +cell The newly created object. + ++h(2, "call") Lemmatizer.__call__ + +tag method + +p Lemmatize a string. + ++aside-code("Example"). + from spacy.lemmatizer import Lemmatizer + from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES + lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) + lemmas = lemmatizer(u'ducks', u'NOUN') + assert lemmas == [u'duck'] + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to lemmatize, e.g. the token text. + + +row + +cell #[code univ_pos] + +cell unicode / int + +cell The token's universal part-of-speech tag. + + +row + +cell #[code morphology] + +cell dict / #[code None] + +cell + | Morphological features following the + | #[+a("http://universaldependencies.org/") Universal Dependencies] + | scheme. + + +row("foot") + +cell returns + +cell list + +cell The available lemmas for the string. + ++h(2, "lookup") Lemmatizer.lookup + +tag method + +tag-new(2) + +p + | Look up a lemma in the lookup table, if available. If no lemma is found, + | the original string is returned. Languages can provide a + | #[+a("/usage/adding-languages#lemmatizer") lookup table] via the + | #[code lemma_lookup] variable, set on the individual #[code Language] + | class. + ++aside-code("Example"). + lookup = {u'going': u'go'} + lemmatizer = Lemmatizer(lookup=lookup) + assert lemmatizer.lookup(u'going') == u'go' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to look up. + + +row("foot") + +cell returns + +cell unicode + +cell The lemma if the string was found, otherwise the original string. + ++h(2, "is_base_form") Lemmatizer.is_base_form + +tag method + +p + | Check whether we're dealing with an uninflected paradigm, so we can + | avoid lemmatization entirely. + ++aside-code("Example"). + pos = 'verb' + morph = {'VerbForm': 'inf'} + is_base_form = lemmatizer.is_base_form(pos, morph) + assert is_base_form == True + ++table(["Name", "Type", "Description"]) + +row + +cell #[code univ_pos] + +cell unicode / int + +cell The token's universal part-of-speech tag. + + +row + +cell #[code morphology] + +cell dict + +cell The token's morphological features. + + +row("foot") + +cell returns + +cell bool + +cell + | Whether the token's part-of-speech tag and morphological features + | describe a base form. + ++h(2, "attributes") Attributes + ++table(["Name", "Type", "Description"]) + +row + +cell #[code index] + +cell dict / #[code None] + +cell Inventory of lemmas in the language. + + +row + +cell #[code exc] + +cell dict / #[code None] + +cell Mapping of string forms to lemmas that bypass the #[code rules]. + + +row + +cell #[code rules] + +cell dict / #[code None] + +cell List of suffix rewrite rules. + + +row + +cell #[code lookup_table] + +tag-new(2) + +cell dict / #[code None] + +cell The lemma lookup table, if available. 
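To make the constructor arguments above concrete, here is a small sketch showing that entries in the exceptions table bypass the suffix rules, while other forms fall through to the rules. The index, exception and rule data are invented toy values, not spaCy's shipped English tables, and exact behaviour may vary slightly between spaCy versions.

    from spacy.lemmatizer import Lemmatizer

    # toy tables, keyed by universal POS (lowercased)
    index = {'noun': {'goose'}, 'verb': {'strip'}}
    exceptions = {'noun': {'geese': ('goose',)}}
    rules = {'noun': [['s', '']], 'verb': [['ped', '']]}

    lemmatizer = Lemmatizer(index, exceptions, rules)

    # the exception wins, so no suffix rule is applied
    assert lemmatizer(u'geese', u'NOUN') == [u'goose']
    # no exception here, so the suffix rule produces the lemma
    assert lemmatizer(u'stripped', u'VERB') == [u'strip']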
diff --git a/website/docs/api/lexeme.jade b/website/api/lexeme.jade similarity index 73% rename from website/docs/api/lexeme.jade rename to website/api/lexeme.jade index 6e3f68493..86fa18730 100644 --- a/website/docs/api/lexeme.jade +++ b/website/api/lexeme.jade @@ -1,6 +1,6 @@ //- 💫 DOCS > API > LEXEME -include ../../_includes/_mixins +include ../_includes/_mixins p | An entry in the vocabulary. A #[code Lexeme] has no string context – it's @@ -24,7 +24,7 @@ p Create a #[code Lexeme] object. +cell int +cell The orth id of the lexeme. - +footrow + +row("foot") +cell returns +cell #[code Lexeme] +cell The newly constructed object. @@ -65,7 +65,7 @@ p Check the value of a boolean flag. +cell int +cell The attribute ID of the flag to query. - +footrow + +row("foot") +cell returns +cell bool +cell The value of the flag. @@ -91,7 +91,7 @@ p Compute a semantic similarity estimate. Defaults to cosine over vectors. | The object to compare with. By default, accepts #[code Doc], | #[code Span], #[code Token] and #[code Lexeme] objects. - +footrow + +row("foot") +cell returns +cell float +cell A scalar similarity score. Higher is more similar. @@ -110,7 +110,7 @@ p assert apple.has_vector +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell bool +cell Whether the lexeme has a vector data attached. @@ -127,7 +127,7 @@ p A real-valued meaning representation. assert apple.vector.shape == (300,) +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the lexeme's semantics. @@ -146,7 +146,7 @@ p The L2 norm of the lexeme's vector representation. assert apple.vector_norm != pasta.vector_norm +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell float +cell The L2 norm of the vector representation. @@ -157,27 +157,61 @@ p The L2 norm of the lexeme's vector representation. +row +cell #[code vocab] +cell #[code Vocab] - +cell + +cell The lexeme's vocabulary. +row +cell #[code text] +cell unicode +cell Verbatim text content. + +row + +cell #[code orth] + +cell int + +cell ID of the verbatim text content. + + +row + +cell #[code orth_] + +cell unicode + +cell + | Verbatim text content (identical to #[code Lexeme.text]). Existst + | mostly for consistency with the other attributes. + +row +cell #[code lex_id] +cell int +cell ID of the lexeme's lexical type. + +row + +cell #[code rank] + +cell int + +cell + | Sequential ID of the lexemes's lexical type, used to index into + | tables, e.g. for word vectors. + + +row + +cell #[code flags] + +cell int + +cell Container of the lexeme's binary flags. + + +row + +cell #[code norm] + +cell int + +cell The lexemes's norm, i.e. a normalised form of the lexeme text. + + +row + +cell #[code norm_] + +cell unicode + +cell The lexemes's norm, i.e. a normalised form of the lexeme text. + +row +cell #[code lower] +cell int - +cell Lower-case form of the word. + +cell Lowercase form of the word. +row +cell #[code lower_] +cell unicode - +cell Lower-case form of the word. + +cell Lowercase form of the word. +row +cell #[code shape] @@ -192,22 +226,30 @@ p The L2 norm of the lexeme's vector representation. +row +cell #[code prefix] +cell int - +cell Length-N substring from the start of the word. Defaults to #[code N=1]. + +cell + | Length-N substring from the start of the word. Defaults to + | #[code N=1]. 
+row +cell #[code prefix_] +cell unicode - +cell Length-N substring from the start of the word. Defaults to #[code N=1]. + +cell + | Length-N substring from the start of the word. Defaults to + | #[code N=1]. +row +cell #[code suffix] +cell int - +cell Length-N substring from the end of the word. Defaults to #[code N=3]. + +cell + | Length-N substring from the end of the word. Defaults to + | #[code N=3]. +row +cell #[code suffix_] +cell unicode - +cell Length-N substring from the start of the word. Defaults to #[code N=3]. + +cell + | Length-N substring from the start of the word. Defaults to + | #[code N=3]. +row +cell #[code is_alpha] @@ -237,6 +279,13 @@ p The L2 norm of the lexeme's vector representation. | Is the lexeme in lowercase? Equivalent to | #[code lexeme.text.islower()]. + +row + +cell #[code is_upper] + +cell bool + +cell + | Is the lexeme in uppercase? Equivalent to + | #[code lexeme.text.isupper()]. + +row +cell #[code is_title] +cell bool @@ -249,6 +298,16 @@ p The L2 norm of the lexeme's vector representation. +cell bool +cell Is the lexeme punctuation? + +row + +cell #[code is_left_punct] + +cell bool + +cell Is the lexeme a left punctuation mark, e.g. #[code (]? + + +row + +cell #[code is_right_punct] + +cell bool + +cell Is the lexeme a right punctuation mark, e.g. #[code )]? + +row +cell #[code is_space] +cell bool @@ -256,6 +315,16 @@ p The L2 norm of the lexeme's vector representation. | Does the lexeme consist of whitespace characters? Equivalent to | #[code lexeme.text.isspace()]. + +row + +cell #[code is_bracket] + +cell bool + +cell Is the lexeme a bracket? + + +row + +cell #[code is_quote] + +cell bool + +cell Is the lexeme a quotation mark? + +row +cell #[code like_url] +cell bool @@ -285,6 +354,7 @@ p The L2 norm of the lexeme's vector representation. +cell #[code lang] +cell int +cell Language of the parent vocabulary. + +row +cell #[code lang_] +cell unicode @@ -293,9 +363,16 @@ p The L2 norm of the lexeme's vector representation. +row +cell #[code prob] +cell float - +cell Smoothed log probability estimate of lexeme's type. + +cell Smoothed log probability estimate of the lexeme's type. + + +row + +cell #[code cluster] + +cell int + +cell Brown cluster ID. +row +cell #[code sentiment] +cell float - +cell A scalar value indicating the positivity or negativity of the lexeme. + +cell + | A scalar value indicating the positivity or negativity of the + | lexeme. diff --git a/website/docs/api/matcher.jade b/website/api/matcher.jade similarity index 96% rename from website/docs/api/matcher.jade rename to website/api/matcher.jade index 95819e553..35aba4cba 100644 --- a/website/docs/api/matcher.jade +++ b/website/api/matcher.jade @@ -1,10 +1,8 @@ //- 💫 DOCS > API > MATCHER -include ../../_includes/_mixins +include ../_includes/_mixins -p Match sequences of tokens, based on pattern rules. - -+infobox("⚠️ Deprecation note") ++infobox("Deprecation note", "⚠️") | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] | are deprecated and have been replaced with a simpler | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of @@ -39,7 +37,7 @@ p Create the rule-based #[code Matcher]. +cell dict +cell Patterns to add to the matcher, keyed by ID. - +footrow + +row("foot") +cell returns +cell #[code Matcher] +cell The newly constructed object. @@ -64,7 +62,7 @@ p Find all token sequences matching the supplied patterns on the #[code Doc]. +cell #[code Doc] +cell The document to match over. 
- +footrow + +row("foot") +cell returns +cell list +cell @@ -81,7 +79,7 @@ p Find all token sequences matching the supplied patterns on the #[code Doc]. | actions per pattern within the same matcher. For example, you might only | want to merge some entity types, and set custom flags for other matched | patterns. For more details and examples, see the usage guide on - | #[+a("/docs/usage/rule-based-matching") rule-based matching]. + | #[+a("/usage/linguistic-features#rule-based-matching") rule-based matching]. +h(2, "pipe") Matcher.pipe +tag method @@ -113,7 +111,7 @@ p Match a stream of documents, yielding them in turn. | parallel, if the #[code Matcher] implementation supports | multi-threading. - +footrow + +row("foot") +cell yields +cell #[code Doc] +cell Documents, in order. @@ -134,7 +132,7 @@ p assert len(matcher) == 1 +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell int +cell The number of rules. @@ -156,7 +154,8 @@ p Check whether the matcher contains rules for a match ID. +cell #[code key] +cell unicode +cell The match ID. - +footrow + + +row("foot") +cell returns +cell int +cell Whether the matcher contains rules for this match ID. @@ -203,7 +202,7 @@ p | Match pattern. A pattern consists of a list of dicts, where each | dict describes a token. -+infobox("⚠️ Deprecation note") ++infobox("Deprecation note", "⚠️") .o-block | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] | are deprecated and have been replaced with a simpler @@ -257,7 +256,7 @@ p +cell unicode +cell The ID of the match rule. - +footrow + +row("foot") +cell returns +cell tuple +cell The rule, as an #[code (on_match, patterns)] tuple. diff --git a/website/api/phrasematcher.jade b/website/api/phrasematcher.jade new file mode 100644 index 000000000..223ec11f9 --- /dev/null +++ b/website/api/phrasematcher.jade @@ -0,0 +1,181 @@ +//- 💫 DOCS > API > PHRASEMATCHER + +include ../_includes/_mixins + +p + | The #[code PhraseMatcher] lets you efficiently match large terminology + | lists. While the #[+api("matcher") #[code Matcher]] lets you match + | sequences based on lists of token descriptions, the #[code PhraseMatcher] + | accepts match patterns in the form of #[code Doc] objects. + ++h(2, "init") PhraseMatcher.__init__ + +tag method + +p Create the rule-based #[code PhraseMatcher]. + ++aside-code("Example"). + from spacy.matcher import PhraseMatcher + matcher = PhraseMatcher(nlp.vocab, max_length=6) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code vocab] + +cell #[code Vocab] + +cell + | The vocabulary object, which must be shared with the documents + | the matcher will operate on. + + +row + +cell #[code max_length] + +cell int + +cell Maximum length of a phrase pattern to add. + + +row("foot") + +cell returns + +cell #[code PhraseMatcher] + +cell The newly constructed object. + ++h(2, "call") PhraseMatcher.__call__ + +tag method + +p Find all token sequences matching the supplied patterns on the #[code Doc]. + ++aside-code("Example"). + from spacy.matcher import PhraseMatcher + + matcher = PhraseMatcher(nlp.vocab) + matcher.add('OBAMA', None, nlp(u"Barack Obama")) + doc = nlp(u"Barack Obama lifts America one last time in emotional farewell") + matches = matcher(doc) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell The document to match over. + + +row("foot") + +cell returns + +cell list + +cell + | A list of #[code (match_id, start, end)] tuples, describing the + | matches.
A match tuple describes a span #[code doc[start:end]]. + | The #[code match_id] is the ID of the added match pattern. + ++h(2, "pipe") PhraseMatcher.pipe + +tag method + +p Match a stream of documents, yielding them in turn. + ++aside-code("Example"). + from spacy.matcher import PhraseMatcher + matcher = PhraseMatcher(nlp.vocab) + for doc in matcher.pipe(texts, batch_size=50, n_threads=4): + pass + ++table(["Name", "Type", "Description"]) + +row + +cell #[code docs] + +cell iterable + +cell A stream of documents. + + +row + +cell #[code batch_size] + +cell int + +cell The number of documents to accumulate into a working set. + + +row + +cell #[code n_threads] + +cell int + +cell + | The number of threads with which to work on the buffer in + | parallel, if the #[code PhraseMatcher] implementation supports + | multi-threading. + + +row("foot") + +cell yields + +cell #[code Doc] + +cell Documents, in order. + ++h(2, "len") PhraseMatcher.__len__ + +tag method + +p + | Get the number of rules added to the matcher. Note that this only returns + | the number of rules (identical with the number of IDs), not the number + | of individual patterns. + ++aside-code("Example"). + matcher = PhraseMatcher(nlp.vocab) + assert len(matcher) == 0 + matcher.add('OBAMA', None, nlp(u"Barack Obama")) + assert len(matcher) == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of rules. + ++h(2, "contains") PhraseMatcher.__contains__ + +tag method + +p Check whether the matcher contains rules for a match ID. + ++aside-code("Example"). + matcher = PhraseMatcher(nlp.vocab) + assert 'OBAMA' not in matcher + matcher.add('OBAMA', None, nlp(u"Barack Obama")) + assert 'OBAMA' in matcher + ++table(["Name", "Type", "Description"]) + +row + +cell #[code key] + +cell unicode + +cell The match ID. + + +row("foot") + +cell returns + +cell int + +cell Whether the matcher contains rules for this match ID. + ++h(2, "add") PhraseMatcher.add + +tag method + +p + | Add a rule to the matcher, consisting of an ID key, one or more patterns, and + | a callback function to act on the matches. The callback function will + | receive the arguments #[code matcher], #[code doc], #[code i] and + | #[code matches]. If a pattern already exists for the given ID, the + | patterns will be extended. An #[code on_match] callback will be + | overwritten. + ++aside-code("Example"). + def on_match(matcher, doc, id, matches): + print('Matched!', matches) + + matcher = PhraseMatcher(nlp.vocab) + matcher.add('OBAMA', on_match, nlp(u"Barack Obama")) + matcher.add('HEALTH', on_match, nlp(u"health care reform"), + nlp(u"healthcare reform")) + doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms") + matches = matcher(doc) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code match_id] + +cell unicode + +cell An ID for the thing you're matching. + + +row + +cell #[code on_match] + +cell callable or #[code None] + +cell + | Callback function to act on matches. Takes the arguments + | #[code matcher], #[code doc], #[code i] and #[code matches]. + + +row + +cell #[code *docs] + +cell list + +cell + | #[code Doc] objects of the phrases to match. diff --git a/website/api/pipe.jade b/website/api/pipe.jade new file mode 100644 index 000000000..c2afbde12 --- /dev/null +++ b/website/api/pipe.jade @@ -0,0 +1,405 @@ +//- 💫 DOCS > API > PIPE + +include ../_includes/_mixins + +//- This page can be used as a template for all other classes that inherit +//- from `Pipe`. 
+ +if subclass + +infobox + | This class is a subclass of #[+api("pipe") #[code Pipe]] and + | follows the same API. The pipeline component is available in the + | #[+a("/usage/processing-pipelines") processing pipeline] via the ID + | #[code "#{pipeline_id}"]. + +else + p + | This class is not instantiated directly. Components inherit from it, + | and it defines the interface that components should follow to + | function as components in a spaCy analysis pipeline. + +- CLASSNAME = subclass || 'Pipe' +- VARNAME = short || CLASSNAME.toLowerCase() + + ++h(2, "model") #{CLASSNAME}.Model + +tag classmethod + +p + | Initialise a model for the pipe. The model should implement the + | #[code thinc.neural.Model] API. Wrappers are available for + | #[+a("/usage/deep-learning") most major machine learning libraries]. + ++table(["Name", "Type", "Description"]) + +row + +cell #[code **kwargs] + +cell - + +cell Parameters for initialising the model + + +row("foot") + +cell returns + +cell object + +cell The initialised model. + ++h(2, "init") #{CLASSNAME}.__init__ + +tag method + +p Create a new pipeline instance. + ++aside-code("Example"). + from spacy.pipeline import #{CLASSNAME} + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code vocab] + +cell #[code Vocab] + +cell The shared vocabulary. + + +row + +cell #[code model] + +cell #[code thinc.neural.Model] or #[code True] + +cell + | The model powering the pipeline component. If no model is + | supplied, the model is created when you call + | #[code begin_training], #[code from_disk] or #[code from_bytes]. + + +row + +cell #[code **cfg] + +cell - + +cell Configuration parameters. + + +row("foot") + +cell returns + +cell #[code=CLASSNAME] + +cell The newly constructed object. + ++h(2, "call") #{CLASSNAME}.__call__ + +tag method + +p + | Apply the pipe to one document. The document is modified in place, and + | returned. Both #[code #{CLASSNAME}.__call__] and + | #[code #{CLASSNAME}.pipe] should delegate to the + | #[code #{CLASSNAME}.predict] and #[code #{CLASSNAME}.set_annotations] + | methods. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + doc = nlp(u"This is a sentence.") + processed = #{VARNAME}(doc) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell The document to process. + + +row("foot") + +cell returns + +cell #[code Doc] + +cell The processed document. + ++h(2, "pipe") #{CLASSNAME}.pipe + +tag method + +p + | Apply the pipe to a stream of documents. Both + | #[code #{CLASSNAME}.__call__] and #[code #{CLASSNAME}.pipe] should + | delegate to the #[code #{CLASSNAME}.predict] and + | #[code #{CLASSNAME}.set_annotations] methods. + ++aside-code("Example"). + texts = [u'One doc', u'...', u'Lots of docs'] + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + for doc in #{VARNAME}.pipe(texts, batch_size=50): + pass + ++table(["Name", "Type", "Description"]) + +row + +cell #[code stream] + +cell iterable + +cell A stream of documents. + + +row + +cell #[code batch_size] + +cell int + +cell The number of texts to buffer. Defaults to #[code 128]. + + +row + +cell #[code n_threads] + +cell int + +cell + | The number of worker threads to use. If #[code -1], OpenMP will + | decide how many to use at run time. Default is #[code -1]. + + +row("foot") + +cell yields + +cell #[code Doc] + +cell Processed documents in the order of the original text. 
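The delegation contract described for __call__ and pipe above can be sketched with a plain Python class. This is an illustrative stand-in rather than a real trainable component; it only shows how __call__ and pipe route everything through predict and set_annotations, which are documented below.

    class SimpleScorer(object):
        """Toy component following the Pipe calling convention."""
        name = 'simple_scorer'

        def __call__(self, doc):
            # delegate to predict() and set_annotations(), as the API expects
            scores = self.predict([doc])
            self.set_annotations([doc], scores)
            return doc

        def pipe(self, stream, batch_size=128, n_threads=-1):
            for doc in stream:
                yield self(doc)

        def predict(self, docs):
            # stand-in for a real model: score each doc by its length
            return [len(doc) for doc in docs]

        def set_annotations(self, docs, scores):
            for doc, score in zip(docs, scores):
                doc.user_data['simple_score'] = score

A component like this could be added with nlp.add_pipe(SimpleScorer()); real trainable components would also implement update, get_loss and the serialization methods covered further down.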
+ ++h(2, "predict") #{CLASSNAME}.predict + +tag method + +p + | Apply the pipeline's model to a batch of docs, without modifying them. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + scores = #{VARNAME}.predict([doc1, doc2]) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code docs] + +cell iterable + +cell The documents to predict. + + +row("foot") + +cell returns + +cell - + +cell Scores from the model. + ++h(2, "set_annotations") #{CLASSNAME}.set_annotations + +tag method + +p + | Modify a batch of documents, using pre-computed scores. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + scores = #{VARNAME}.predict([doc1, doc2]) + #{VARNAME}.set_annotations([doc1, doc2], scores) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code docs] + +cell iterable + +cell The documents to modify. + + +row + +cell #[code scores] + +cell - + +cell The scores to set, produced by #[code #{CLASSNAME}.predict]. + ++h(2, "update") #{CLASSNAME}.update + +tag method + +p + | Learn from a batch of documents and gold-standard information, updating + | the pipe's model. Delegates to #[code #{CLASSNAME}.predict] and + | #[code #{CLASSNAME}.get_loss]. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + losses = {} + optimizer = nlp.begin_training() + #{VARNAME}.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code docs] + +cell iterable + +cell A batch of documents to learn from. + + +row + +cell #[code golds] + +cell iterable + +cell The gold-standard data. Must have the same length as #[code docs]. + + +row + +cell #[code drop] + +cell float + +cell The dropout rate. + + +row + +cell #[code sgd] + +cell callable + +cell + | The optimizer. Should take two arguments #[code weights] and + | #[code gradient], and an optional ID. + + +row + +cell #[code losses] + +cell dict + +cell + | Optional record of the loss during training. The value keyed by + | the model's name is updated. + ++h(2, "get_loss") #{CLASSNAME}.get_loss + +tag method + +p + | Find the loss and gradient of loss for the batch of documents and their + | predicted scores. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + scores = #{VARNAME}.predict([doc1, doc2]) + loss, d_loss = #{VARNAME}.get_loss([doc1, doc2], [gold1, gold2], scores) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code docs] + +cell iterable + +cell The batch of documents. + + +row + +cell #[code golds] + +cell iterable + +cell The gold-standard data. Must have the same length as #[code docs]. + + +row + +cell #[code scores] + +cell - + +cell Scores representing the model's predictions. + + +row("foot") + +cell returns + +cell tuple + +cell The loss and the gradient, i.e. #[code (loss, gradient)]. + ++h(2, "begin_training") #{CLASSNAME}.begin_training + +tag method + +p + | Initialize the pipe for training, using data examples if available. If no + | model has been initialized yet, the model is added. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + nlp.pipeline.append(#{VARNAME}) + #{VARNAME}.begin_training(pipeline=nlp.pipeline) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code gold_tuples] + +cell iterable + +cell + | Optional gold-standard annotations from which to construct + | #[+api("goldparse") #[code GoldParse]] objects.
+ + +row + +cell #[code pipeline] + +cell list + +cell + | Optional list of #[+api("pipe") #[code Pipe]] components that + | this component is part of. + ++h(2, "use_params") #{CLASSNAME}.use_params + +tag method + +tag contextmanager + +p Modify the pipe's model, to use the given parameter values. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + with #{VARNAME}.use_params(): + #{VARNAME}.to_disk('/best_model') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code params] + +cell - + +cell + | The parameter values to use in the model. At the end of the + | context, the original parameters are restored. + ++h(2, "add_label") #{CLASSNAME}.add_label + +tag method + +p Add a new label to the pipe. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + #{VARNAME}.add_label('MY_LABEL') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code label] + +cell unicode + +cell The label to add. + ++h(2, "to_disk") #{CLASSNAME}.to_disk + +tag method + +p Serialize the pipe to disk. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + #{VARNAME}.to_disk('/path/to/#{VARNAME}') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory, which will be created if it doesn't exist. + | Paths may be either strings or #[code Path]-like objects. + ++h(2, "from_disk") #{CLASSNAME}.from_disk + +tag method + +p Load the pipe from disk. Modifies the object in place and returns it. + ++aside-code("Example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + #{VARNAME}.from_disk('/path/to/#{VARNAME}') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory. Paths may be either strings or + | #[code Path]-like objects. + + +row("foot") + +cell returns + +cell #[code=CLASSNAME] + +cell The modified #[code=CLASSNAME] object. + ++h(2, "to_bytes") #{CLASSNAME}.to_bytes + +tag method + ++aside-code("example"). + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + #{VARNAME}_bytes = #{VARNAME}.to_bytes() + +p Serialize the pipe to a bytestring. + ++table(["Name", "Type", "Description"]) + +row + +cell #[code **exclude] + +cell - + +cell Named attributes to prevent from being serialized. + + +row("foot") + +cell returns + +cell bytes + +cell The serialized form of the #[code=CLASSNAME] object. + ++h(2, "from_bytes") #{CLASSNAME}.from_bytes + +tag method + +p Load the pipe from a bytestring. Modifies the object in place and returns it. + ++aside-code("Example"). + #{VARNAME}_bytes = #{VARNAME}.to_bytes() + #{VARNAME} = #{CLASSNAME}(nlp.vocab) + #{VARNAME}.from_bytes(#{VARNAME}_bytes) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code bytes_data] + +cell bytes + +cell The data to load from. + + +row + +cell #[code **exclude] + +cell - + +cell Named attributes to prevent from being loaded. + + +row("foot") + +cell returns + +cell #[code=CLASSNAME] + +cell The #[code=CLASSNAME] object. diff --git a/website/docs/api/span.jade b/website/api/span.jade similarity index 60% rename from website/docs/api/span.jade rename to website/api/span.jade index 72821ab04..266518076 100644 --- a/website/docs/api/span.jade +++ b/website/api/span.jade @@ -1,6 +1,6 @@ //- 💫 DOCS > API > SPAN -include ../../_includes/_mixins +include ../_includes/_mixins p A slice from a #[+api("doc") #[code Doc]] object. @@ -40,7 +40,7 @@ p Create a Span object from the #[code slice doc[start : end]]. 
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A meaning representation of the span. - +footrow + +row("foot") +cell returns +cell #[code Span] +cell The newly constructed object. @@ -61,7 +61,7 @@ p Get a #[code Token] object. +cell int +cell The index of the token within the span. - +footrow + +row("foot") +cell returns +cell #[code Token] +cell The token at #[code span[i]]. @@ -79,7 +79,7 @@ p Get a #[code Span] object. +cell tuple +cell The slice of the span to get. - +footrow + +row("foot") +cell returns +cell #[code Span] +cell The span at #[code span[start : end]]. @@ -95,7 +95,7 @@ p Iterate over #[code Token] objects. assert [t.text for t in span] == ['it', 'back', '!'] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Token] +cell A #[code Token] object. @@ -111,11 +111,114 @@ p Get the number of tokens in the span. assert len(span) == 3 +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell int +cell The number of tokens in the span. ++h(2, "set_extension") Span.set_extension + +tag classmethod + +tag-new(2) + +p + | Define a custom attribute on the #[code Span] which becomes available via + | #[code Span._]. For details, see the documentation on + | #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. + ++aside-code("Example"). + from spacy.tokens import Span + city_getter = lambda span: span.text in ('New York', 'Paris', 'Berlin') + Span.set_extension('has_city', getter=city_getter) + doc = nlp(u'I like New York in Autumn') + assert doc[1:4]._.has_city + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Name of the attribute to set by the extension. For example, + | #[code 'my_attr'] will be available as #[code span._.my_attr]. + + +row + +cell #[code default] + +cell - + +cell + | Optional default value of the attribute if no getter or method + | is defined. + + +row + +cell #[code method] + +cell callable + +cell + | Set a custom method on the object, for example + | #[code span._.compare(other_span)]. + + +row + +cell #[code getter] + +cell callable + +cell + | Getter function that takes the object and returns an attribute + | value. Is called when the user accesses the #[code ._] attribute. + + +row + +cell #[code setter] + +cell callable + +cell + | Setter function that takes the #[code Span] and a value, and + | modifies the object. Is called when the user writes to the + | #[code Span._] attribute. + ++h(2, "get_extension") Span.get_extension + +tag classmethod + +tag-new(2) + +p + | Look up a previously registered extension by name. Returns a 4-tuple + | #[code.u-break (default, method, getter, setter)] if the extension is + | registered. Raises a #[code KeyError] otherwise. + ++aside-code("Example"). + from spacy.tokens import Span + Span.set_extension('is_city', default=False) + extension = Span.get_extension('is_city') + assert extension == (False, None, None, None) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension. + + +row("foot") + +cell returns + +cell tuple + +cell + | A #[code.u-break (default, method, getter, setter)] tuple of the + | extension. + ++h(2, "has_extension") Span.has_extension + +tag classmethod + +tag-new(2) + +p Check whether an extension has been registered on the #[code Span] class. + ++aside-code("Example"). 
+ from spacy.tokens import Span + Span.set_extension('is_city', default=False) + assert Span.has_extension('is_city') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension to check. + + +row("foot") + +cell returns + +cell bool + +cell Whether the extension has been registered. + +h(2, "similarity") Span.similarity +tag method +tag-model("vectors") @@ -140,11 +243,33 @@ p | The object to compare with. By default, accepts #[code Doc], | #[code Span], #[code Token] and #[code Lexeme] objects. - +footrow + +row("foot") +cell returns +cell float +cell A scalar similarity score. Higher is more similar. ++h(2, "get_lca_matrix") Span.get_lca_matrix + +tag method + +p + | Calculates the lowest common ancestor matrix for a given #[code Span]. + | Returns LCA matrix containing the integer index of the ancestor, or + | #[code -1] if no common ancestor is found, e.g. if span excludes a + | necessary ancestor. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn') + span = doc[1:4] + matrix = span.get_lca_matrix() + # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32) + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] + +cell The lowest common ancestor matrix of the #[code Span]. + + +h(2, "to_array") Span.to_array +tag method +tag-new(2) @@ -167,7 +292,7 @@ p +cell list +cell A list of attribute ID ints. - +footrow + +row("foot") +cell returns +cell #[code.u-break numpy.ndarray[long, ndim=2]] +cell @@ -181,7 +306,7 @@ p Retokenize the document, such that the span is merged into a single token. +aside-code("Example"). doc = nlp(u'I like New York in Autumn.') - span = doc[2:3] + span = doc[2:4] span.merge() assert len(doc) == 6 assert doc[2].text == 'New York' @@ -194,11 +319,30 @@ p Retokenize the document, such that the span is merged into a single token. | Attributes to assign to the merged token. By default, attributes | are inherited from the syntactic root token of the span. - +footrow + +row("foot") +cell returns +cell #[code Token] +cell The newly merged token. ++h(2, "as_doc") Span.as_doc + +p + | Create a #[code Doc] object view of the #[code Span]'s data. Mostly + | useful for C-typed interfaces. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + span = doc[2:4] + doc2 = span.as_doc() + assert doc2.text == 'New York' + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell #[code Doc] + +cell A #[code Doc] object of the #[code Span]'s content. + + +h(2, "root") Span.root +tag property +tag-model("parse") @@ -216,7 +360,7 @@ p assert new_york.root.text == 'York' +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell #[code Token] +cell The root token. @@ -225,7 +369,7 @@ p +tag property +tag-model("parse") -p Tokens that are to the left of the span, whose head is within the span. +p Tokens that are to the left of the span, whose heads are within the span. +aside-code("Example"). doc = nlp(u'I like New York in Autumn.') @@ -233,7 +377,7 @@ p Tokens that are to the left of the span, whose head is within the span. assert lefts == [u'New'] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Token] +cell A left-child of a token of the span. @@ -242,7 +386,7 @@ p Tokens that are to the left of the span, whose head is within the span. 
+tag property +tag-model("parse") -p Tokens that are to the right of the span, whose head is within the span. +p Tokens that are to the right of the span, whose heads are within the span. +aside-code("Example"). doc = nlp(u'I like New York in Autumn.') @@ -250,11 +394,47 @@ p Tokens that are to the right of the span, whose head is within the span. assert rights == [u'in'] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Token] +cell A right-child of a token of the span. ++h(2, "n_lefts") Span.n_lefts + +tag property + +tag-model("parse") + +p + | The number of tokens that are to the left of the span, whose heads are + | within the span. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[3:7].n_lefts == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of left-child tokens. + ++h(2, "n_rights") Span.n_rights + +tag property + +tag-model("parse") + +p + | The number of tokens that are to the right of the span, whose heads are + | within the span. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[2:4].n_rights == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of right-child tokens. + +h(2, "subtree") Span.subtree +tag property +tag-model("parse") @@ -267,7 +447,7 @@ p Tokens that descend from tokens in the span, but fall outside it. assert subtree == [u'Give', u'it', u'back', u'!'] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Token] +cell A descendant of a token within the span. @@ -285,7 +465,7 @@ p assert doc[1:].has_vector +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell bool +cell Whether the span has a vector data attached. @@ -304,7 +484,7 @@ p assert doc[1:].vector.shape == (300,) +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the span's semantics. @@ -323,7 +503,7 @@ p assert doc[1:].vector_norm != doc[2:].vector_norm +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell float +cell The L2 norm of the vector representation. @@ -373,6 +553,18 @@ p | The text content of the span with a trailing whitespace character | if the last token has one. + +row + +cell #[code orth] + +cell int + +cell ID of the verbatim text content. + + +row + +cell #[code orth_] + +cell unicode + +cell + | Verbatim text content (identical to #[code Span.text]). Existst + | mostly for consistency with the other attributes. + +row +cell #[code label] +cell int @@ -397,3 +589,17 @@ p +cell #[code ent_id_] +cell unicode +cell The string ID of the named entity the token is an instance of. + + +row + +cell #[code sentiment] + +cell float + +cell + | A scalar value indicating the positivity or negativity of the + | span. + + +row + +cell #[code _] + +cell #[code Underscore] + +cell + | User space for adding custom + | #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions]. 
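For the _ (underscore) user space listed in the attributes table, here is a brief sketch of a writable extension; the wikipedia_url attribute name is invented for illustration, and an installed English model is assumed.

    import spacy
    from spacy.tokens import Span

    # a default value (and no getter) makes the attribute writable via ._
    Span.set_extension('wikipedia_url', default=None)

    nlp = spacy.load('en_core_web_sm')  # assumes this model is installed
    doc = nlp(u'I like New York in Autumn')
    span = doc[2:4]
    assert span._.wikipedia_url is None
    span._.wikipedia_url = 'https://en.wikipedia.org/wiki/New_York_City'
    assert span._.wikipedia_url.endswith('New_York_City')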
diff --git a/website/docs/api/stringstore.jade b/website/api/stringstore.jade similarity index 96% rename from website/docs/api/stringstore.jade rename to website/api/stringstore.jade index c17fb1db9..9d03404cc 100644 --- a/website/docs/api/stringstore.jade +++ b/website/api/stringstore.jade @@ -1,6 +1,6 @@ //- 💫 DOCS > API > STRINGSTORE -include ../../_includes/_mixins +include ../_includes/_mixins p | Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values @@ -23,7 +23,7 @@ p +cell iterable +cell A sequence of unicode strings to add to the store. - +footrow + +row("foot") +cell returns +cell #[code StringStore] +cell The newly constructed object. @@ -38,7 +38,7 @@ p Get the number of strings in the store. assert len(stringstore) == 2 +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell int +cell The number of strings in the store. @@ -60,7 +60,7 @@ p Retrieve a string from a given hash, or vice versa. +cell bytes, unicode or uint64 +cell The value to encode. - +footrow + +row("foot") +cell returns +cell unicode or int +cell The value to be retrieved. @@ -81,7 +81,7 @@ p Check whether a string is in the store. +cell unicode +cell The string to check. - +footrow + +row("foot") +cell returns +cell bool +cell Whether the store contains the string. @@ -100,7 +100,7 @@ p assert all_strings == [u'apple', u'orange'] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell unicode +cell A string in the store. @@ -125,7 +125,7 @@ p Add a string to the #[code StringStore]. +cell unicode +cell The string to add. - +footrow + +row("foot") +cell returns +cell uint64 +cell The string's hash value. @@ -166,7 +166,7 @@ p Loads state from a directory. Modifies the object in place and returns it. | A path to a directory. Paths may be either strings or | #[code Path]-like objects. - +footrow + +row("foot") +cell returns +cell #[code StringStore] +cell The modified #[code StringStore] object. @@ -185,7 +185,7 @@ p Serialize the current state to a binary string. +cell - +cell Named attributes to prevent from being serialized. - +footrow + +row("foot") +cell returns +cell bytes +cell The serialized form of the #[code StringStore] object. @@ -211,7 +211,7 @@ p Load state from a binary string. +cell - +cell Named attributes to prevent from being loaded. - +footrow + +row("foot") +cell returns +cell #[code StringStore] +cell The #[code StringStore] object. @@ -233,7 +233,7 @@ p Get a 64-bit hash for a given string. +cell unicode +cell The string to hash. - +footrow + +row("foot") +cell returns +cell uint64 +cell The hash. diff --git a/website/api/tagger.jade b/website/api/tagger.jade new file mode 100644 index 000000000..7a7e9214f --- /dev/null +++ b/website/api/tagger.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > API > TAGGER + +include ../_includes/_mixins + +//- This class inherits from Pipe, so this page uses the template in pipe.jade. +!=partial("pipe", { subclass: "Tagger", pipeline_id: "tagger" }) diff --git a/website/api/tensorizer.jade b/website/api/tensorizer.jade new file mode 100644 index 000000000..cc79f36e3 --- /dev/null +++ b/website/api/tensorizer.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > API > TENSORIZER + +include ../_includes/_mixins + +//- This class inherits from Pipe, so this page uses the template in pipe.jade. 
+!=partial("pipe", { subclass: "Tensorizer", pipeline_id: "tensorizer" }) diff --git a/website/api/textcategorizer.jade b/website/api/textcategorizer.jade new file mode 100644 index 000000000..a9684b15d --- /dev/null +++ b/website/api/textcategorizer.jade @@ -0,0 +1,20 @@ +//- 💫 DOCS > API > TEXTCATEGORIZER + +include ../_includes/_mixins + +p + | The model supports classification with multiple, non-mutually exclusive + | labels. You can change the model architecture rather easily, but by + | default, the #[code TextCategorizer] class uses a convolutional + | neural network to assign position-sensitive vectors to each word in the + | document. This step is similar to the #[+api("tensorizer") #[code Tensorizer]] + | component, but the #[code TextCategorizer] uses its own CNN model, to + | avoid sharing weights with the other pipeline components. The document + | tensor is then + | summarized by concatenating max and mean pooling, and a multilayer + | perceptron is used to predict an output vector of length #[code nr_class], + | before a logistic activation is applied elementwise. The value of each + | output neuron is the probability that some class is present. + +//- This class inherits from Pipe, so this page uses the template in pipe.jade. +!=partial("pipe", { subclass: "TextCategorizer", short: "textcat", pipeline_id: "textcat" }) diff --git a/website/docs/api/token.jade b/website/api/token.jade similarity index 63% rename from website/docs/api/token.jade rename to website/api/token.jade index db445d09b..f8fa15fe8 100644 --- a/website/docs/api/token.jade +++ b/website/api/token.jade @@ -1,6 +1,6 @@ //- 💫 DOCS > API > TOKEN -include ../../_includes/_mixins +include ../_includes/_mixins p An individual token — i.e. a word, punctuation symbol, whitespace, etc. @@ -30,7 +30,7 @@ p Construct a #[code Token] object. +cell int +cell The index of the token within the document. - +footrow + +row("foot") +cell returns +cell #[code Token] +cell The newly constructed object. @@ -46,11 +46,114 @@ p The number of unicode characters in the token, i.e. #[code token.text]. assert len(token) == 4 +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell int +cell The number of unicode characters in the token. ++h(2, "set_extension") Token.set_extension + +tag classmethod + +tag-new(2) + +p + | Define a custom attribute on the #[code Token] which becomes available + | via #[code Token._]. For details, see the documentation on + | #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. + ++aside-code("Example"). + from spacy.tokens import Token + fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana') + Token.set_extension('is_fruit', getter=fruit_getter) + doc = nlp(u'I have an apple') + assert doc[3]._.is_fruit + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Name of the attribute to set by the extension. For example, + | #[code 'my_attr'] will be available as #[code token._.my_attr]. + + +row + +cell #[code default] + +cell - + +cell + | Optional default value of the attribute if no getter or method + | is defined. + + +row + +cell #[code method] + +cell callable + +cell + | Set a custom method on the object, for example + | #[code token._.compare(other_token)]. + + +row + +cell #[code getter] + +cell callable + +cell + | Getter function that takes the object and returns an attribute + | value. Is called when the user accesses the #[code ._] attribute. 
+ + +row + +cell #[code setter] + +cell callable + +cell + | Setter function that takes the #[code Token] and a value, and + | modifies the object. Is called when the user writes to the + | #[code Token._] attribute. + ++h(2, "get_extension") Token.get_extension + +tag classmethod + +tag-new(2) + +p + | Look up a previously registered extension by name. Returns a 4-tuple + | #[code.u-break (default, method, getter, setter)] if the extension is + | registered. Raises a #[code KeyError] otherwise. + ++aside-code("Example"). + from spacy.tokens import Token + Token.set_extension('is_fruit', default=False) + extension = Token.get_extension('is_fruit') + assert extension == (False, None, None, None) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension. + + +row("foot") + +cell returns + +cell tuple + +cell + | A #[code.u-break (default, method, getter, setter)] tuple of the + | extension. + ++h(2, "has_extension") Token.has_extension + +tag classmethod + +tag-new(2) + +p Check whether an extension has been registered on the #[code Token] class. + ++aside-code("Example"). + from spacy.tokens import Token + Token.set_extension('is_fruit', default=False) + assert Token.has_extension('is_fruit') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension to check. + + +row("foot") + +cell returns + +cell bool + +cell Whether the extension has been registered. + +h(2, "check_flag") Token.check_flag +tag method @@ -68,7 +171,7 @@ p Check the value of a boolean flag. +cell int +cell The attribute ID of the flag to check. - +footrow + +row("foot") +cell returns +cell bool +cell Whether the flag is set. @@ -93,7 +196,7 @@ p Compute a semantic similarity estimate. Defaults to cosine over vectors. | The object to compare with. By default, accepts #[code Doc], | #[code Span], #[code Token] and #[code Lexeme] objects. - +footrow + +row("foot") +cell returns +cell float +cell A scalar similarity score. Higher is more similar. @@ -114,7 +217,7 @@ p Get a neighboring token. +cell int +cell The relative position of the token to get. Defaults to #[code 1]. - +footrow + +row("foot") +cell returns +cell #[code Token] +cell The token at position #[code self.doc[self.i+i]]. @@ -139,7 +242,7 @@ p +cell #[code Token] +cell Another token. - +footrow + +row("foot") +cell returns +cell bool +cell Whether this token is the ancestor of the descendant. @@ -158,7 +261,7 @@ p The rightmost token of this token's syntactic descendants. assert [t.text for t in he_ancestors] == [u'pleaded'] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Token] +cell @@ -177,7 +280,7 @@ p A sequence of coordinated tokens, including the token itself. assert [t.text for t in apples_conjuncts] == [u'oranges'] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Token] +cell A coordinated token. @@ -194,11 +297,85 @@ p A sequence of the token's immediate syntactic children. assert [t.text for t in give_children] == [u'it', u'back', u'!'] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Token] +cell A child token such that #[code child.head==self]. ++h(2, "lefts") Token.lefts + +tag property + +tag-model("parse") + +p + | The leftward immediate children of the word, in the syntactic dependency + | parse. + ++aside-code("Example"). 
+ doc = nlp(u'I like New York in Autumn.') + lefts = [t.text for t in doc[3].lefts] + assert lefts == [u'New'] + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell yields + +cell #[code Token] + +cell A left-child of the token. + ++h(2, "rights") Token.rights + +tag property + +tag-model("parse") + +p + | The rightward immediate children of the word, in the syntactic + | dependency parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + rights = [t.text for t in doc[3].rights] + assert rights == [u'in'] + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell yields + +cell #[code Token] + +cell A right-child of the token. + ++h(2, "n_lefts") Token.n_lefts + +tag property + +tag-model("parse") + +p + | The number of leftward immediate children of the word, in the syntactic + | dependency parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[3].n_lefts == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of left-child tokens. + ++h(2, "n_rights") Token.n_rights + +tag property + +tag-model("parse") + +p + | The number of rightward immediate children of the word, in the syntactic + | dependency parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[3].n_rights == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of right-child tokens. + +h(2, "subtree") Token.subtree +tag property +tag-model("parse") @@ -211,7 +388,7 @@ p A sequence of all the token's syntactic descendents. assert [t.text for t in give_subtree] == [u'Give', u'it', u'back', u'!'] +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Token] +cell A descendant token such that #[code self.is_ancestor(descendant)]. @@ -230,7 +407,7 @@ p assert apples.has_vector +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell bool +cell Whether the token has a vector data attached. @@ -248,7 +425,7 @@ p A real-valued meaning representation. assert apples.vector.shape == (300,) +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the token's semantics. @@ -268,7 +445,7 @@ p The L2 norm of the token's vector representation. assert apples.vector_norm != pasta.vector_norm +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell float +cell The L2 norm of the vector representation. @@ -280,20 +457,29 @@ p The L2 norm of the token's vector representation. +cell #[code text] +cell unicode +cell Verbatim text content. + +row +cell #[code text_with_ws] +cell unicode +cell Text content, with trailing space character if present. - +row - +cell #[code whitespace] - +cell int - +cell Trailing space character if present. +row +cell #[code whitespace_] +cell unicode +cell Trailing space character if present. + +row + +cell #[code orth] + +cell int + +cell ID of the verbatim text content. + + +row + +cell #[code orth_] + +cell unicode + +cell + | Verbatim text content (identical to #[code Token.text]). Existst + | mostly for consistency with the other attributes. + +row +cell #[code vocab] +cell #[code Vocab] @@ -377,15 +563,35 @@ p The L2 norm of the token's vector representation. +cell unicode +cell Base form of the token, with no inflectional suffixes. 
+ +row + +cell #[code norm] + +cell int + +cell + | The token's norm, i.e. a normalised form of the token text. + | Usually set in the language's + | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or + | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions]. + + +row + +cell #[code norm_] + +cell unicode + +cell + | The token's norm, i.e. a normalised form of the token text. + | Usually set in the language's + | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or + | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions]. + +row +cell #[code lower] +cell int - +cell Lower-case form of the token. + +cell Lowercase form of the token. +row +cell #[code lower_] +cell unicode - +cell Lower-case form of the token. + +cell + | Lowercase form of the token text. Equivalent to + | #[code Token.text.lower()]. +row +cell #[code shape] @@ -425,7 +631,9 @@ p The L2 norm of the token's vector representation. +row +cell #[code suffix_] +cell unicode - +cell Length-N substring from the end of the token. Defaults to #[code N=3]. + +cell + | Length-N substring from the end of the token. Defaults to + | #[code N=3]. +row +cell #[code is_alpha] @@ -455,6 +663,13 @@ p The L2 norm of the token's vector representation. | Is the token in lowercase? Equivalent to | #[code token.text.islower()]. + +row + +cell #[code is_upper] + +cell bool + +cell + | Is the token in uppercase? Equivalent to + | #[code token.text.isupper()]. + +row +cell #[code is_title] +cell bool @@ -467,6 +682,16 @@ p The L2 norm of the token's vector representation. +cell bool +cell Is the token punctuation? + +row + +cell #[code is_left_punct] + +cell bool + +cell Is the token a left punctuation mark, e.g. #[code (]? + + +row + +cell #[code is_right_punct] + +cell bool + +cell Is the token a right punctuation mark, e.g. #[code )]? + +row +cell #[code is_space] +cell bool @@ -474,6 +699,16 @@ p The L2 norm of the token's vector representation. | Does the token consist of whitespace characters? Equivalent to | #[code token.text.isspace()]. + +row + +cell #[code is_bracket] + +cell bool + +cell Is the token a bracket? + + +row + +cell #[code is_quote] + +cell bool + +cell Is the token a quotation mark? + +row +cell #[code like_url] +cell bool @@ -533,6 +768,7 @@ p The L2 norm of the token's vector representation. +cell #[code lang] +cell int +cell Language of the parent document's vocabulary. + +row +cell #[code lang_] +cell unicode @@ -551,9 +787,30 @@ p The L2 norm of the token's vector representation. +row +cell #[code sentiment] +cell float - +cell A scalar value indicating the positivity or negativity of the token. + +cell + | A scalar value indicating the positivity or negativity of the + | token. +row +cell #[code lex_id] +cell int - +cell ID of the token's lexical type. + +cell Sequential ID of the token's lexical type. + + +row + +cell #[code rank] + +cell int + +cell + | Sequential ID of the token's lexical type, used to index into + | tables, e.g. for word vectors. + + +row + +cell #[code cluster] + +cell int + +cell Brown cluster ID. + + +row + +cell #[code _] + +cell #[code Underscore] + +cell + | User space for adding custom + | #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions]. 
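The method argument of Token.set_extension is described above but not shown in an example, so here is a small sketch that registers a comparison method. The longer_than name is invented, and an installed English model is assumed.

    import spacy
    from spacy.tokens import Token

    # the bound token is passed as the first argument of the method
    Token.set_extension('longer_than',
                        method=lambda token, other: len(token) > len(other))

    nlp = spacy.load('en_core_web_sm')  # assumes this model is installed
    doc = nlp(u'I like New York')
    assert doc[2]._.longer_than(doc[0])   # 'New' has more characters than 'I'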
diff --git a/website/docs/api/tokenizer.jade b/website/api/tokenizer.jade similarity index 96% rename from website/docs/api/tokenizer.jade rename to website/api/tokenizer.jade index 196f886b7..7a8a34838 100644 --- a/website/docs/api/tokenizer.jade +++ b/website/api/tokenizer.jade @@ -1,6 +1,6 @@ //- 💫 DOCS > API > TOKENIZER -include ../../_includes/_mixins +include ../_includes/_mixins p | Segment text, and create #[code Doc] objects with the discovered segment @@ -57,7 +57,7 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text. +cell callable +cell A boolean function matching strings to be recognised as tokens. - +footrow + +row("foot") +cell returns +cell #[code Tokenizer] +cell The newly constructed object. @@ -77,7 +77,7 @@ p Tokenize a string. +cell unicode +cell The string to tokenize. - +footrow + +row("foot") +cell returns +cell #[code Doc] +cell A container for linguistic annotations. @@ -110,7 +110,7 @@ p Tokenize a stream of texts. | The number of threads to use, if the implementation supports | multi-threading. The default tokenizer is single-threaded. - +footrow + +row("foot") +cell yields +cell #[code Doc] +cell A sequence of Doc objects, in order. @@ -126,7 +126,7 @@ p Find internal split points of the string. +cell unicode +cell The string to split. - +footrow + +row("foot") +cell returns +cell list +cell @@ -147,7 +147,7 @@ p +cell unicode +cell The string to segment. - +footrow + +row("foot") +cell returns +cell int +cell The length of the prefix if present, otherwise #[code None]. @@ -165,7 +165,7 @@ p +cell unicode +cell The string to segment. - +footrow + +row("foot") +cell returns +cell int / #[code None] +cell The length of the suffix if present, otherwise #[code None]. @@ -176,7 +176,7 @@ p p | Add a special-case tokenization rule. This mechanism is also used to add | custom tokenizer exceptions to the language data. See the usage guide - | on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages] + | on #[+a("/usage/adding-languages#tokenizer-exceptions") adding languages] | for more details and examples. +aside-code("Example"). diff --git a/website/api/top-level.jade b/website/api/top-level.jade new file mode 100644 index 000000000..f16daae23 --- /dev/null +++ b/website/api/top-level.jade @@ -0,0 +1,20 @@ +//- 💫 DOCS > API > TOP-LEVEL + +include ../_includes/_mixins + ++section("spacy") + //-+h(2, "spacy") spaCy + //- spacy/__init__.py + include _top-level/_spacy + ++section("displacy") + +h(2, "displacy", "spacy/displacy") displaCy + include _top-level/_displacy + ++section("util") + +h(2, "util", "spacy/util.py") Utility functions + include _top-level/_util + ++section("compat") + +h(2, "compat", "spacy/compaty.py") Compatibility functions + include _top-level/_compat diff --git a/website/api/vectors.jade b/website/api/vectors.jade new file mode 100644 index 000000000..692bd1ca8 --- /dev/null +++ b/website/api/vectors.jade @@ -0,0 +1,343 @@ +//- 💫 DOCS > API > VECTORS + +include ../_includes/_mixins + +p + | Vectors data is kept in the #[code Vectors.data] attribute, which should + | be an instance of #[code numpy.ndarray] (for CPU vectors) or + | #[code cupy.ndarray] (for GPU vectors). + ++h(2, "init") Vectors.__init__ + +tag method + +p + | Create a new vector store. To keep the vector table empty, pass + | #[code width=0]. You can also create the vector table and add + | vectors one by one, or set the vector values directly on initialisation. + ++aside-code("Example"). 
+ from spacy.vectors import Vectors + from spacy.strings import StringStore + + empty_vectors = Vectors(StringStore()) + + vectors = Vectors([u'cat'], width=300) + vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,)) + + vector_table = numpy.zeros((3, 300), dtype='f') + vectors = Vectors(StringStore(), data=vector_table) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code strings] + +cell #[code StringStore] or list + +cell + | List of strings, or a #[+api("stringstore") #[code StringStore]] + | that maps strings to hash values, and vice versa. + + +row + +cell #[code width] + +cell int + +cell Number of dimensions. + + +row + +cell #[code data] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell The vector data. + + +row("foot") + +cell returns + +cell #[code Vectors] + +cell The newly created object. + ++h(2, "getitem") Vectors.__getitem__ + +tag method + +p + | Get a vector by key. If key is a string, it is hashed to an integer ID + | using the #[code Vectors.strings] table. If the integer key is not found + | in the table, a #[code KeyError] is raised. + ++aside-code("Example"). + vectors = Vectors(StringStore(), 300) + vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) + cat_vector = vectors[u'cat'] + ++table(["Name", "Type", "Description"]) + +row + +cell #[code key] + +cell unicode / int + +cell The key to get the vector for. + + +row + +cell returns + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell The vector for the key. + ++h(2, "setitem") Vectors.__setitem__ + +tag method + +p + | Set a vector for the given key. If key is a string, it is hashed to an + | integer ID using the #[code Vectors.strings] table. + ++aside-code("Example"). + vectors = Vectors(StringStore(), 300) + vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,)) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code key] + +cell unicode / int + +cell The key to set the vector for. + + +row + +cell #[code vector] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell The vector to set. + ++h(2, "iter") Vectors.__iter__ + +tag method + +p Yield vectors from the table. + ++aside-code("Example"). + vector_table = numpy.zeros((3, 300), dtype='f') + vectors = Vectors(StringStore(), vector_table) + for vector in vectors: + print(vector) + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell yields + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell A vector from the table. + ++h(2, "len") Vectors.__len__ + +tag method + +p Return the number of vectors that have been assigned. + ++aside-code("Example"). + vector_table = numpy.zeros((3, 300), dtype='f') + vectors = Vectors(StringStore(), vector_table) + assert len(vectors) == 3 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of vectors in the data. + ++h(2, "contains") Vectors.__contains__ + +tag method + +p + | Check whether a key has a vector entry in the table. If key is a string, + | it is hashed to an integer ID using the #[code Vectors.strings] table. + ++aside-code("Example"). + vectors = Vectors(StringStore(), 300) + vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) + assert u'cat' in vectors + ++table(["Name", "Type", "Description"]) + +row + +cell #[code key] + +cell unicode / int + +cell The key to check. + + +row("foot") + +cell returns + +cell bool + +cell Whether the key has a vector entry. 
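Taken together, the special methods above give the vector table a small dict-like interface. The sketch below composes them, following the StringStore-based constructor and the add() call used in the examples in this file; the exact Vectors signature changed in later spaCy releases, so treat this as illustrative rather than canonical:

    # Sketch only: the __getitem__/__contains__/__len__/__iter__ examples above,
    # composed. Signatures follow this file's documentation (spaCy 2.x era API).
    import numpy
    from spacy.strings import StringStore
    from spacy.vectors import Vectors

    vectors = Vectors(StringStore(), 300)                     # empty 300d table
    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))  # add() is documented further down
    assert u'cat' in vectors                                  # __contains__
    assert len(vectors) == 1                                  # __len__
    cat_vector = vectors[u'cat']                              # __getitem__
    for vector in vectors:                                    # __iter__
        print(vector.shape)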
+ ++h(2, "add") Vectors.add + +tag method + +p + | Add a key to the table, optionally setting a vector value as well. If + | key is a string, it is hashed to an integer ID using the + | #[code Vectors.strings] table. + ++aside-code("Example"). + vectors = Vectors(StringStore(), 300) + vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code key] + +cell unicode / int + +cell The key to add. + + +row + +cell #[code vector] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell An optional vector to add. + ++h(2, "items") Vectors.items + +tag method + +p Iterate over #[code (string key, vector)] pairs, in order. + ++aside-code("Example"). + vectors = Vectors(StringStore(), 300) + vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) + for key, vector in vectors.items(): + print(key, vector) + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell yields + +cell tuple + +cell #[code (string key, vector)] pairs, in order. + ++h(2, "shape") Vectors.shape + +tag property + +p + | Get #[code (rows, dims)] tuples of number of rows and number of + | dimensions in the vector table. + ++aside-code("Example"). + vectors = Vectors(StringStore(), 300) + vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,))) + rows, dims = vectors.shape + assert rows == 1 + assert dims == 300 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell tuple + +cell A #[code (rows, dims)] pair. + ++h(2, "from_glove") Vectors.from_glove + +tag method + +p + | Load #[+a("https://nlp.stanford.edu/projects/glove/") GloVe] vectors from + | a directory. Assumes binary format, that the vocab is in a + | #[code vocab.txt], and that vectors are named + | #[code vectors.{size}.[fd].bin], e.g. #[code vectors.128.f.bin] for 128d + | float32 vectors, #[code vectors.300.d.bin] for 300d float64 (double) + | vectors, etc. By default GloVe outputs 64-bit vectors. + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode / #[code Path] + +cell The path to load the GloVe vectors from. + ++h(2, "to_disk") Vectors.to_disk + +tag method + +p Save the current state to a directory. + ++aside-code("Example"). + vectors.to_disk('/path/to/vectors') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode / #[code Path] + +cell + | A path to a directory, which will be created if it doesn't exist. + | Paths may be either strings or #[code Path]-like objects. + + +row + +cell #[code **exclude] + +cell - + +cell Named attributes to prevent from being saved. + ++h(2, "from_disk") Vectors.from_disk + +tag method + +p Loads state from a directory. Modifies the object in place and returns it. + ++aside-code("Example"). + vectors = Vectors(StringStore()) + vectors.from_disk('/path/to/vectors') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode / #[code Path] + +cell + | A path to a directory. Paths may be either strings or + | #[code Path]-like objects. + + +row("foot") + +cell returns + +cell #[code Vectors] + +cell The modified #[code Vectors] object. + ++h(2, "to_bytes") Vectors.to_bytes + +tag method + +p Serialize the current state to a binary string. + ++aside-code("Example"). + vectors_bytes = vectors.to_bytes() + ++table(["Name", "Type", "Description"]) + +row + +cell #[code **exclude] + +cell - + +cell Named attributes to prevent from being serialized. 
+ + +row("foot") + +cell returns + +cell bytes + +cell The serialized form of the #[code Vectors] object. + ++h(2, "from_bytes") Vectors.from_bytes + +tag method + +p Load state from a binary string. + ++aside-code("Example"). + fron spacy.vectors import Vectors + vectors_bytes = vectors.to_bytes() + new_vectors = Vectors(StringStore()) + new_vectors.from_bytes(vectors_bytes) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code data] + +cell bytes + +cell The data to load from. + + +row + +cell #[code **exclude] + +cell - + +cell Named attributes to prevent from being loaded. + + +row("foot") + +cell returns + +cell #[code Vectors] + +cell The #[code Vectors] object. + ++h(2, "attributes") Attributes + ++table(["Name", "Type", "Description"]) + +row + +cell #[code data] + +cell #[code numpy.ndarray] / #[code cupy.ndarray] + +cell + | Stored vectors data. #[code numpy] is used for CPU vectors, + | #[code cupy] for GPU vectors. + + +row + +cell #[code key2row] + +cell dict + +cell + | Dictionary mapping word hashes to rows in the + | #[code Vectors.data] table. + + +row + +cell #[code keys] + +cell #[code numpy.ndarray] + +cell + | Array keeping the keys in order, such that + | #[code keys[vectors.key2row[key]] == key] diff --git a/website/docs/api/vocab.jade b/website/api/vocab.jade similarity index 57% rename from website/docs/api/vocab.jade rename to website/api/vocab.jade index 4d3e0828a..54dd4f691 100644 --- a/website/docs/api/vocab.jade +++ b/website/api/vocab.jade @@ -1,17 +1,22 @@ //- 💫 DOCS > API > VOCAB -include ../../_includes/_mixins +include ../_includes/_mixins p - | A lookup table that allows you to access #[code Lexeme] objects. The - | #[code Vocab] instance also provides access to the #[code StringStore], - | and owns underlying C-data that is shared between #[code Doc] objects. + | The #[code Vocab] object provides a lookup table that allows you to + | access #[+api("lexeme") #[code Lexeme]] objects, as well as the + | #[+api("stringstore") #[code StringStore]]. It also owns underlying + | C-data that is shared between #[code Doc] objects. +h(2, "init") Vocab.__init__ +tag method p Create the vocabulary. ++aside-code("Example"). + from spacy.vocab import Vocab + vocab = Vocab(strings=[u'hello', u'world']) + +table(["Name", "Type", "Description"]) +row +cell #[code lex_attr_getters] @@ -39,7 +44,7 @@ p Create the vocabulary. | A #[+api("stringstore") #[code StringStore]] that maps | strings to hash values, and vice versa, or a list of strings. - +footrow + +row("foot") +cell returns +cell #[code Vocab] +cell The newly constructed object. @@ -54,7 +59,7 @@ p Get the current number of lexemes in the vocabulary. assert len(nlp.vocab) > 0 +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell returns +cell int +cell The number of lexems in the vocabulary. @@ -76,7 +81,7 @@ p +cell int / unicode +cell The hash value of a word, or its unicode string. - +footrow + +row("foot") +cell returns +cell #[code Lexeme] +cell The lexeme indicated by the given ID. @@ -90,7 +95,7 @@ p Iterate over the lexemes in the vocabulary. stop_words = (lex for lex in nlp.vocab if lex.is_stop) +table(["Name", "Type", "Description"]) - +footrow + +row("foot") +cell yields +cell #[code Lexeme] +cell An entry in the vocabulary. @@ -115,7 +120,7 @@ p +cell unicode +cell The ID string. - +footrow + +row("foot") +cell returns +cell bool +cell Whether the string has an entry in the vocabulary. @@ -152,11 +157,143 @@ p | which the flag will be stored. 
If #[code -1], the lowest
      |  available bit will be chosen.
 
-  +footrow
+  +row("foot")
     +cell returns
     +cell int
     +cell The integer ID by which the flag value can be checked.
 
++h(2, "clear_vectors") Vocab.clear_vectors
+  +tag method
+  +tag-new(2)
+
+p
+  |  Drop the current vector table. Because all vectors must be the same
+  |  width, you have to call this to change the size of the vectors.
+
++aside-code("Example").
+  nlp.vocab.clear_vectors(new_dim=300)
+
++table(["Name", "Type", "Description"])
+  +row
+    +cell #[code new_dim]
+    +cell int
+    +cell
+      |  Number of dimensions of the new vectors. If #[code None], size
+      |  is not changed.
+
++h(2, "prune_vectors") Vocab.prune_vectors
+  +tag method
+  +tag-new(2)
+
+p
+  |  Reduce the current vector table to #[code nr_row] unique entries. Words
+  |  mapped to the discarded vectors will be remapped to the closest vector
+  |  among those remaining. For example, suppose the original table had
+  |  vectors for the words:
+  |  #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
+  |  vector table to two rows, we would discard the vectors for "feline"
+  |  and "reclined". These words would then be remapped to the closest
+  |  remaining vector – so "feline" would have the same vector as "cat",
+  |  and "reclined" would have the same vector as "sat". The similarities are
+  |  judged by cosine. The original vectors may be large, so the cosines are
+  |  calculated in minibatches, to reduce memory usage.
+
++aside-code("Example").
+  nlp.vocab.prune_vectors(10000)
+  assert len(nlp.vocab.vectors) <= 10000
+
++table(["Name", "Type", "Description"])
+  +row
+    +cell #[code nr_row]
+    +cell int
+    +cell The number of rows to keep in the vector table.
+
+  +row
+    +cell #[code batch_size]
+    +cell int
+    +cell
+      |  Size of the batches of vectors used when calculating the
+      |  similarities. Larger batch sizes might be faster, while temporarily
+      |  requiring more memory.
+
+  +row("foot")
+    +cell returns
+    +cell dict
+    +cell
+      |  A dictionary keyed by removed words mapped to
+      |  #[code (string, score)] tuples, where #[code string] is the entry
+      |  the removed word was mapped to, and #[code score] the similarity
+      |  score between the two words.
+
++h(2, "get_vector") Vocab.get_vector
+  +tag method
+  +tag-new(2)
+
+p
+  |  Retrieve a vector for a word in the vocabulary. Words can be looked up
+  |  by string or hash value. If no vectors data is loaded, a
+  |  #[code ValueError] is raised.
+
++aside-code("Example").
+  nlp.vocab.get_vector(u'apple')
+
++table(["Name", "Type", "Description"])
+  +row
+    +cell #[code orth]
+    +cell int / unicode
+    +cell The hash value of a word, or its unicode string.
+
+  +row("foot")
+    +cell returns
+    +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+    +cell
+      |  A word vector. Size and shape are determined by the
+      |  #[code Vocab.vectors] instance.
+
++h(2, "set_vector") Vocab.set_vector
+  +tag method
+  +tag-new(2)
+
+p
+  |  Set a vector for a word in the vocabulary. Words can be referenced
+  |  by string or hash value.
+
++aside-code("Example").
+  nlp.vocab.set_vector(u'apple', array([...]))
+
++table(["Name", "Type", "Description"])
+  +row
+    +cell #[code orth]
+    +cell int / unicode
+    +cell The hash value of a word, or its unicode string.
+
+  +row
+    +cell #[code vector]
+    +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+    +cell The vector to set.
+
++h(2, "has_vector") Vocab.has_vector
+  +tag method
+  +tag-new(2)
+
+p
+  |  Check whether a word has a vector. Returns #[code False] if no vectors
+  |  are loaded. Words can be looked up by string or hash value.
+ ++aside-code("Example"). + if nlp.vocab.has_vector(u'apple'): + vector = nlp.vocab.get_vector(u'apple') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code orth] + +cell int / unicode + +cell The hash value of a word, or its unicode string. + + +row("foot") + +cell returns + +cell bool + +cell Whether the word has a vector. + +h(2, "to_disk") Vocab.to_disk +tag method +tag-new(2) @@ -192,7 +329,7 @@ p Loads state from a directory. Modifies the object in place and returns it. | A path to a directory. Paths may be either strings or | #[code Path]-like objects. - +footrow + +row("foot") +cell returns +cell #[code Vocab] +cell The modified #[code Vocab] object. @@ -211,7 +348,7 @@ p Serialize the current state to a binary string. +cell - +cell Named attributes to prevent from being serialized. - +footrow + +row("foot") +cell returns +cell bytes +cell The serialized form of the #[code Vocab] object. @@ -238,7 +375,7 @@ p Load state from a binary string. +cell - +cell Named attributes to prevent from being loaded. - +footrow + +row("foot") +cell returns +cell #[code Vocab] +cell The #[code Vocab] object. @@ -256,3 +393,14 @@ p Load state from a binary string. +cell #[code strings] +cell #[code StringStore] +cell A table managing the string-to-int mapping. + + +row + +cell #[code vectors] + +tag-new(2) + +cell #[code Vectors] + +cell A table associating word IDs to word vectors. + + +row + +cell #[code vectors_length] + +cell int + +cell Number of dimensions for each word vector. diff --git a/website/assets/css/_base/_animations.sass b/website/assets/css/_base/_animations.sass index 376ac5c2f..5c82a4fcc 100644 --- a/website/assets/css/_base/_animations.sass +++ b/website/assets/css/_base/_animations.sass @@ -19,3 +19,10 @@ to transform: translate3d(0, 0, 0) + + +//- Element rotates + +@keyframes rotate + to + transform: rotate(360deg) diff --git a/website/assets/css/_base/_fonts.sass b/website/assets/css/_base/_fonts.sass index be113798c..c1af115a7 100644 --- a/website/assets/css/_base/_fonts.sass +++ b/website/assets/css/_base/_fonts.sass @@ -1,41 +1,27 @@ //- 💫 CSS > BASE > FONTS -// Source Sans Pro +// HK Grotesk @font-face - font-family: "Source Sans Pro" + font-family: "HK Grotesk" font-style: normal - font-weight: 400 - src: url("/assets/fonts/sourcesanspro-regular.eot") - src: url("/assets/fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-regular.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-regular.woff") format("woff"), url("/assets/fonts/sourcesanspro-regular.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg") + font-weight: 500 + src: url("/assets/fonts/hkgrotesk-semibold.woff2") format("woff2"), url("/assets/fonts/hkgrotesk-semibold.woff") format("woff") @font-face - font-family: "Source Sans Pro" + font-family: "HK Grotesk" font-style: italic - font-weight: 400 - src: url("/assets/fonts/sourcesanspro-italic.eot") - src: url("/assets/fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-italic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-italic.woff") format("woff"), url("/assets/fonts/sourcesanspro-italic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg") + font-weight: 500 + src: url("/assets/fonts/hkgrotesk-semibolditalic.woff2") format("woff2"), url("/assets/fonts/hkgrotesk-semibolditalic.woff") format("woff") @font-face - 
font-family: "Source Sans Pro" - font-style: normal - font-weight: 700 - src: url("/assets/fonts/sourcesanspro-bold.eot") - src: url("/assets/fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bold.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bold.woff") format("woff"), url("/assets/fonts/sourcesanspro-bold.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg") - -@font-face - font-family: "Source Sans Pro" - font-style: italic - font-weight: 700 - src: url("/assets/fonts/sourcesanspro-bolditalic.eot") - src: url("/assets/fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bolditalic.woff") format("woff"), url("/assets/fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg") - - -// Source Code Pro - -@font-face - font-family: "Source Code Pro" + font-family: "HK Grotesk" font-style: normal font-weight: 600 - src: url("/assets/fonts/sourcecodepro-semibold.eot") - src: url("/assets/fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcecodepro-semibold.woff") format("woff"), url("/assets/fonts/sourcecodepro-semibold.ttf") format("truetype"), url("/assets/fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg") + src: url("/assets/fonts/hkgrotesk-bold.woff2") format("woff2"), url("/assets/fonts/hkgrotesk-bold.woff") format("woff") + +@font-face + font-family: "HK Grotesk" + font-style: italic + font-weight: 600 + src: url("/assets/fonts/hkgrotesk-bolditalic.woff2") format("woff2"), url("/assets/fonts/hkgrotesk-bolditalic.woff") format("woff") diff --git a/website/assets/css/_base/_grid.sass b/website/assets/css/_base/_grid.sass index 3feda696d..16cf40f71 100644 --- a/website/assets/css/_base/_grid.sass +++ b/website/assets/css/_base/_grid.sass @@ -15,6 +15,15 @@ align-items: center justify-content: center + &.o-grid--vcenter + align-items: center + + &.o-grid--space + justify-content: space-between + + &.o-grid--nowrap + flex-wrap: nowrap + //- Grid column @@ -22,7 +31,6 @@ $grid-gutter: 2rem margin-top: $grid-gutter - overflow: hidden @include breakpoint(min, lg) display: flex @@ -40,6 +48,9 @@ flex: 0 0 100% flex-flow: column wrap + &.o-grid__col--no-gutter + margin-top: 0 + // Fix overflow issue in old browsers & > * diff --git a/website/assets/css/_base/_layout.sass b/website/assets/css/_base/_layout.sass index 8828651c6..1b725fdbf 100644 --- a/website/assets/css/_base/_layout.sass +++ b/website/assets/css/_base/_layout.sass @@ -12,6 +12,7 @@ body animation: fadeIn 0.25s ease background: $color-back color: $color-front + //scroll-behavior: smooth //- Paragraphs @@ -19,6 +20,9 @@ body p @extend .o-block, .u-text +p:empty + margin-bottom: 0 + //- Links diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass index 635e9cde3..4e63a4346 100644 --- a/website/assets/css/_base/_objects.sass +++ b/website/assets/css/_base/_objects.sass @@ -43,32 +43,46 @@ position: relative padding: 2.5rem 0 overflow: auto + background: $color-subtle-light + + .o-main & + border-top-left-radius: $border-radius //- Blocks +.o-section + width: 100% + max-width: 100% + + &:not(:last-child) + margin-bottom: 7rem + padding-bottom: 4rem + border-bottom: 1px dotted $color-subtle + .o-block - margin-bottom: 
3rem + margin-bottom: 4rem .o-block-small margin-bottom: 2rem -.o-no-block +.o-no-block.o-no-block margin-bottom: 0 .o-card background: $color-back - border-radius: 2px - border: 1px solid $color-subtle - padding: 3rem 2.5% - + border-radius: $border-radius + box-shadow: $box-shadow //- Box .o-box - background: $color-theme-light + background: $color-subtle-light padding: 2rem - border-left: 4px solid $color-theme + border-radius: $border-radius + +.o-box__logos + padding-bottom: 1rem //- Icons @@ -77,7 +91,20 @@ vertical-align: middle &.o-icon--inline - margin: 0 0.5rem 0 0.25rem + margin: 0 0.5rem 0 0.1rem + + &.o-icon--tag + vertical-align: bottom + height: 100% + position: relative + top: 1px + +.o-emoji + margin-right: 0.75rem + vertical-align: text-bottom + +.o-badge + border-radius: 1em //- SVG @@ -102,3 +129,38 @@ fill: currentColor vertical-align: middle margin: 0 0.5rem + + +//- Embeds + +.o-chart + max-width: 100% + +.cp_embed_iframe + border: 1px solid $color-subtle + border-radius: $border-radius + + +//- Form fields + +.o-field + background: $color-back + padding: 0 0.25em + border-radius: 2em + border: 1px solid $color-subtle + margin-bottom: 0.25rem + +.o-field__input, +.o-field__button + padding: 0 0.35em + +.o-field__input + width: 100% + +.o-field__select + background: transparent + color: $color-dark + height: 1.4em + border: none + text-align-last: center + width: 100% diff --git a/website/assets/css/_base/_reset.sass b/website/assets/css/_base/_reset.sass index 1d9d9ffbe..0ff1432d0 100644 --- a/website/assets/css/_base/_reset.sass +++ b/website/assets/css/_base/_reset.sass @@ -1,6 +1,6 @@ //- 💫 CSS > BASE > RESET -* +*, *:before, *:after box-sizing: border-box padding: 0 margin: 0 @@ -94,7 +94,10 @@ ul, ol input, button appearance: none + background: transparent button - background: transparent cursor: pointer + +progress + appearance: none diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index 46c3e84d9..8c1e82706 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -2,38 +2,53 @@ //- Text +.u-text, +.u-text-small, +.u-text-tiny + font-family: $font-primary + .u-text - font: 1.5rem/#{1.55} $font-primary + font-size: 1.35rem + line-height: 1.5 .u-text-small - font: 1.4rem/#{1.375} $font-primary + font-size: 1.3rem + line-height: 1.375 .u-text-tiny - font: 1.1rem/#{1.375} $font-primary - + font-size: 1.1rem + line-height: 1.375 //- Labels & Tags .u-text-label - font: normal 600 1.4rem/#{1.5} $font-code + font: normal 600 1.4rem/#{1.5} $font-secondary text-transform: uppercase + &.u-text-label--light, &.u-text-label--dark display: inline-block + border-radius: 1em + padding: 0 1rem 0.15rem + + &.u-text-label--dark background: $color-dark box-shadow: inset 1px 1px 1px rgba($color-front, 0.25) color: $color-back - padding: 0 0.75rem margin: 1.5rem 0 0 2rem - border-radius: 2px + + &.u-text-label--light + background: $color-back + color: $color-theme + margin-bottom: 1rem .u-text-tag display: inline-block - font: 600 1.1rem/#{1} $font-code + font: 600 1.1rem/#{1} $font-secondary background: $color-theme color: $color-back - padding: 0.15em 0.25em - border-radius: 2px + padding: 0.15em 0.5em 0.35em + border-radius: 1em text-transform: uppercase vertical-align: middle @@ -45,7 +60,7 @@ //- Headings .u-heading - margin-bottom: 2rem + margin-bottom: 1em @include breakpoint(max, md) word-wrap: break-word @@ -53,12 +68,29 @@ &:not(:first-child) padding-top: 3.5rem + 
&.u-heading--title:after + content: "" + display: block + width: 10% + min-width: 6rem + height: 6px + background: $color-theme + margin-top: 3rem + .u-heading-0 - font: normal bold 7rem/#{1} $font-primary + font: normal 600 7rem/#{1} $font-secondary + + @include breakpoint(max, sm) + font-size: 6rem + @each $level, $size in $headings .u-heading-#{$level} - font: normal bold #{$size}rem/#{1.25} $font-primary + font: normal 500 #{$size}rem/#{1.1} $font-secondary + +.u-heading__teaser + margin-top: 2rem + font-weight: normal //- Links @@ -66,31 +98,62 @@ .u-link color: $color-theme border-bottom: 1px solid + transition: color 0.2s ease + + &:hover + color: $color-theme-dark + +.u-hide-link.u-hide-link + border: none + color: inherit + + &:hover + color: inherit .u-permalink position: relative + &:before + content: "\00b6" + font-size: 0.9em + font-weight: normal + color: $color-subtle + @include position(absolute, top, left, 0.15em, -2.85rem) + opacity: 0 + transition: opacity 0.2s ease + + &:hover:before + opacity: 1 + + &:active:before + color: $color-theme + &:target display: inline-block - padding-top: $nav-height * 1.25 - & + * - margin-top: $nav-height * 1.25 + &:before + bottom: 0.15em + top: initial -.u-permalink__icon - @include position(absolute, bottom, left, 0.35em, -2.75rem) - @include size(1.5rem) - color: $color-subtle - .u-permalink:hover & - color: $color-subtle-dark +[id]:target + padding-top: $nav-height * 1.25 - .u-permalink:active & - color: $color-theme //- Layout +.u-width-full + width: 100% + +.u-float-left + float: left + margin-right: 1rem + +.u-float-right + float: right + margin-left: 1rem + .u-text-center text-align: center @@ -104,18 +167,30 @@ padding: 0.5em 0.75em .u-padding-medium - padding: 2.5rem + padding: 1.8rem + +.u-padding-top + padding-top: 2rem .u-inline-block display: inline-block +.u-flex-full + flex: 1 + .u-nowrap white-space: nowrap +.u-wrap + white-space: pre-wrap + .u-break.u-break word-wrap: break-word white-space: initial + &.u-break--all + word-break: break-all + .u-no-border border: none @@ -123,13 +198,10 @@ border: 1px solid $color-subtle border-radius: 2px -.u-border-bottom - border: 1px solid $color-subtle - .u-border-dotted - border-top: 1px dotted $color-subtle + border-bottom: 1px dotted $color-subtle -@each $name, $color in (theme: $color-theme, subtle: $color-subtle-dark, light: $color-back, red: $color-red, green: $color-green, yellow: $color-yellow) +@each $name, $color in (theme: $color-theme, dark: $color-dark, subtle: $color-subtle-dark, light: $color-back, red: $color-red, green: $color-green, yellow: $color-yellow) .u-color-#{$name} color: $color @@ -145,6 +217,32 @@ background: $pattern +//- Loaders + +.u-loading, +[data-loading] + $spinner-size: 75px + $spinner-bar: 8px + + position: relative + + & > * + opacity: 0.35 + + &:before + @include position(absolute, top, left, 0, 0) + @include size($spinner-size) + right: 0 + bottom: 0 + margin: auto + content: "" + border: $spinner-bar solid $color-subtle + border-right: $spinner-bar solid $color-theme + border-radius: 50% + animation: rotate 1s linear infinite + z-index: 10 + + //- Hidden elements .u-hidden diff --git a/website/assets/css/_components/_asides.sass b/website/assets/css/_components/_asides.sass index d5b5c64e3..c59590c29 100644 --- a/website/assets/css/_components/_asides.sass +++ b/website/assets/css/_components/_asides.sass @@ -10,6 +10,8 @@ .c-aside__content background: $color-front + border-top-left-radius: $border-radius + border-bottom-left-radius: 
$border-radius z-index: 10 @include breakpoint(min, md) @@ -21,12 +23,12 @@ &:after $triangle-size: 2rem - @include position(absolute, bottom, left, -$triangle-size / 2, 0) + @include position(absolute, bottom, left, -$triangle-size / 2, $border-radius / 2) @include size(0) border-color: transparent border-style: solid border-top-color: $color-dark - border-width: $triangle-size / 2 0 0 $triangle-size + border-width: $triangle-size / 2 0 0 calc(#{$triangle-size} - #{$border-radius / 2}) content: "" @include breakpoint(max, sm) diff --git a/website/assets/css/_components/_buttons.sass b/website/assets/css/_components/_buttons.sass index f753e15bf..d3ff4b037 100644 --- a/website/assets/css/_components/_buttons.sass +++ b/website/assets/css/_components/_buttons.sass @@ -3,23 +3,50 @@ .c-button display: inline-block font-weight: bold - padding: 0.75em 1em + padding: 0.8em 1.1em 1em margin-bottom: 1px - border: 2px solid - border-radius: 2px + border: 2px solid $color-theme + border-radius: 2em text-align: center - transition: background 0.25s ease + transition: background-color, color 0.25s ease + + &:hover + border-color: $color-theme-dark + + &.c-button--small + font-size: 1.1rem + padding: 0.65rem 1.1rem 0.825rem &.c-button--primary background: $color-theme color: $color-back - border-color: $color-theme &:hover background: $color-theme-dark - border-color: $color-theme-dark &.c-button--secondary background: $color-back color: $color-theme - border-color: $color-theme + + &:hover + color: $color-theme-dark + + &.c-button--secondary-light + background: transparent + color: $color-back + border-color: $color-back + +.c-icon-button + @include size(35px) + background: $color-subtle-light + color: $color-subtle-dark + border-radius: 50% + padding: 0.5rem + transition: color 0.2s ease + + &:hover + color: $color-theme + + &.c-icon-button--right + float: right + margin-left: 3rem diff --git a/website/assets/css/_components/_chat.sass b/website/assets/css/_components/_chat.sass index 2a1e5cc3d..659f80364 100644 --- a/website/assets/css/_components/_chat.sass +++ b/website/assets/css/_components/_chat.sass @@ -24,9 +24,9 @@ transform: translateX(110%) &:before - @include position(absolute, top, left, 1rem, 2rem) + @include position(absolute, top, left, 1.25rem, 2rem) content: attr(data-title) - font: bold 1.4rem $font-code + font: bold 1.4rem $font-secondary text-transform: uppercase color: $color-back @@ -88,13 +88,18 @@ background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0Ij48cGF0aCBmaWxsPSIjZmZmIiBkPSJNMTguOTg0IDYuNDIybC01LjU3OCA1LjU3OCA1LjU3OCA1LjU3OC0xLjQwNiAxLjQwNi01LjU3OC01LjU3OC01LjU3OCA1LjU3OC0xLjQwNi0xLjQwNiA1LjU3OC01LjU3OC01LjU3OC01LjU3OCAxLjQwNi0xLjQwNiA1LjU3OCA1LjU3OCA1LjU3OC01LjU3OHoiPjwvcGF0aD48L3N2Zz4=) .c-chat__button - @include position(fixed, bottom, right, 0, 2rem) - padding: 1rem 1.5rem - background: $color-front + @include position(fixed, bottom, right, 1.5rem, 1.5rem) + z-index: 5 color: $color-back - border-top-left-radius: 4px - border-top-right-radius: 4px - z-index: 20 - border-color: $color-theme - border-style: solid - border-width: 1px 1px 0 1px + background: $color-front + border-radius: 1em + padding: 0.5rem 1.15rem 0.35rem + opacity: 0.7 + transition: opacity 0.2s ease + + &:hover + opacity: 1 + + +.gitter-open-chat-button + display: none diff --git a/website/assets/css/_components/_code.sass b/website/assets/css/_components/_code.sass index 036c5358f..eaf0980e1 
100644 --- a/website/assets/css/_components/_code.sass +++ b/website/assets/css/_components/_code.sass @@ -4,9 +4,9 @@ .c-code-block background: $color-front - color: $color-back + color: darken($color-back, 20) padding: 0.75em 0 - border-radius: 2px + border-radius: $border-radius overflow: auto width: 100% max-width: 100% @@ -16,6 +16,8 @@ &.c-code-block--has-icon padding: 0 display: flex + border-top-left-radius: 0 + border-bottom-left-radius: 0 .c-code-block__icon padding: 0 0 0 1rem @@ -43,17 +45,25 @@ opacity: 0.5 +//- Code + +code + -webkit-font-smoothing: subpixel-antialiased + -moz-osx-font-smoothing: auto + + //- Inline code +*:not(a):not(.c-code-block) > code + color: $color-dark + *:not(.c-code-block) > code - font: normal 600 0.8em/#{1} $font-code - background: darken($color-theme-light, 5) - box-shadow: 1px 1px 0 rgba($color-front, 0.05) - text-shadow: 1px 1px 0 rgba($color-back, 0.5) - color: $color-front - padding: 0.1em 0.5em + font-size: 90% + background-color: $color-subtle-light + padding: 0.2rem 0.4rem + border-radius: 0.25rem + font-family: $font-code margin: 0 - border-radius: 1px box-decoration-break: clone white-space: nowrap diff --git a/website/assets/css/_components/_landing.sass b/website/assets/css/_components/_landing.sass index af1521d10..4c15e4a39 100644 --- a/website/assets/css/_components/_landing.sass +++ b/website/assets/css/_components/_landing.sass @@ -2,12 +2,11 @@ .c-landing background: $color-theme - padding-top: 5rem + padding-top: $nav-height * 1.5 width: 100% .c-landing__wrapper background: $pattern - padding-bottom: 6rem width: 100% .c-landing__content @@ -15,9 +14,45 @@ width: 100% min-height: 573px +.c-landing__headlines + position: relative + top: -1.5rem + left: 1rem + .c-landing__title color: $color-back text-align: center + margin-bottom: 0.75rem + +.c-landing__blocks + @include breakpoint(min, sm) + position: relative + top: -25rem + margin-bottom: -25rem + +.c-landing__card + padding: 3rem 2.5rem + +.c-landing__banner + background: $color-theme + +.c-landing__banner__content + @include breakpoint(min, md) + border: 4px solid + padding: 1rem 6.5rem 2rem 4rem + + +.c-landing__banner__text + font-weight: 500 + + strong + font-weight: 800 + + p + font-size: 1.5rem + + @include breakpoint(min, md) + padding-top: 7rem .c-landing__badge transform: rotate(7deg) diff --git a/website/assets/css/_components/_lists.sass b/website/assets/css/_components/_lists.sass index 48a5e92c8..553af6578 100644 --- a/website/assets/css/_components/_lists.sass +++ b/website/assets/css/_components/_lists.sass @@ -9,6 +9,8 @@ .c-list__item:before content: counter(li, #{$counter}) '.' 
+ font-size: 1em + padding-right: 1rem //- List Item @@ -21,13 +23,14 @@ &:before content: '\25CF' display: inline-block - font-size: 1em + font-size: 0.6em font-weight: bold - padding-right: 1.25rem + padding-right: 1em margin-left: -3.75rem text-align: right width: 2.5rem counter-increment: li + box-sizing: content-box //- List icon diff --git a/website/assets/css/_components/_misc.sass b/website/assets/css/_components/_misc.sass index 3bd9bd6b6..8167c94b2 100644 --- a/website/assets/css/_components/_misc.sass +++ b/website/assets/css/_components/_misc.sass @@ -3,9 +3,8 @@ .x-terminal background: $color-subtle-light color: $color-front - padding: 4px - border: 1px dotted $color-subtle - border-radius: 5px + padding: $border-radius + border-radius: 1em width: 100% .x-terminal__icons diff --git a/website/assets/css/_components/_navigation.sass b/website/assets/css/_components/_navigation.sass index 5b7275f92..2f1cfb6e3 100644 --- a/website/assets/css/_components/_navigation.sass +++ b/website/assets/css/_components/_navigation.sass @@ -1,22 +1,18 @@ //- 💫 CSS > COMPONENTS > NAVIGATION .c-nav - @include position(absolute, top, left, 0, 0) + @include position(fixed, top, left, 0, 0) @include size(100%, $nav-height) background: $color-back color: $color-theme align-items: center display: flex justify-content: space-between + flex-flow: row nowrap padding: 0 2rem 0 1rem - z-index: 20 + z-index: 30 width: 100% - border-bottom: 1px solid $color-subtle - - &.c-nav--theme - background: $color-theme - color: $color-back - border-bottom: none + box-shadow: $box-shadow &.is-fixed animation: slideInDown 0.5s ease-in-out @@ -28,12 +24,21 @@ justify-content: flex-end flex-flow: row nowrap border-color: inherit + flex: 1 .c-nav__menu__item display: flex align-items: center height: 100% text-transform: uppercase + font-family: $font-secondary + font-size: 1.6rem + font-weight: bold + color: $color-theme - &:not(:last-child) - margin-right: 1em + &:not(:first-child) + margin-left: 2em + + &.is-active + color: $color-dark + pointer-events: none diff --git a/website/assets/css/_components/_progress.sass b/website/assets/css/_components/_progress.sass new file mode 100644 index 000000000..bbab0fddd --- /dev/null +++ b/website/assets/css/_components/_progress.sass @@ -0,0 +1,24 @@ +//- 💫 CSS > COMPONENTS > PROGRESS + +.c-progress + display: block + flex: 105% + width: 105% + height: 3px + color: $color-theme + background: transparent + border: none + position: absolute + bottom: 0 + left: -2.5% + + &::-webkit-progress-bar + background: $color-back + border-radius: none + + &::-webkit-progress-value + background: $color-theme + border-radius: none + + &::-moz-progress-bar + background: $color-theme diff --git a/website/assets/css/_components/_quickstart.sass b/website/assets/css/_components/_quickstart.sass index 1e7d0761a..6b02b3128 100644 --- a/website/assets/css/_components/_quickstart.sass +++ b/website/assets/css/_components/_quickstart.sass @@ -1,14 +1,17 @@ //- 💫 CSS > COMPONENTS > QUICKSTART .c-quickstart - border: 1px solid $color-subtle - border-radius: 2px + border-radius: $border-radius display: none background: $color-subtle-light &:not([style]) + .c-quickstart__info display: none + .c-code-block + border-top-left-radius: 0 + border-top-right-radius: 0 + .c-quickstart__content padding: 2rem 3rem @@ -72,7 +75,6 @@ flex: 100% .c-quickstart__legend - color: $color-subtle-dark margin-right: 2rem padding-top: 0.75rem flex: 1 1 35% @@ -95,4 +97,4 @@ padding: 1.5rem 0 .c-quickstart__code - font-size: 
1.6rem + font-size: 1.4rem diff --git a/website/assets/css/_components/_sidebar.sass b/website/assets/css/_components/_sidebar.sass index d88588341..be3e34147 100644 --- a/website/assets/css/_components/_sidebar.sass +++ b/website/assets/css/_components/_sidebar.sass @@ -3,16 +3,15 @@ //- Sidebar container .c-sidebar - background: $color-subtle-light overflow-y: auto @include breakpoint(min, md) @include position(fixed, top, left, 0, 0) - @include size($sidebar-width, 100vh) + @include size($sidebar-width, calc(100vh - 3px)) + @include scroll-shadow($color-back, $color-front, $nav-height) flex: 0 0 $sidebar-width padding: calc(#{$nav-height} + 1.5rem) 0 0 z-index: 10 - border-right: 1px solid $color-subtle @include breakpoint(max, sm) flex: 100% @@ -27,7 +26,7 @@ .c-sidebar__section & > * - padding: 0 2rem + padding: 0 2rem 0.35rem @include breakpoint(max, sm) flex: 1 1 0 @@ -38,7 +37,59 @@ &:not(:last-child) border-right: 1px solid $color-subtle - .is-active +.c-sidebar__item + color: $color-theme + + &:hover + color: $color-theme-dark + + & > .is-active font-weight: bold - color: $color-theme - background: rgba($color-subtle, 0.4) + color: $color-dark + margin-top: 1rem + + +//- Sidebar subsections + +$crumb-bullet: 14px +$crumb-bar: 2px + +.c-sidebar__crumb + display: block + padding-top: 1rem + padding-left: 1rem + position: relative + +.c-sidebar__crumb__item + margin-bottom: $crumb-bullet / 2 + position: relative + padding-left: 2rem + color: $color-theme + font-size: 1.2rem + + &:hover + color: $color-theme-dark + + &:after + @include size($crumb-bullet) + @include position(absolute, top, left, $crumb-bullet / 4, 0) + content: "" + border-radius: 50% + background: $color-theme + z-index: 10 + + &:not(:last-child):before + @include size($crumb-bar, 100%) + @include position(absolute, top, left, $crumb-bullet, ($crumb-bullet - $crumb-bar) / 2) + content: "" + background: $color-subtle + + &:first-child:before + height: calc(100% + #{$crumb-bullet * 2}) + top: -$crumb-bullet / 2 + + &.is-active + color: $color-dark + + &:after + background: $color-dark diff --git a/website/assets/css/_components/_tables.sass b/website/assets/css/_components/_tables.sass index cbc861803..99ae998ff 100644 --- a/website/assets/css/_components/_tables.sass +++ b/website/assets/css/_components/_tables.sass @@ -9,7 +9,7 @@ //- Table row .c-table__row - &:nth-child(odd) + &:nth-child(odd):not(.c-table__row--head) background: rgba($color-subtle-light, 0.35) &.c-table__row--foot @@ -38,7 +38,6 @@ .c-table__head-cell font-weight: bold color: $color-theme - background: $color-back padding: 1rem 0.5rem border-bottom: 2px solid $color-theme @@ -52,6 +51,7 @@ @include scroll-shadow-base($color-front) display: inline-block overflow-x: auto + overflow-y: hidden width: auto -webkit-overflow-scrolling: touch @@ -63,9 +63,15 @@ &:last-child @include scroll-shadow-cover(right, $color-back) + &:first-child:last-child + @include scroll-shadow-cover(both, $color-back) + .c-table__row--foot .c-table__cell &:first-child @include scroll-shadow-cover(left, lighten($color-subtle-light, 2)) &:last-child @include scroll-shadow-cover(right, lighten($color-subtle-light, 2)) + + &:first-child:last-child + @include scroll-shadow-cover(both, lighten($color-subtle-light, 2)) diff --git a/website/assets/css/_components/_tooltips.sass b/website/assets/css/_components/_tooltips.sass index e68f2875c..f9284dcdb 100644 --- a/website/assets/css/_components/_tooltips.sass +++ b/website/assets/css/_components/_tooltips.sass @@ -4,24 +4,34 @@ 
position: relative @include breakpoint(min, sm) + &[data-tooltip-style="code"]:before + -webkit-font-smoothing: subpixel-antialiased + -moz-osx-font-smoothing: auto + padding: 0.35em 0.85em 0.45em + font: normal 1rem/#{1.25} $font-code + white-space: nowrap + min-width: auto + &:before @include position(absolute, top, left, 125%, 50%) display: inline-block content: attr(data-tooltip) background: $color-front - border-radius: 2px + border-radius: $border-radius border: 1px solid rgba($color-subtle-dark, 0.5) color: $color-back - font: normal 1.3rem/#{1.25} $font-primary + font: normal 1.2rem/#{1.25} $font-primary text-transform: none + text-align: left opacity: 0 - padding: 0.5em 0.75em transform: translateX(-50%) translateY(-2px) transition: opacity 0.1s ease-out, transform 0.1s ease-out visibility: hidden - min-width: 200px max-width: 300px + min-width: 200px + padding: 0.75em 1em 1em z-index: 200 + white-space: pre-wrap &:hover:before opacity: 1 diff --git a/website/assets/css/_mixins.sass b/website/assets/css/_mixins.sass index e7e7a3432..d1ea9c5d5 100644 --- a/website/assets/css/_mixins.sass +++ b/website/assets/css/_mixins.sass @@ -42,19 +42,39 @@ // $scroll-shadow-side - side to cover shadow (left or right) // $scroll-shadow-background - original background color to match -@mixin scroll-shadow-base($scroll-shadow-color) - background: radial-gradient(left, ellipse, rgba(0,0,0, .2) 0%, rgba(0,0,0, 0) 75%) 0 center, radial-gradient(right, ellipse, rgba(0,0,0, .2) 0%, rgba(0,0,0, 0) 75%) 100% center +@function scroll-shadow-gradient($scroll-gradient-direction, $scroll-shadow-background) + @return linear-gradient(to #{$scroll-gradient-direction}, rgba($scroll-shadow-background, 1) 50%, rgba($scroll-shadow-background, 0) 100%) + +@mixin scroll-shadow-base($scroll-shadow-color, $scroll-shadow-intensity: 0.2) + background: radial-gradient(ellipse at 0 50%, rgba($scroll-shadow-color, $scroll-shadow-intensity) 0%, rgba(0,0,0,0) 75%) 0 center, radial-gradient(ellipse at 100% 50%, rgba($scroll-shadow-color, $scroll-shadow-intensity) 0%, transparent 75%) 100% center background-attachment: scroll, scroll background-repeat: no-repeat background-size: 10px 100%, 10px 100% @mixin scroll-shadow-cover($scroll-shadow-side, $scroll-shadow-background) $scroll-gradient-direction: right !default + background-repeat: no-repeat @if $scroll-shadow-side == right $scroll-gradient-direction: left background-position: 100% 0 - background-image: linear-gradient(to #{$scroll-gradient-direction}, rgba($scroll-shadow-background, 1) 50%, rgba($scroll-shadow-background, 0) 100%) + @if $scroll-shadow-side == both + background-image: scroll-shadow-gradient(left, $scroll-shadow-background), scroll-shadow-gradient(right, $scroll-shadow-background) + background-position: 100% 0, 0 0 + background-size: 20px 100%, 20px 100% + @else + background-image: scroll-shadow-gradient($scroll-gradient-direction, $scroll-shadow-background) + background-size: 20px 100% + +// Full vertical scroll shadows +// adapted from: https://codepen.io/laustdeleuran/pen/DBaAu + +@mixin scroll-shadow($background-color, $shadow-color, $shadow-offset: 0, $shadow-intensity: 0.4, $cover-size: 40px, $shadow-size: 15px) + background: linear-gradient($background-color 30%, rgba($background-color,0)) 0 $shadow-offset, linear-gradient(rgba($background-color,0), $background-color 70%) 0 100%, radial-gradient(50% 0, farthest-side, rgba($shadow-color,$shadow-intensity), rgba($shadow-color,0)) 0 $shadow-offset, radial-gradient(50% 100%,farthest-side, 
rgba($shadow-color,$shadow-intensity), rgba($shadow-color,0)) 0 100% + + background: linear-gradient($background-color 30%, rgba($background-color,0)) 0 $shadow-offset, linear-gradient(rgba($background-color,0), $background-color 70%) 0 100%, radial-gradient(farthest-side at 50% 0, rgba($shadow-color,$shadow-intensity), rgba($shadow-color,0)) -20px $shadow-offset, radial-gradient(farthest-side at 50% 100%, rgba($shadow-color, $shadow-intensity), rgba($shadow-color,0)) 0 100% background-repeat: no-repeat - background-size: 20px 100% + background-color: $background-color + background-size: 100% $cover-size, 100% $cover-size, 100% $shadow-size, 100% $shadow-size + background-attachment: local, local, scroll, scroll diff --git a/website/assets/css/_variables.sass b/website/assets/css/_variables.sass index 3ccf36f06..fbceb5a6f 100644 --- a/website/assets/css/_variables.sass +++ b/website/assets/css/_variables.sass @@ -4,47 +4,48 @@ $type-base: 11px -$nav-height: 45px +$nav-height: 55px $content-width: 1250px -$sidebar-width: 200px -$aside-width: 30vw +$sidebar-width: 235px +$aside-width: 27.5vw $aside-padding: 25px +$border-radius: 6px $logo-width: 85px $logo-height: 27px $grid: ( quarter: 4, third: 3, half: 2, two-thirds: 1.5, three-quarters: 1.33 ) $breakpoints: ( sm: 768px, md: 992px, lg: 1200px ) -$headings: (1: 3, 2: 2.6, 3: 2, 4: 1.8, 5: 1.5) - +$headings: (1: 4.4, 2: 3.4, 3: 2.6, 4: 2.2, 5: 1.8) // Fonts -$font-primary: "Source Sans Pro", Tahoma, Geneva, sans-serif !default -$font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier, monospace !default - +$font-primary: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol" !default +$font-secondary: "HK Grotesk", Roboto, Helvetica, Arial, sans-serif !default +$font-code: Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !default // Colors -$colors: ( blue: #09a3d5, red: #d9515d, green: #08c35e ) +$colors: ( blue: #09a3d5, green: #05b083 ) $color-back: #fff !default $color-front: #1a1e23 !default $color-dark: lighten($color-front, 20) !default $color-theme: map-get($colors, $theme) -$color-theme-dark: darken(map-get($colors, $theme), 5) +$color-theme-dark: darken(map-get($colors, $theme), 10) $color-theme-light: rgba($color-theme, 0.05) $color-subtle: #ddd !default $color-subtle-light: #f6f6f6 !default $color-subtle-dark: #949e9b !default -$color-red: #d9515d -$color-green: #3ec930 +$color-red: #ef476f +$color-green: #7ddf64 $color-yellow: #f4c025 $syntax-highlighting: ( comment: #949e9b, tag: #b084eb, number: #b084eb, selector: #ffb86c, operator: #ff2c6d, function: #35b3dc, keyword: #ff2c6d, regex: #f4c025 ) $pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat $pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat +$box-shadow: 0 1px 5px rgba(0, 0, 0, 0.2) diff --git a/website/assets/css/style.sass b/website/assets/css/style.sass index eaf7cdf70..47cf3f1b5 100644 --- a/website/assets/css/style.sass +++ b/website/assets/css/style.sass @@ -30,6 +30,7 @@ $theme: blue !default @import _components/lists @import _components/misc @import _components/navigation +@import _components/progress @import _components/sidebar @import _components/tables @import _components/quickstart diff --git a/website/assets/css/style_red.sass b/website/assets/css/style_red.sass deleted file mode 100644 index 83fe330b9..000000000 --- a/website/assets/css/style_red.sass +++ 
/dev/null
@@ -1,4 +0,0 @@
-//- 💫 STYLESHEET (RED)
-
-$theme: red
-@import style
diff --git a/website/assets/fonts/hkgrotesk-bold.woff b/website/assets/fonts/hkgrotesk-bold.woff
new file mode 100755
index 000000000..41e8651c3
Binary files /dev/null and b/website/assets/fonts/hkgrotesk-bold.woff differ
diff --git a/website/assets/fonts/hkgrotesk-bold.woff2 b/website/assets/fonts/hkgrotesk-bold.woff2
new file mode 100755
index 000000000..1967e7825
Binary files /dev/null and b/website/assets/fonts/hkgrotesk-bold.woff2 differ
diff --git a/website/assets/fonts/hkgrotesk-bolditalic.woff b/website/assets/fonts/hkgrotesk-bolditalic.woff
new file mode 100755
index 000000000..dbb8e57ee
Binary files /dev/null and b/website/assets/fonts/hkgrotesk-bolditalic.woff differ
diff --git a/website/assets/fonts/hkgrotesk-bolditalic.woff2 b/website/assets/fonts/hkgrotesk-bolditalic.woff2
new file mode 100755
index 000000000..6e037d731
Binary files /dev/null and b/website/assets/fonts/hkgrotesk-bolditalic.woff2 differ
diff --git a/website/assets/fonts/hkgrotesk-semibold.woff b/website/assets/fonts/hkgrotesk-semibold.woff
new file mode 100755
index 000000000..ae0ff2dc7
Binary files /dev/null and b/website/assets/fonts/hkgrotesk-semibold.woff differ
diff --git a/website/assets/fonts/hkgrotesk-semibold.woff2 b/website/assets/fonts/hkgrotesk-semibold.woff2
new file mode 100755
index 000000000..6c52d9245
Binary files /dev/null and b/website/assets/fonts/hkgrotesk-semibold.woff2 differ
diff --git a/website/assets/fonts/hkgrotesk-semibolditalic.woff b/website/assets/fonts/hkgrotesk-semibolditalic.woff
new file mode 100755
index 000000000..3fce4cf35
Binary files /dev/null and b/website/assets/fonts/hkgrotesk-semibolditalic.woff differ
diff --git a/website/assets/fonts/hkgrotesk-semibolditalic.woff2 b/website/assets/fonts/hkgrotesk-semibolditalic.woff2
new file mode 100755
index 000000000..fc0ed71ee
Binary files /dev/null and b/website/assets/fonts/hkgrotesk-semibolditalic.woff2 differ
diff --git a/website/assets/fonts/sourcecodepro-semibold.eot b/website/assets/fonts/sourcecodepro-semibold.eot
deleted file mode 100644
index 4ad47de2e..000000000
Binary files a/website/assets/fonts/sourcecodepro-semibold.eot and /dev/null differ
diff --git a/website/assets/fonts/sourcecodepro-semibold.svg b/website/assets/fonts/sourcecodepro-semibold.svg
deleted file mode 100644
index 0515c3ad5..000000000
--- a/website/assets/fonts/sourcecodepro-semibold.svg
+++ /dev/null
@@ -1,244 +0,0 @@
[244 deleted lines of SVG font glyph data omitted]
\ No newline at end of file
diff --git a/website/assets/fonts/sourcecodepro-semibold.ttf b/website/assets/fonts/sourcecodepro-semibold.ttf
deleted file mode 100644
index ff2206e58..000000000
Binary files a/website/assets/fonts/sourcecodepro-semibold.ttf and /dev/null differ
diff --git a/website/assets/fonts/sourcecodepro-semibold.woff b/website/assets/fonts/sourcecodepro-semibold.woff
deleted file mode 100644
index 83cd2c3ea..000000000
Binary files a/website/assets/fonts/sourcecodepro-semibold.woff and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-bold.eot b/website/assets/fonts/sourcesanspro-bold.eot
deleted file mode 100644
index b3b60be2d..000000000
Binary files a/website/assets/fonts/sourcesanspro-bold.eot and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-bold.svg b/website/assets/fonts/sourcesanspro-bold.svg
deleted file mode 100644
index 94efdcbe5..000000000
--- a/website/assets/fonts/sourcesanspro-bold.svg
+++ /dev/null
@@ -1,1031 +0,0 @@
[1031 deleted lines of SVG font glyph data omitted]
\ No newline at end of file
diff --git a/website/assets/fonts/sourcesanspro-bold.ttf b/website/assets/fonts/sourcesanspro-bold.ttf
deleted file mode 100644
index 4619eef6b..000000000
Binary files a/website/assets/fonts/sourcesanspro-bold.ttf and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-bold.woff b/website/assets/fonts/sourcesanspro-bold.woff
deleted file mode 100644
index 3257aeddf..000000000
Binary files a/website/assets/fonts/sourcesanspro-bold.woff and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-bold.woff2 b/website/assets/fonts/sourcesanspro-bold.woff2
deleted file mode 100644
index 42b02574f..000000000
Binary files a/website/assets/fonts/sourcesanspro-bold.woff2 and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-bolditalic.eot b/website/assets/fonts/sourcesanspro-bolditalic.eot
deleted file mode 100644
index da1580939..000000000
Binary files a/website/assets/fonts/sourcesanspro-bolditalic.eot and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-bolditalic.svg b/website/assets/fonts/sourcesanspro-bolditalic.svg
deleted file mode 100644
index aa37571dd..000000000
--- a/website/assets/fonts/sourcesanspro-bolditalic.svg
+++ /dev/null
@@ -1,840 +0,0 @@
[840 deleted lines of SVG font glyph data omitted]
\ No newline at end of file
diff --git a/website/assets/fonts/sourcesanspro-bolditalic.ttf b/website/assets/fonts/sourcesanspro-bolditalic.ttf
deleted file mode 100644
index ae8b08166..000000000
Binary files a/website/assets/fonts/sourcesanspro-bolditalic.ttf and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-bolditalic.woff b/website/assets/fonts/sourcesanspro-bolditalic.woff
deleted file mode 100644
index 3ac22abd8..000000000
Binary files a/website/assets/fonts/sourcesanspro-bolditalic.woff and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-bolditalic.woff2 b/website/assets/fonts/sourcesanspro-bolditalic.woff2
deleted file mode 100644
index 629413ac6..000000000
Binary files a/website/assets/fonts/sourcesanspro-bolditalic.woff2 and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-italic.eot b/website/assets/fonts/sourcesanspro-italic.eot
deleted file mode 100644
index a5d050e75..000000000
Binary files a/website/assets/fonts/sourcesanspro-italic.eot and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-italic.svg b/website/assets/fonts/sourcesanspro-italic.svg
deleted file mode 100644
index bf0f85da9..000000000
--- a/website/assets/fonts/sourcesanspro-italic.svg
+++ /dev/null
@@ -1,852 +0,0 @@
[852 deleted lines of SVG font glyph data omitted]
\ No newline at end of file
diff --git a/website/assets/fonts/sourcesanspro-italic.ttf b/website/assets/fonts/sourcesanspro-italic.ttf
deleted file mode 100644
index f17a12856..000000000
Binary files a/website/assets/fonts/sourcesanspro-italic.ttf and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-italic.woff b/website/assets/fonts/sourcesanspro-italic.woff
deleted file mode 100644
index 32c1e1962..000000000
Binary files a/website/assets/fonts/sourcesanspro-italic.woff and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-italic.woff2 b/website/assets/fonts/sourcesanspro-italic.woff2
deleted file mode 100644
index c3d399f8c..000000000
Binary files a/website/assets/fonts/sourcesanspro-italic.woff2 and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-regular.eot b/website/assets/fonts/sourcesanspro-regular.eot
deleted file mode 100644
index 6e98cf79a..000000000
Binary files a/website/assets/fonts/sourcesanspro-regular.eot and /dev/null differ
diff --git a/website/assets/fonts/sourcesanspro-regular.svg b/website/assets/fonts/sourcesanspro-regular.svg
deleted file mode 100644
index 27d435ad9..000000000
--- a/website/assets/fonts/sourcesanspro-regular.svg
+++ /dev/null
@@ -1,1039 +0,0 @@
[1039 deleted lines of SVG font glyph data omitted]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/website/assets/fonts/sourcesanspro-regular.ttf b/website/assets/fonts/sourcesanspro-regular.ttf deleted file mode 100644 index 0bb505790..000000000 Binary files a/website/assets/fonts/sourcesanspro-regular.ttf and /dev/null differ diff --git a/website/assets/fonts/sourcesanspro-regular.woff b/website/assets/fonts/sourcesanspro-regular.woff deleted file mode 100644 index aa0503cac..000000000 Binary files a/website/assets/fonts/sourcesanspro-regular.woff and /dev/null differ diff --git a/website/assets/fonts/sourcesanspro-regular.woff2 b/website/assets/fonts/sourcesanspro-regular.woff2 deleted file mode 100644 index 06206e483..000000000 Binary files a/website/assets/fonts/sourcesanspro-regular.woff2 and /dev/null differ diff --git a/website/assets/img/docs/architecture.svg b/website/assets/img/architecture.svg similarity index 91% rename from website/assets/img/docs/architecture.svg rename to website/assets/img/architecture.svg index c1d12d79b..911aaec60 100644 --- a/website/assets/img/docs/architecture.svg +++ b/website/assets/img/architecture.svg @@ -1,9 +1,13 @@ Language @@ -14,37 +18,37 @@ - nlp.vocab.morphology + nlp.vocab.morphology Vocab - nlp.vocab + nlp.vocab StringStore - nlp.vocab.strings + nlp.vocab.strings - nlp.tokenizer.vocab + nlp.tokenizer.vocab Tokenizer - nlp.make_doc() + nlp.make_doc() - nlp.pipeline + nlp.pipeline - nlp.pipeline[i].vocab + nlp.pipeline[i].vocab pt @@ -80,7 +84,7 @@ - doc.vocab + doc.vocab @@ -94,7 +98,7 @@ - token.doc + token.doc Token @@ -102,7 +106,7 @@ - lexeme.vocab + lexeme.vocab Lexeme @@ -112,7 +116,7 @@ - span.doc + span.doc Dependency Parser diff --git a/website/assets/img/docs/displacy_jupyter.jpg b/website/assets/img/displacy_jupyter.jpg similarity index 100% rename from website/assets/img/docs/displacy_jupyter.jpg rename to website/assets/img/displacy_jupyter.jpg diff --git a/website/assets/img/graphics.svg b/website/assets/img/graphics.svg deleted file mode 100644 index a449c3d04..000000000 --- a/website/assets/img/graphics.svg +++ /dev/null @@ -1,84 +0,0 @@ - - - - spaCy v2.0.0 alpha - - - - - - - - - - - spaCy user survey 2017 - - - - - - - - - - - - brain - - - - - - - computer - - - - - - - - - - eye - - - - - - - - - - - - - - bubble - - - - - - - - - - - - spacy - - - - - explosion - - - - - matt-signature - - - - diff --git a/website/assets/img/icons.svg b/website/assets/img/icons.svg deleted file mode 100644 index 104117cc0..000000000 --- 
a/website/assets/img/icons.svg +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/website/assets/img/docs/language_data.svg b/website/assets/img/language_data.svg similarity index 88% rename from website/assets/img/docs/language_data.svg rename to website/assets/img/language_data.svg index 31e1a1b29..e24bb7809 100644 --- a/website/assets/img/docs/language_data.svg +++ b/website/assets/img/language_data.svg @@ -1,13 +1,16 @@ - Tokenizer + Tokenizer @@ -33,50 +36,50 @@ - Language data + Language data - stop words + stop words - lexical attributes + lexical attributes - tokenizer exceptions + tokenizer exceptions - prefixes, suffixes, infixes + prefixes, suffixes, infixes - lemma data + lemma data - Lemmatizer + Lemmatizer - char classes + char classes Token - morph rules + morph rules - tag map + tag map Morphology diff --git a/website/assets/img/logo.svg b/website/assets/img/logo.svg index fc776fb82..89b61e132 100644 --- a/website/assets/img/logo.svg +++ b/website/assets/img/logo.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/website/assets/img/logos/chartbeat.png b/website/assets/img/logos/chartbeat.png deleted file mode 100644 index 40e644154..000000000 Binary files a/website/assets/img/logos/chartbeat.png and /dev/null differ diff --git a/website/assets/img/logos/chattermill.png b/website/assets/img/logos/chattermill.png deleted file mode 100644 index 6f98359d5..000000000 Binary files a/website/assets/img/logos/chattermill.png and /dev/null differ diff --git a/website/assets/img/logos/cytora.png b/website/assets/img/logos/cytora.png deleted file mode 100644 index 24040dc4f..000000000 Binary files a/website/assets/img/logos/cytora.png and /dev/null differ diff --git a/website/assets/img/logos/duedil.png b/website/assets/img/logos/duedil.png deleted file mode 100644 index 0eb518fb4..000000000 Binary files a/website/assets/img/logos/duedil.png and /dev/null differ diff --git a/website/assets/img/logos/indico.png b/website/assets/img/logos/indico.png deleted file mode 100644 index eb840a431..000000000 Binary files a/website/assets/img/logos/indico.png and /dev/null differ diff --git a/website/assets/img/logos/kip.png b/website/assets/img/logos/kip.png deleted file mode 100644 index e7234b438..000000000 Binary files a/website/assets/img/logos/kip.png and /dev/null differ diff --git a/website/assets/img/logos/quora.png b/website/assets/img/logos/quora.png deleted file mode 100644 index 07c38144c..000000000 Binary files a/website/assets/img/logos/quora.png and /dev/null differ diff --git a/website/assets/img/logos/signaln.png b/website/assets/img/logos/signaln.png deleted file mode 100644 index 078f50380..000000000 Binary files a/website/assets/img/logos/signaln.png and /dev/null differ diff --git a/website/assets/img/logos/socrata.png b/website/assets/img/logos/socrata.png deleted file mode 100644 index 079872c4e..000000000 Binary files a/website/assets/img/logos/socrata.png and /dev/null differ diff --git a/website/assets/img/logos/stitchfix.png b/website/assets/img/logos/stitchfix.png deleted file mode 100644 index 0a97057c5..000000000 Binary files a/website/assets/img/logos/stitchfix.png and /dev/null differ diff --git a/website/assets/img/logos/synapsify.png b/website/assets/img/logos/synapsify.png deleted file mode 100644 index 3c65d9b32..000000000 Binary files a/website/assets/img/logos/synapsify.png and /dev/null differ diff --git a/website/assets/img/logos/turi.png 
b/website/assets/img/logos/turi.png deleted file mode 100644 index de70d08bd..000000000 Binary files a/website/assets/img/logos/turi.png and /dev/null differ diff --git a/website/assets/img/logos/wayblazer.png b/website/assets/img/logos/wayblazer.png deleted file mode 100644 index 95a0098e4..000000000 Binary files a/website/assets/img/logos/wayblazer.png and /dev/null differ diff --git a/website/assets/img/logos/wonderflow.png b/website/assets/img/logos/wonderflow.png deleted file mode 100644 index d3fec37c0..000000000 Binary files a/website/assets/img/logos/wonderflow.png and /dev/null differ diff --git a/website/assets/img/pattern_green.jpg b/website/assets/img/pattern_green.jpg index d2e341822..172495c38 100644 Binary files a/website/assets/img/pattern_green.jpg and b/website/assets/img/pattern_green.jpg differ diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/pipeline.svg similarity index 92% rename from website/assets/img/docs/pipeline.svg rename to website/assets/img/pipeline.svg index 9c34636dc..1ff5923cb 100644 --- a/website/assets/img/docs/pipeline.svg +++ b/website/assets/img/pipeline.svg @@ -1,8 +1,8 @@ diff --git a/website/assets/img/showcase/displacy-ent.jpg b/website/assets/img/resources/displacy-ent.jpg similarity index 100% rename from website/assets/img/showcase/displacy-ent.jpg rename to website/assets/img/resources/displacy-ent.jpg diff --git a/website/assets/img/showcase/displacy.jpg b/website/assets/img/resources/displacy.jpg similarity index 100% rename from website/assets/img/showcase/displacy.jpg rename to website/assets/img/resources/displacy.jpg diff --git a/website/assets/img/resources/neuralcoref.jpg b/website/assets/img/resources/neuralcoref.jpg new file mode 100644 index 000000000..fb8984919 Binary files /dev/null and b/website/assets/img/resources/neuralcoref.jpg differ diff --git a/website/assets/img/showcase/sense2vec.jpg b/website/assets/img/resources/sense2vec.jpg similarity index 100% rename from website/assets/img/showcase/sense2vec.jpg rename to website/assets/img/resources/sense2vec.jpg diff --git a/website/assets/img/showcase/foxtype.jpg b/website/assets/img/showcase/foxtype.jpg deleted file mode 100644 index 91c23d866..000000000 Binary files a/website/assets/img/showcase/foxtype.jpg and /dev/null differ diff --git a/website/assets/img/showcase/indico.jpg b/website/assets/img/showcase/indico.jpg deleted file mode 100644 index 42b3ff13f..000000000 Binary files a/website/assets/img/showcase/indico.jpg and /dev/null differ diff --git a/website/assets/img/showcase/kip.jpg b/website/assets/img/showcase/kip.jpg deleted file mode 100644 index 714d90282..000000000 Binary files a/website/assets/img/showcase/kip.jpg and /dev/null differ diff --git a/website/assets/img/showcase/laice.jpg b/website/assets/img/showcase/laice.jpg deleted file mode 100644 index fb151b1fc..000000000 Binary files a/website/assets/img/showcase/laice.jpg and /dev/null differ diff --git a/website/assets/img/showcase/textanalysis.jpg b/website/assets/img/showcase/textanalysis.jpg deleted file mode 100644 index 162078257..000000000 Binary files a/website/assets/img/showcase/textanalysis.jpg and /dev/null differ diff --git a/website/assets/img/showcase/truthbot.jpg b/website/assets/img/showcase/truthbot.jpg deleted file mode 100644 index 4903f3d17..000000000 Binary files a/website/assets/img/showcase/truthbot.jpg and /dev/null differ diff --git a/website/assets/img/social/preview_101.jpg b/website/assets/img/social/preview_101.jpg index 448c63eaf..f65eaf870 100644 
Binary files a/website/assets/img/social/preview_101.jpg and b/website/assets/img/social/preview_101.jpg differ
diff --git a/website/assets/img/social/preview_alpha.jpg b/website/assets/img/social/preview_alpha.jpg
index 6da622569..821db408a 100644
Binary files a/website/assets/img/social/preview_alpha.jpg and b/website/assets/img/social/preview_alpha.jpg differ
diff --git a/website/assets/img/social/preview_docs.jpg b/website/assets/img/social/preview_docs.jpg
deleted file mode 100644
index 46805ad7a..000000000
Binary files a/website/assets/img/social/preview_docs.jpg and /dev/null differ
diff --git a/website/assets/img/docs/tokenization.svg b/website/assets/img/tokenization.svg
similarity index 98%
rename from website/assets/img/docs/tokenization.svg
rename to website/assets/img/tokenization.svg
index f5b164725..9877e1a30 100644
--- a/website/assets/img/docs/tokenization.svg
+++ b/website/assets/img/tokenization.svg
[SVG markup changes omitted]
diff --git a/website/assets/img/docs/training-loop.svg b/website/assets/img/training-loop.svg
similarity index 95%
rename from website/assets/img/docs/training-loop.svg
rename to website/assets/img/training-loop.svg
index e670f816a..e883b36be 100644
--- a/website/assets/img/docs/training-loop.svg
+++ b/website/assets/img/training-loop.svg
[SVG markup changes omitted]
diff --git a/website/assets/img/docs/training.svg b/website/assets/img/training.svg
similarity index 95%
rename from website/assets/img/docs/training.svg
rename to website/assets/img/training.svg
index cd6b74f04..65f6de0b6 100644
--- a/website/assets/img/docs/training.svg
+++ b/website/assets/img/training.svg
[SVG markup changes omitted]
diff --git a/website/assets/img/docs/vocab_stringstore.svg b/website/assets/img/vocab_stringstore.svg
similarity index 94%
rename from website/assets/img/docs/vocab_stringstore.svg
rename to website/assets/img/vocab_stringstore.svg
index 119175247..b604041f2 100644
--- a/website/assets/img/docs/vocab_stringstore.svg
+++ b/website/assets/img/vocab_stringstore.svg
[SVG markup changes omitted]
diff --git a/website/assets/js/changelog.js b/website/assets/js/changelog.js
new file mode 100644
index 000000000..94f2149ad
--- /dev/null
+++ b/website/assets/js/changelog.js
@@ -0,0 +1,72 @@
+'use strict';
+
+import { Templater, handleResponse } from './util.js';
+
+export default class Changelog {
+    /**
+     * Fetch and render changelog from GitHub. Clones a template node (table row)
+     * to avoid doubling templating markup in JavaScript.
+     * @param {string} user - GitHub username.
+     * @param {string} repo - Repository to fetch releases from.
+     */
+    constructor(user, repo) {
+        this.url = `https://api.github.com/repos/${user}/${repo}/releases`;
+        this.template = new Templater('changelog');
+        this.fetchChangelog()
+            .then(json => this.render(json))
+            .catch(this.showError.bind(this));
+        // make sure scroll positions for progress bar etc. are recalculated
+        window.dispatchEvent(new Event('resize'));
+    }
+
+    fetchChangelog() {
+        return new Promise((resolve, reject) =>
+            fetch(this.url)
+                .then(res => handleResponse(res))
+                .then(json => json.ok ? resolve(json) : reject()))
+    }
+
+    showError() {
+        this.template.get('error').style.display = 'block';
+    }
+
+    /**
+     * Get template section from template row. Hacky, but does make sense.
+     * @param {node} item - Parent element.
+     * @param {string} id - ID of child element, set via data-changelog.
+     */
+    getField(item, id) {
+        return item.querySelector(`[data-changelog="${id}"]`);
+    }
+
+    render(json) {
+        this.template.get('table').style.display = 'block';
+        this.row = this.template.get('item');
+        this.releases = this.template.get('releases');
+        this.prereleases = this.template.get('prereleases');
+        Object.values(json)
+            .filter(release => release.name)
+            .forEach(release => this.renderRelease(release));
+        this.row.remove();
+    }
+
+    /**
+     * Clone the template row and populate with content from API response.
+     * https://developer.github.com/v3/repos/releases/#list-releases-for-a-repository
+     * @param {string} name - Release title.
+     * @param {string} tag (tag_name) - Release tag.
+     * @param {string} url (html_url) - URL to the release page on GitHub.
+     * @param {string} date (published_at) - Timestamp of release publication.
+     * @param {boolean} prerelease - Whether the release is a prerelease.
+     */
+    renderRelease({ name, tag_name: tag, html_url: url, published_at: date, prerelease }) {
+        const container = prerelease ? this.prereleases : this.releases;
+        const tagLink = `${tag}`;
+        const title = (name.split(': ').length == 2) ? name.split(': ')[1] : name;
+        const row = this.row.cloneNode(true);
+        this.getField(row, 'date').textContent = date.split('T')[0];
+        this.getField(row, 'tag').innerHTML = tagLink;
+        this.getField(row, 'title').textContent = title;
+        container.appendChild(row);
+    }
+}
diff --git a/website/assets/js/github-embed.js b/website/assets/js/github-embed.js
new file mode 100644
index 000000000..ec72fd713
--- /dev/null
+++ b/website/assets/js/github-embed.js
@@ -0,0 +1,42 @@
+'use strict';
+
+import { $$ } from './util.js';
+
+export default class GitHubEmbed {
+    /**
+     * Embed code from GitHub repositories, similar to Gist embeds. Fetches the
+     * raw text and places it inside the element.
+     * Usage:
+     * @param {string} user - GitHub user or organization.
+     * @param {string} attr - Data attribute used to select containers. Its value
+     *                        should be the path to the file, relative to the user.
+     */
+    constructor(user, attr) {
+        this.url = `https://raw.githubusercontent.com/${user}`;
+        this.attr = attr;
+        [...$$(`[${this.attr}]`)].forEach(el => this.embed(el));
+    }
+
+    /**
+     * Fetch code from GitHub and insert it as element content. File path is
+     * read off the container's data attribute.
+     * @param {node} el - The element.
+     */
+    embed(el) {
+        el.parentElement.setAttribute('data-loading', '');
+        fetch(`${this.url}/${el.getAttribute(this.attr)}`)
+            .then(res => res.text().then(text => ({ text, ok: res.ok })))
+            .then(({ text, ok }) => ok ? this.render(el, text) : false)
+        el.parentElement.removeAttribute('data-loading');
+    }
+
+    /**
+     * Add text to container and apply syntax highlighting via Prism, if available.
+     * @param {node} el - The element.
+     * @param {string} text - The raw code, fetched from GitHub.
+     */
+    render(el, text) {
+        el.textContent = text;
+        if (window.Prism) Prism.highlightElement(el);
+    }
+}
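For orientation, a minimal usage sketch of the class above. The attribute name and file path are illustrative assumptions, not values taken from the site's templates:

    // Hypothetical markup: <code data-gh-embed="spaCy/master/examples/some_example.py"></code>
    // The attribute value is appended to https://raw.githubusercontent.com/<user>/
    import GitHubEmbed from './github-embed.js';
    new GitHubEmbed('explosion', 'data-gh-embed');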
diff --git a/website/assets/js/main.js b/website/assets/js/main.js
deleted file mode 100644
index 616fbb1df..000000000
--- a/website/assets/js/main.js
+++ /dev/null
@@ -1,23 +0,0 @@
-//- 💫 MAIN JAVASCRIPT
-
-'use strict'
-
-{
-    const nav = document.querySelector('.js-nav')
-    const fixedClass = 'is-fixed'
-    let vh, scrollY = 0, scrollUp = false
-
-    const updateVh = () => Math.max(document.documentElement.clientHeight, window.innerHeight || 0)
-
-    const updateNav = () => {
-        const vh = updateVh()
-        const newScrollY = (window.pageYOffset || document.scrollTop) - (document.clientTop || 0)
-        if (newScrollY != scrollY) scrollUp = newScrollY <= scrollY
-        scrollY = newScrollY
-
-        if(scrollUp && !(isNaN(scrollY) || scrollY <= vh)) nav.classList.add(fixedClass)
-        else if (!scrollUp || (isNaN(scrollY) || scrollY <= vh/2)) nav.classList.remove(fixedClass)
-    }
-
-    window.addEventListener('scroll', () => requestAnimationFrame(updateNav))
-}
diff --git a/website/assets/js/models.js b/website/assets/js/models.js
new file mode 100644
index 000000000..e79073edd
--- /dev/null
+++ b/website/assets/js/models.js
@@ -0,0 +1,316 @@
+'use strict';
+
+import { Templater, handleResponse, convertNumber } from './util.js';
+
+/**
+ * Chart.js defaults
+ */
+const CHART_COLORS = { model1: '#09a3d5', model2: '#066B8C' };
+const CHART_FONTS = {
+    legend: '-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"',
+    ticks: 'Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace'
+};
+
+/**
+ * Formatters for model details.
+ * @property {function} author – Format model author with optional link.
+ * @property {function} license - Format model license with optional link.
+ * @property {function} sources - Format training data sources (list or string).
+ * @property {function} pipeline - Format list of pipeline components.
+ * @property {function} vectors - Format vector data (entries and dimensions).
+ * @property {function} version - Format model version number.
+ */
+export const formats = {
+    author: (author, url) => url ? `${author}` : author,
+    license: (license, url) => url ? `${license}` : license,
+    sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
+    pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `${p}`).join(', ') : '-',
+    vectors: vec => vec ? `${convertNumber(vec.entries)} (${vec.width} dimensions)` : 'n/a',
+    version: version => `v${version}`
+};
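To make the formatters above concrete, a few hedged example calls (the inputs are made up, and the exact vectors output depends on convertNumber from util.js):

    formats.sources(['OntoNotes 5', 'Wikipedia']);      // 'OntoNotes 5, Wikipedia'
    formats.vectors({ entries: 300000, width: 300 });   // e.g. '300,000 (300 dimensions)'
    formats.version('2.0.0');                           // 'v2.0.0'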
+
+/**
+ * Find the latest version of a model in a compatibility table.
+ * @param {string} model - The model name.
+ * @param {Object} compat - Compatibility table, keyed by spaCy version.
+ */
+export const getLatestVersion = (model, compat = {}) => {
+    for (let [spacy_v, models] of Object.entries(compat)) {
+        if (models[model]) return models[model][0];
+    }
+};
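A small sketch of the lookup above, using a made-up compatibility table in the shape the function expects (spaCy versions mapped to model names and their model versions):

    const compat = {
        '2.0.0': { en_core_web_sm: ['2.0.0'] },
        '1.9.0': { en_core_web_sm: ['1.2.0'] }
    };
    getLatestVersion('en_core_web_sm', compat);  // '2.0.0', the first matching entry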
+
+export class ModelLoader {
+    /**
+     * Load model meta from GitHub and update model details on site. Uses the
+     * Templater mini template engine to update DOM.
+     * @param {string} repo - Path to GitHub repository containing releases.
+     * @param {Array} models - List of model IDs, e.g. "en_core_web_sm".
+     * @param {Object} licenses - License IDs mapped to URLs.
+     * @param {Object} benchmarkKeys - Objects of available keys by type, e.g.
+     *                                 'parser', 'ner', 'speed', mapped to labels.
+     */
+    constructor(repo, models = [], licenses = {}, benchmarkKeys = {}) {
+        this.url = `https://raw.githubusercontent.com/${repo}/master`;
+        this.repo = `https://github.com/${repo}`;
+        this.modelIds = models;
+        this.licenses = licenses;
+        this.benchKeys = benchmarkKeys;
+        this.init();
+    }
+
+    init() {
+        this.modelIds.forEach(modelId =>
+            new Templater(modelId).get('table').setAttribute('data-loading', ''));
+        this.fetch(`${this.url}/compatibility.json`)
+            .then(json => this.getModels(json.spacy))
+            .catch(_ => this.modelIds.forEach(modelId => this.showError(modelId)));
+        // make sure scroll positions for progress bar etc. are recalculated
+        window.dispatchEvent(new Event('resize'));
+    }
+
+    fetch(url) {
+        return new Promise((resolve, reject) =>
+            fetch(url).then(res => handleResponse(res))
+                .then(json => json.ok ? resolve(json) : reject()))
+    }
+
+    getModels(compat) {
+        this.compat = compat;
+        for (let modelId of this.modelIds) {
+            const version = getLatestVersion(modelId, compat);
+            if (version) this.fetch(`${this.url}/meta/${modelId}-${version}.json`)
+                .then(json => this.render(json))
+                .catch(_ => this.showError(modelId))
+            else this.showError(modelId);
+        }
+    }
+
+    showError(modelId) {
+        const tpl = new Templater(modelId);
+        tpl.get('table').removeAttribute('data-loading');
+        tpl.get('error').style.display = 'block';
+        for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
+            tpl.get(key).parentElement.parentElement.style.display = 'none';
+        }
+    }
+
+    /**
+     * Update model details in tables. Currently quite hacky :(
+     */
+    render(data) {
+        const modelId = `${data.lang}_${data.name}`;
+        const model = `${modelId}-${data.version}`;
+        const tpl = new Templater(modelId);
+        tpl.get('error').style.display = 'none';
+        this.renderDetails(tpl, data)
+        this.renderBenchmarks(tpl, data.accuracy, data.speed);
+        this.renderCompat(tpl, modelId);
+        tpl.get('download').setAttribute('href', `${this.repo}/releases/tag/${model}`);
+        tpl.get('table').removeAttribute('data-loading');
+    }
+
+    renderDetails(tpl, { version, size, description, notes, author, url,
+        license, sources, vectors, pipeline }) {
+        const basics = { version, size, description, notes }
+        for (let [key, value] of Object.entries(basics)) {
+            if (value) tpl.fill(key, value);
+        }
+        if (author) tpl.fill('author', formats.author(author, url), true);
+        if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
+        if (sources) tpl.fill('sources', formats.sources(sources));
+        if (vectors) tpl.fill('vectors', formats.vectors(vectors));
+        else tpl.get('vectors').parentElement.parentElement.style.display = 'none';
+        if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
+        else tpl.get('pipeline').parentElement.parentElement.style.display = 'none';
+    }
+
+    renderBenchmarks(tpl, accuracy = {}, speed = {}) {
+        if (!accuracy && !speed) return;
+        this.renderTable(tpl, 'parser', accuracy, val => val.toFixed(2));
+        this.renderTable(tpl, 'ner', accuracy, val => val.toFixed(2));
+        this.renderTable(tpl, 'speed', speed, Math.round);
+        tpl.get('benchmarks').style.display = 'block';
+    }
+
+    renderTable(tpl, id, benchmarks, converter = val => val) {
+        if (!this.benchKeys[id] || !Object.keys(this.benchKeys[id]).some(key => benchmarks[key])) return;
+        for (let key of Object.keys(this.benchKeys[id])) {
+            if (benchmarks[key]) tpl
+                .fill(key, convertNumber(converter(benchmarks[key])))
+                .parentElement.style.display = 'table-row';
+        }
+        tpl.get(id).style.display = 'block';
+    }
+
+    renderCompat(tpl, modelId) {
+        tpl.get('compat-wrapper').style.display = 'table-row';
+        const header = '';
+        const options = Object.keys(this.compat)
+            .map(v => ``)
+            .join('');
+        tpl
+            .fill('compat', header + options, true)
+            .addEventListener('change', ({ target: { value }}) =>
+                tpl.fill('compat-versions', this.getCompat(value, modelId), true))
+    }
+
+    getCompat(version, model) {
+        const res = this.compat[version][model];
+        return res ? `${model}-${res[0]}` : 'not compatible';
+    }
+}
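A hedged sketch of how the loader might be instantiated from a page template. Only the constructor signature is taken from the code above; the license map and benchmark keys are placeholder values, not the site's actual configuration:

    new ModelLoader(
        'explosion/spacy-models',       // GitHub repo containing model releases
        ['en_core_web_sm'],             // model IDs to render
        { 'CC BY-SA 3.0': 'https://creativecommons.org/licenses/by-sa/3.0/' },
        { parser: { uas: 'UAS', las: 'LAS' }, ner: { ents_f: 'NER F' }, speed: { wps: 'words/sec' } }
    );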
+
+export class ModelComparer {
+    /**
+     * Compare two model meta files and render a chart and comparison table.
+     * @param {string} repo - Path to GitHub repository containing releases.
+     * @param {Object} licenses - License IDs mapped to URLs.
+     * @param {Object} benchmarkKeys - Objects of available keys by type, e.g.
+     *                                 'parser', 'ner', 'speed', mapped to labels.
+     * @param {Object} languages - Available languages, ID mapped to name.
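+     * @param {Object} labels - Display labels for model types and genres, ID mapped to label.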
+     * @param {Object} defaultModels - Models to compare on load, 'model1' and
+     *                                 'model2' mapped to model names.
+     */
+    constructor(repo, licenses = {}, benchmarkKeys = {}, languages = {}, labels = {}, defaultModels) {
+        this.url = `https://raw.githubusercontent.com/${repo}/master`;
+        this.repo = `https://github.com/${repo}`;
+        this.tpl = new Templater('compare');
+        this.benchKeys = benchmarkKeys;
+        this.licenses = licenses;
+        this.languages = languages;
+        this.labels = labels;
+        this.models = {};
+        this.colors = CHART_COLORS;
+        this.fonts = CHART_FONTS;
+        this.defaultModels = defaultModels;
+        this.tpl.get('result').style.display = 'block';
+        this.fetchCompat()
+            .then(compat => this.init(compat))
+            .catch(this.showError.bind(this))
+    }
+
+    init(compat) {
+        this.compat = compat;
+        const selectA = this.tpl.get('model1');
+        const selectB = this.tpl.get('model2');
+        selectA.addEventListener('change', this.onSelect.bind(this));
+        selectB.addEventListener('change', this.onSelect.bind(this));
+        this.chart = new Chart('chart_compare_accuracy', { type: 'bar', options: {
+            responsive: true,
+            legend: { position: 'bottom', labels: { fontFamily: this.fonts.legend, fontSize: 13 }},
+            scales: {
+                yAxes: [{ label: 'Accuracy', ticks: { min: 70, fontFamily: this.fonts.ticks }}],
+                xAxes: [{ barPercentage: 0.75, ticks: { fontFamily: this.fonts.ticks }}]
+            }
+        }});
+        if (this.defaultModels) {
+            selectA.value = this.defaultModels.model1;
+            selectB.value = this.defaultModels.model2;
+            this.getModels(this.defaultModels);
+        }
+    }
+
+    fetchCompat() {
+        return new Promise((resolve, reject) =>
+            fetch(`${this.url}/compatibility.json`)
+                .then(res => handleResponse(res))
+                .then(json => json.ok ? resolve(json.spacy) : reject()))
+    }
+
+    fetchModel(name) {
+        const version = getLatestVersion(name, this.compat);
+        const modelName = `${name}-${version}`;
+        return new Promise((resolve, reject) => {
+            // resolve immediately if model already loaded, e.g. in this.models
+            if (this.models[name]) resolve(this.models[name]);
+            else fetch(`${this.url}/meta/${modelName}.json`)
+                .then(res => handleResponse(res))
+                .then(json => json.ok ? resolve(this.saveModel(name, json)) : reject())
+        })
+    }
+
+    /**
+     * "Save" meta to this.models so it only has to be fetched from GitHub once.
+     * @param {string} name - The model name.
+     * @param {Object} data - The model meta data.
+     */
+    saveModel(name, data) {
+        this.models[name] = data;
+        return data;
+    }
+
+    showError() {
+        this.tpl.get('result').style.display = 'none';
+        this.tpl.get('error').style.display = 'block';
+    }
+
+    onSelect(ev) {
+        const modelId = ev.target.value;
+        const otherId = (ev.target.id == 'model1') ? 'model2' : 'model1';
+        const otherVal = this.tpl.get(otherId);
+        const otherModel = otherVal.options[otherVal.selectedIndex].value;
+        if (otherModel != '') this.getModels({
+            [ev.target.id]: modelId,
+            [otherId]: otherModel
+        })
+    }
+
+    getModels({ model1, model2 }) {
+        this.tpl.get('result').setAttribute('data-loading', '');
+        this.fetchModel(model1)
+            .then(data1 => this.fetchModel(model2)
+                .then(data2 => this.render({ model1: data1, model2: data2 })))
+                .catch(this.showError.bind(this))
+    }
+
+    /**
+     * Render two models, and populate the chart and table. Currently quite hacky :(
+     * @param {Object} models - The models to render.
+     * @param {Object} models.model1 - The first model (selected via the first dropdown).
+     */
+    render({ model1, model2 }) {
+        const accKeys = Object.assign({}, this.benchKeys.parser, this.benchKeys.ner);
+        const allKeys = [...Object.keys(model1.accuracy || []), ...Object.keys(model2.accuracy || [])];
+        const metaKeys = Object.keys(accKeys).filter(k => allKeys.includes(k));
+        const labels = metaKeys.map(key => accKeys[key]);
+        const datasets = [model1, model2]
+            .map(({ lang, name, version, accuracy = {} }, i) => ({
+                label: `${lang}_${name}-${version}`,
+                backgroundColor: this.colors[`model${i + 1}`],
+                data: metaKeys.map(key => (accuracy[key] || 0).toFixed(2))
+            }));
+        this.chart.data = { labels, datasets };
+        this.chart.update();
+        [model1, model2].forEach((model, i) => this.renderTable(metaKeys, i + 1, model));
+        this.tpl.get('result').removeAttribute('data-loading');
+    }
+
+    renderTable(metaKeys, i, { lang, name, version, size, description,
+        notes, author, url, license, sources, vectors, pipeline, accuracy = {},
+        speed = {}}) {
+        const type = name.split('_')[0];  // extract type from model name
+        const genre = name.split('_')[1];  // extract genre from model name
+        this.tpl.fill(`table-head${i}`, `${lang}_${name}`);
+        this.tpl.get(`link${i}`).setAttribute('href', `/models/${lang}#${lang}_${name}`);
+        this.tpl.fill(`download${i}`, `spacy download ${lang}_${name}\n`);
+        this.tpl.fill(`lang${i}`, this.languages[lang] || lang);
+        this.tpl.fill(`type${i}`, this.labels[type] || type);
+        this.tpl.fill(`genre${i}`, this.labels[genre] || genre);
+        this.tpl.fill(`version${i}`, formats.version(version), true);
+        this.tpl.fill(`size${i}`, size);
+        this.tpl.fill(`desc${i}`, description || 'n/a');
+        this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true);
+        this.tpl.fill(`vectors${i}`, formats.vectors(vectors));
+        this.tpl.fill(`sources${i}`, formats.sources(sources));
+        this.tpl.fill(`author${i}`, formats.author(author, url), true);
+        this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true);
+        // check if model accuracy or speed includes one of the pre-set keys
+        for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) {
+            if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2))
+            else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key])))
+            else this.tpl.fill(`${key}${i}`, 'n/a')
+        }
+    }
+}
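A sketch of wiring up the comparer. Everything except the constructor signature is a placeholder, and the chart assumes Chart.js and a matching 'compare' template are present on the page:

    new ModelComparer(
        'explosion/spacy-models',
        { 'CC BY-SA 3.0': 'https://creativecommons.org/licenses/by-sa/3.0/' },
        { parser: { uas: 'UAS', las: 'LAS' }, ner: { ents_f: 'NER F' }, speed: { wps: 'words/sec' } },
        { en: 'English' },
        { core: 'core model', web: 'web text' },
        { model1: 'en_core_web_sm', model2: 'en_core_web_lg' }
    );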
diff --git a/website/assets/js/nav-highlighter.js b/website/assets/js/nav-highlighter.js
new file mode 100644
index 000000000..a7bb227d5
--- /dev/null
+++ b/website/assets/js/nav-highlighter.js
@@ -0,0 +1,35 @@
+'use strict';
+
+import { $, $$ } from './util.js';
+
+export default class NavHighlighter {
+    /**
+     * Highlight the section currently in the viewport in the sidebar, using the in-view library.
+     * @param {string} sectionAttr - Data attribute of sections.
+     * @param {string} navAttr - Data attribute of navigation items.
+     * @param {string} activeClass – Class name of active element.
+     */
+    constructor(sectionAttr, navAttr, activeClass = 'is-active') {
+        this.sections = [...$$(`[${navAttr}]`)];
+        // highlight first item regardless
+        if (this.sections.length) this.sections[0].classList.add(activeClass);
+        this.navAttr = navAttr;
+        this.sectionAttr = sectionAttr;
+        this.activeClass = activeClass;
+        if (window.inView) inView(`[${sectionAttr}]`)
+            .on('enter', this.highlightSection.bind(this));
+    }
+
+    /**
+     * Check if section in view exists in sidebar and mark as active.
+     * @param {node} section - The section in view.
+     */
+    highlightSection(section) {
+        const id = section.getAttribute(this.sectionAttr);
+        const el = $(`[${this.navAttr}="${id}"]`);
+        if (el) {
+            this.sections.forEach(el => el.classList.remove(this.activeClass));
+            el.classList.add(this.activeClass);
+        }
+    }
+}
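A minimal sketch, assuming sections and sidebar links carry matching data attributes (the attribute names below are illustrative) and that the in-view library is loaded:

    new NavHighlighter('data-section', 'data-nav');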
diff --git a/website/assets/js/progress.js b/website/assets/js/progress.js
new file mode 100644
index 000000000..1497547d8
--- /dev/null
+++ b/website/assets/js/progress.js
@@ -0,0 +1,52 @@
+'use strict';
+
+import { $ } from './util.js';
+
+export default class ProgressBar {
+    /**
+     * Animated reading progress bar.
+     * @param {string} selector – CSS selector of progress bar element.
+     */
+    constructor(selector) {
+        this.scrollY = 0;
+        this.sizes = this.updateSizes();
+        this.el = $(selector);
+        this.el.setAttribute('max', 100);
+        window.addEventListener('scroll', this.onScroll.bind(this));
+        window.addEventListener('resize', this.onResize.bind(this));
+    }
+
+    onScroll(ev) {
+        this.scrollY = window.pageYOffset || document.documentElement.scrollTop || 0;
+        requestAnimationFrame(this.update.bind(this));
+    }
+
+    onResize(ev) {
+        this.sizes = this.updateSizes();
+        requestAnimationFrame(this.update.bind(this));
+    }
+
+    update() {
+        const offset = 100 - ((this.sizes.height - this.scrollY - this.sizes.vh) / this.sizes.height * 100);
+        this.el.setAttribute('value', (this.scrollY == 0) ? 0 : offset || 0);
+    }
+
+    /**
+     * Recompute the full document height and viewport height. Called on load and window resize.
+     */
+    updateSizes() {
+        return {
+            height: Math.max(
+                document.body.scrollHeight,
+                document.body.offsetHeight,
+                document.documentElement.clientHeight,
+                document.documentElement.scrollHeight,
+                document.documentElement.offsetHeight
+            ),
+            vh: Math.max(
+                document.documentElement.clientHeight,
+                window.innerHeight || 0
+            )
+        }
+    }
+}
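A one-line sketch, assuming the page contains a <progress> element matching the selector (the selector itself is illustrative):

    new ProgressBar('.js-progress');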
diff --git a/website/assets/js/rollup.js b/website/assets/js/rollup.js
new file mode 100644
index 000000000..00ff92fa9
--- /dev/null
+++ b/website/assets/js/rollup.js
@@ -0,0 +1,23 @@
+/**
+ * This file is bundled by Rollup, compiled with Babel and included as
+ *