diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index 8b9677709..000000000 --- a/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,18 +0,0 @@ - - -## Your Environment - - - -- Operating System: -- Python Version Used: -- spaCy Version Used: -- Environment Information: diff --git a/.github/ISSUE_TEMPLATE/01_bugs.md b/.github/ISSUE_TEMPLATE/01_bugs.md index 9e1b35fbf..768832c24 100644 --- a/.github/ISSUE_TEMPLATE/01_bugs.md +++ b/.github/ISSUE_TEMPLATE/01_bugs.md @@ -1,6 +1,6 @@ --- -name: "\U0001F6A8 Bug Report" -about: Did you come across a bug or unexpected behaviour differing from the docs? +name: "\U0001F6A8 Submit a Bug Report" +about: Use this template if you came across a bug or unexpected behaviour differing from the docs. --- diff --git a/.github/ISSUE_TEMPLATE/02_docs.md b/.github/ISSUE_TEMPLATE/02_docs.md index 4cf791330..0df41abc1 100644 --- a/.github/ISSUE_TEMPLATE/02_docs.md +++ b/.github/ISSUE_TEMPLATE/02_docs.md @@ -1,5 +1,5 @@ --- -name: "\U0001F4DA Documentation" +name: "\U0001F4DA Submit a Documentation Report" about: Did you spot a mistake in the docs, is anything unclear or do you have a suggestion? diff --git a/.github/ISSUE_TEMPLATE/03_other.md b/.github/ISSUE_TEMPLATE/03_other.md deleted file mode 100644 index 4c6ada4cc..000000000 --- a/.github/ISSUE_TEMPLATE/03_other.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -name: "\U0001F4AC Anything else?" -about: For feature and project ideas, general usage questions or help with your code, please post on the GitHub Discussions board instead. ---- - - - -## Your Environment - - - -- Operating System: -- Python Version Used: -- spaCy Version Used: -- Environment Information: diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..09de1cd05 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,14 @@ +blank_issues_enabled: false +contact_links: + - name: 🗯 Discussions Forum + url: https://github.com/explosion/spaCy/discussions + about: Usage questions, general discussion and anything else that isn't a bug report. + - name: 📖 spaCy FAQ & Troubleshooting + url: https://github.com/explosion/spaCy/discussions/8226 + about: Before you post, check out the FAQ for answers to common community questions! + - name: 💫 spaCy Usage Guides & API reference + url: https://spacy.io/usage + about: Everything you need to know about spaCy and how to use it. + - name: 🛠 Submit a Pull Request + url: https://github.com/explosion/spaCy/pulls + about: Did you spot a mistake and know how to fix it? Feel free to submit a PR straight away! 
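The `azure-steps.yml` hunk that follows adds CLI smoke tests (`download`, `convert`, `init config`, `debug config`, `debug data`, `train`, `assemble`) that only run on Python 3.8. As a companion, here is a hedged sketch of the sourced-component `assemble` workflow done programmatically rather than through the CLI. It is not part of the diff and rests on assumptions: spaCy v3.x, a `ner.cfg` created by `spacy init config -p ner -l ca ner.cfg`, and `ca_core_news_sm` installed; the file names simply mirror the CI steps.

```python
# Hedged sketch: a rough programmatic equivalent of the "Test assemble CLI"
# step in the CI hunk below. Assumes spaCy v3.x, that ner.cfg exists
# (created via `spacy init config`), and that ca_core_news_sm is downloaded.
import spacy
from spacy import util

# Load the training config and source the "ner" component from an installed
# pipeline instead of training it from scratch.
config = util.load_config("ner.cfg")
config["components"]["ner"] = {"source": "ca_core_news_sm"}
config.to_disk("ner_source_sm.cfg")

# Build the pipeline from the config and serialize it, roughly what
# `python -m spacy assemble ner_source_sm.cfg output_dir` produces.
nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
nlp.to_disk("output_dir")

# Minimal sanity check, mirroring the "Test download CLI" step.
doc = spacy.load("ca_core_news_sm")("test")
print([token.text for token in doc])
```

The CI steps themselves shell out to `python -m spacy assemble` and, in the `ca_core_news_md` variant, grep the output for `W113`, the vectors-related warning the step name refers to.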
diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 750e096d0..50e81799e 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -11,6 +11,10 @@ steps: versionSpec: ${{ parameters.python_version }} architecture: ${{ parameters.architecture }} + - bash: | + echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}" + displayName: 'Set variables' + - script: | ${{ parameters.prefix }} python -m pip install -U pip setuptools ${{ parameters.prefix }} python -m pip install -U -r requirements.txt @@ -41,7 +45,7 @@ steps: displayName: "Install test requirements" - script: | - ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 + ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0 ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html displayName: "Install GPU requirements" condition: eq(${{ parameters.gpu }}, true) @@ -55,3 +59,44 @@ steps: ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu displayName: "Run GPU tests" condition: eq(${{ parameters.gpu }}, true) + + - script: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" + displayName: 'Test download CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . + displayName: 'Test convert CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -m spacy init config -p ner -l ca ner.cfg + python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy + displayName: 'Test debug config CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + # will have errors due to sparse data, check for summary in output + python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary + displayName: 'Test debug data CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 + displayName: 'Test train CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + displayName: 'Test assemble CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + displayName: 'Test assemble CLI vectors warning' + condition: eq(variables['python_version'], '3.8') diff --git a/.github/contributors/KennethEnevoldsen.md b/.github/contributors/KennethEnevoldsen.md new file mode 100644 index 000000000..0bbb28d61 --- /dev/null +++ b/.github/contributors/KennethEnevoldsen.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This 
spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------------- | +| Name | Kenneth Enevoldsen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-07-13 | +| GitHub username | KennethEnevoldsen | +| Website (optional) | www.kennethenevoldsen.com | diff --git a/.github/contributors/ZeeD.md b/.github/contributors/ZeeD.md new file mode 100644 index 000000000..460f91e19 --- /dev/null +++ b/.github/contributors/ZeeD.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Vito De Tullio | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-06-01 | +| GitHub username | ZeeD | +| Website (optional) | | diff --git a/.github/contributors/bodak.md b/.github/contributors/bodak.md new file mode 100644 index 000000000..f87224f81 --- /dev/null +++ b/.github/contributors/bodak.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Kristian Boda | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 18.05.2021 | +| GitHub username | bodak | +| Website (optional) | | diff --git a/.github/contributors/gtoffoli.md b/.github/contributors/gtoffoli.md new file mode 100644 index 000000000..5d5d712a2 --- /dev/null +++ b/.github/contributors/gtoffoli.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Giovanni Toffoli | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-05-12 | +| GitHub username | gtoffoli | +| Website (optional) | | diff --git a/.github/contributors/jklaise.md b/.github/contributors/jklaise.md new file mode 100644 index 000000000..66d77ee48 --- /dev/null +++ b/.github/contributors/jklaise.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name |Janis Klaise | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date |26/04/2021 | +| GitHub username |jklaise | +| Website (optional) |janisklaise.com | diff --git a/.github/contributors/jmyerston.md b/.github/contributors/jmyerston.md new file mode 100644 index 000000000..be5db5453 --- /dev/null +++ b/.github/contributors/jmyerston.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +| ----------------------------- | ----------------------------------- | +| Name | Jacobo Myerston | +| Company name (if applicable) | University of California, San Diego | +| Title or role (if applicable) | Academic | +| Date | 07/05/2021 | +| GitHub username | jmyerston | +| Website (optional) | diogenet.ucsd.edu | diff --git a/.github/contributors/julien-talkair.md b/.github/contributors/julien-talkair.md new file mode 100644 index 000000000..f8a1933b2 --- /dev/null +++ b/.github/contributors/julien-talkair.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Julien Rossi | +| Company name (if applicable) | TalkAir BV | +| Title or role (if applicable) | CTO, Partner | +| Date | June 28 2021 | +| GitHub username | julien-talkair | +| Website (optional) | | diff --git a/.github/contributors/juliensalinas.md b/.github/contributors/juliensalinas.md new file mode 100644 index 000000000..0062426ba --- /dev/null +++ b/.github/contributors/juliensalinas.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +| ----------------------------- | ------------------- | +| Name | Julien Salinas | +| Company name (if applicable) | NLP Cloud | +| Title or role (if applicable) | Founder and CTO | +| Date | May 14th 2021 | +| GitHub username | juliensalinas | +| Website (optional) | https://nlpcloud.io | diff --git a/.github/contributors/mariosasko.md b/.github/contributors/mariosasko.md new file mode 100644 index 000000000..1f5acc934 --- /dev/null +++ b/.github/contributors/mariosasko.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Mario Šaško | +| Company name (if applicable) | TakeLab FER | +| Title or role (if applicable) | R&D Intern | +| Date | 2021-07-12 | +| GitHub username | mariosasko | +| Website (optional) | | diff --git a/.github/contributors/narayanacharya6.md b/.github/contributors/narayanacharya6.md new file mode 100644 index 000000000..e4bf7703f --- /dev/null +++ b/.github/contributors/narayanacharya6.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Narayan Acharya | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 29 APR 2021 | +| GitHub username | narayanacharya6 | +| Website (optional) | narayanacharya.com | \ No newline at end of file diff --git a/.github/contributors/sevdimali.md b/.github/contributors/sevdimali.md new file mode 100644 index 000000000..6b96abdf8 --- /dev/null +++ b/.github/contributors/sevdimali.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Sevdimali | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 10/4/2021 | +| GitHub username | sevdimali | +| Website (optional) | https://sevdimali.me | diff --git a/.github/contributors/thomashacker.md b/.github/contributors/thomashacker.md new file mode 100644 index 000000000..d88727dc8 --- /dev/null +++ b/.github/contributors/thomashacker.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Edward Schmuhl | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 09.07.2021 | +| GitHub username | thomashacker | +| Website (optional) | | diff --git a/.github/contributors/xadrianzetx.md b/.github/contributors/xadrianzetx.md new file mode 100644 index 000000000..65603e9bc --- /dev/null +++ b/.github/contributors/xadrianzetx.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name |Adrian Zuber | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date |20-06-2021 | +| GitHub username |xadrianzetx | +| Website (optional) | | \ No newline at end of file diff --git a/.github/contributors/yohasebe.md b/.github/contributors/yohasebe.md new file mode 100644 index 000000000..c6f6167a3 --- /dev/null +++ b/.github/contributors/yohasebe.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Yoichiro Hasebe | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | July 4th, 2021 | +| GitHub username | yohasebe | +| Website (optional) | https://yohasebe.com | diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml new file mode 100644 index 000000000..8d0282650 --- /dev/null +++ b/.github/workflows/autoblack.yml @@ -0,0 +1,44 @@ +# GitHub Action that uses Black to reformat all Python code and submits a PR +# in regular intervals. Inspired by: https://github.com/cclauss/autoblack + +name: autoblack +on: + workflow_dispatch: # allow manual trigger + schedule: + - cron: '0 8 * * 5' # every Friday at 8am UTC + +jobs: + autoblack: + if: github.repository_owner == 'explosion' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.head_ref }} + - uses: actions/setup-python@v2 + - run: pip install black + - name: Auto-format code if needed + run: black spacy + # We can't run black --check here because that returns a non-zero excit + # code and makes GitHub think the action failed + - name: Check for modified files + id: git-check + run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) + - name: Create Pull Request + if: steps.git-check.outputs.modified == 'true' + uses: peter-evans/create-pull-request@v3 + with: + title: Auto-format code with black + labels: meta + commit-message: Auto-format code with black + committer: GitHub + author: explosion-bot + body: _This PR is auto-generated._ + branch: autoblack + delete-branch: true + draft: false + - name: Check outputs + if: steps.git-check.outputs.modified == 'true' + run: | + echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" + echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..a7a12fd24 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,12 @@ +repos: +- repo: https://github.com/ambv/black + rev: 21.6b0 + hooks: + - id: black + language_version: python3.7 +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.9.2 + hooks: + - id: flake8 + args: + - "--config=setup.cfg" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 072981270..3a94b9b67 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,11 +2,7 @@ # Contribute to spaCy -Thanks for your interest in contributing to spaCy 🎉 The project is maintained -by **[@honnibal](https://github.com/honnibal)**, -**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and -**[@adrianeboyd](https://github.com/adrianeboyd)**, -and we'll do our best to help you get started. This page will give you a quick +Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick overview of how things are organized and most importantly, how to get involved. ## Table of contents @@ -181,6 +177,15 @@ tools installed. **⚠️ Note that formatting and linting is currently only possible for Python modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** +### Pre-Commit Hooks + +After cloning the repo, after installing the packages from `requirements.txt`, enter the repo folder and run `pre-commit install`. +Each time a `git commit` is initiated, `black` and `flake8` will run automatically on the modified files only. 
+ +In case of error, or when `black` modified a file, the modified file needs to be `git add` once again and a new +`git commit` has to be issued. + + ### Code formatting [`black`](https://github.com/ambv/black) is an opinionated Python code diff --git a/Makefile b/Makefile index 53d0b4203..4de628663 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ SHELL := /bin/bash ifndef SPACY_EXTRAS -override SPACY_EXTRAS = spacy-lookups-data==1.0.0 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2 +override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2 endif ifndef PYVER diff --git a/README.md b/README.md index 3bc7ba0f1..61d5449a4 100644 --- a/README.md +++ b/README.md @@ -61,11 +61,11 @@ open-source software, released under the MIT license. ## 💬 Where to ask questions The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**, -**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and -**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't -be able to provide individual support via email. We also believe that help is -much more valuable if it's shared publicly, so that more people can benefit from -it. +**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**, +**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**. +Please understand that we won't be able to provide individual support via email. +We also believe that help is much more valuable if it's shared publicly, so that +more people can benefit from it. | Type | Platforms | | ------------------------------- | --------------------------------------- | diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5840b916b..ac80b8a10 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,13 +22,13 @@ jobs: # defined in .flake8 and overwrites the selected codes. 
- job: "Validate" pool: - vmImage: "ubuntu-16.04" + vmImage: "ubuntu-18.04" steps: - task: UsePythonVersion@0 inputs: versionSpec: "3.7" - script: | - pip install flake8==3.5.0 + pip install flake8==3.9.2 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics displayName: "flake8" @@ -38,7 +38,7 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-16.04" + imageName: "ubuntu-18.04" python.version: "3.6" # Python36Windows: # imageName: "vs2017-win2016" @@ -47,7 +47,7 @@ jobs: # imageName: "macos-10.14" # python.version: "3.6" # Python37Linux: - # imageName: "ubuntu-16.04" + # imageName: "ubuntu-18.04" # python.version: "3.7" Python37Windows: imageName: "vs2017-win2016" @@ -56,7 +56,7 @@ jobs: # imageName: "macos-10.14" # python.version: "3.7" # Python38Linux: - # imageName: "ubuntu-16.04" + # imageName: "ubuntu-18.04" # python.version: "3.8" # Python38Windows: # imageName: "vs2017-win2016" @@ -65,7 +65,7 @@ jobs: imageName: "macos-10.14" python.version: "3.8" Python39Linux: - imageName: "ubuntu-16.04" + imageName: "ubuntu-18.04" python.version: "3.9" Python39Windows: imageName: "vs2017-win2016" diff --git a/pyproject.toml b/pyproject.toml index 3e34a0b2d..07091123a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ [build-system] requires = [ "setuptools", - "cython>=0.25", + "cython>=0.25,<3.0", "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.3,<8.1.0", + "thinc>=8.0.8,<8.1.0", "blis>=0.4.0,<0.8.0", "pathy", "numpy>=1.15.0", diff --git a/requirements.txt b/requirements.txt index a8a15a01b..ad8c70318 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,30 +1,31 @@ # Our libraries -spacy-legacy>=3.0.5,<3.1.0 +spacy-legacy>=3.0.7,<3.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.3,<8.1.0 +thinc>=8.0.8,<8.1.0 blis>=0.4.0,<0.8.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 -catalogue>=2.0.3,<2.1.0 +catalogue>=2.0.4,<2.1.0 typer>=0.3.0,<0.4.0 pathy>=0.3.5 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.7.1,<1.8.0 +pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 jinja2 # Official Python utilities setuptools packaging>=20.0 typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8" # Development dependencies -cython>=0.25 +pre-commit>=2.13.0 +cython>=0.25,<3.0 pytest>=5.2.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 -flake8>=3.5.0,<3.6.0 +flake8>=3.8.0,<3.10.0 hypothesis>=3.27.0,<7.0.0 diff --git a/setup.cfg b/setup.cfg index cd55911fe..1fa5b828d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,37 +22,40 @@ classifiers = Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Topic :: Scientific/Engineering +project_urls = + Release notes = https://github.com/explosion/spaCy/releases + Source = https://github.com/explosion/spaCy [options] zip_safe = false include_package_data = true python_requires = >=3.6 setup_requires = - cython>=0.25 + cython>=0.25,<3.0 numpy>=1.15.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.3,<8.1.0 + thinc>=8.0.8,<8.1.0 install_requires = # Our libraries - spacy-legacy>=3.0.5,<3.1.0 + spacy-legacy>=3.0.7,<3.1.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.3,<8.1.0 + thinc>=8.0.8,<8.1.0 blis>=0.4.0,<0.8.0 wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 - 
catalogue>=2.0.3,<2.1.0 + catalogue>=2.0.4,<2.1.0 typer>=0.3.0,<0.4.0 pathy>=0.3.5 # Third-party dependencies tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 - pydantic>=1.7.1,<1.8.0 + pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 jinja2 # Official Python utilities setuptools @@ -61,37 +64,37 @@ install_requires = [options.entry_points] console_scripts = - spacy = spacy.cli:app + spacy = spacy.cli:setup_cli [options.extras_require] lookups = - spacy_lookups_data>=1.0.1,<1.1.0 + spacy_lookups_data>=1.0.2,<1.1.0 transformers = spacy_transformers>=1.0.1,<1.1.0 ray = spacy_ray>=0.1.0,<1.0.0 cuda = - cupy>=5.0.0b4,<9.0.0 + cupy>=5.0.0b4,<10.0.0 cuda80 = - cupy-cuda80>=5.0.0b4,<9.0.0 + cupy-cuda80>=5.0.0b4,<10.0.0 cuda90 = - cupy-cuda90>=5.0.0b4,<9.0.0 + cupy-cuda90>=5.0.0b4,<10.0.0 cuda91 = - cupy-cuda91>=5.0.0b4,<9.0.0 + cupy-cuda91>=5.0.0b4,<10.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<9.0.0 + cupy-cuda92>=5.0.0b4,<10.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<9.0.0 + cupy-cuda100>=5.0.0b4,<10.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<9.0.0 + cupy-cuda101>=5.0.0b4,<10.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<9.0.0 + cupy-cuda102>=5.0.0b4,<10.0.0 cuda110 = - cupy-cuda110>=5.0.0b4,<9.0.0 + cupy-cuda110>=5.0.0b4,<10.0.0 cuda111 = - cupy-cuda111>=5.0.0b4,<9.0.0 + cupy-cuda111>=5.0.0b4,<10.0.0 cuda112 = - cupy-cuda112>=5.0.0b4,<9.0.0 + cupy-cuda112>=5.0.0b4,<10.0.0 # Language tokenizers with external dependencies ja = sudachipy>=0.4.9 @@ -108,7 +111,7 @@ universal = false formats = gztar [flake8] -ignore = E203, E266, E501, E731, W503, E741 +ignore = E203, E266, E501, E731, W503, E741, F541 max-line-length = 80 select = B,C,E,F,W,T4,B9 exclude = diff --git a/spacy/__init__.py b/spacy/__init__.py index d07931cfd..ca47edc94 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,7 +4,8 @@ import sys # set library-specific custom warning handling before doing anything else from .errors import setup_default_warnings -setup_default_warnings() + +setup_default_warnings() # noqa: E402 # These are imported as part of the API from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401 diff --git a/spacy/about.py b/spacy/about.py index dc521045c..51154dc1a 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.1.0.dev0" +__version__ = "3.1.1" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index b15db7599..9122de17b 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -74,7 +74,6 @@ IDS = { "SUFFIX": SUFFIX, "LENGTH": LENGTH, - "CLUSTER": CLUSTER, "LEMMA": LEMMA, "POS": POS, "TAG": TAG, @@ -85,9 +84,7 @@ IDS = { "ENT_KB_ID": ENT_KB_ID, "HEAD": HEAD, "SENT_START": SENT_START, - "SENT_END": SENT_END, "SPACY": SPACY, - "PROB": PROB, "LANG": LANG, "MORPH": MORPH, "IDX": IDX diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 228cc622a..ed1e840a5 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -2,7 +2,7 @@ from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECK import sys import shutil from pathlib import Path -from wasabi import msg +from wasabi import msg, Printer import srsly import hashlib import typer @@ -504,12 +504,16 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in return result -def setup_gpu(use_gpu: int) -> None: +def 
setup_gpu(use_gpu: int, silent=None) -> None: """Configure the GPU and log info.""" + if silent is None: + local_msg = Printer() + else: + local_msg = Printer(no_print=silent, pretty=not silent) if use_gpu >= 0: - msg.info(f"Using GPU: {use_gpu}") + local_msg.info(f"Using GPU: {use_gpu}") require_gpu(use_gpu) else: - msg.info("Using CPU") + local_msg.info("Using CPU") if has_cupy and gpu_is_available(): - msg.info("To switch to GPU 0, use the option: --gpu-id 0") + local_msg.info("To switch to GPU 0, use the option: --gpu-id 0") diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py index f63c51857..1cfa290a3 100644 --- a/spacy/cli/assemble.py +++ b/spacy/cli/assemble.py @@ -6,7 +6,6 @@ import logging from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code -from ..training.initialize import init_nlp from .. import util from ..util import get_sourced_components, load_model_from_config diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 3351e53fe..3f368f57d 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -101,13 +101,14 @@ def debug_data( # Create the gold corpus to be able to better analyze data dot_names = [T["train_corpus"], T["dev_corpus"]] train_corpus, dev_corpus = resolve_dot_names(config, dot_names) + + nlp.initialize(lambda: train_corpus(nlp)) + msg.good("Pipeline can be initialized with data") + train_dataset = list(train_corpus(nlp)) dev_dataset = list(dev_corpus(nlp)) msg.good("Corpus is loadable") - nlp.initialize(lambda: train_dataset) - msg.good("Pipeline can be initialized with data") - # Create all gold data here to avoid iterating over the train_dataset constantly gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True) gold_train_unpreprocessed_data = _compile_gold( @@ -173,8 +174,9 @@ def debug_data( ) n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values()) msg.warn( - "{} words in training data without vectors ({:0.2f}%)".format( - n_missing_vectors, n_missing_vectors / gold_train_data["n_words"] + "{} words in training data without vectors ({:.0f}%)".format( + n_missing_vectors, + 100 * (n_missing_vectors / gold_train_data["n_words"]), ), ) msg.text( @@ -282,42 +284,7 @@ def debug_data( labels = _get_labels_from_model(nlp, "textcat") msg.info(f"Text Classification: {len(labels)} label(s)") msg.text(f"Labels: {_format_labels(labels)}", show=verbose) - labels_with_counts = _format_labels( - gold_train_data["cats"].most_common(), counts=True - ) - msg.text(f"Labels in train data: {labels_with_counts}", show=verbose) - missing_labels = labels - set(gold_train_data["cats"].keys()) - if missing_labels: - msg.warn( - "Some model labels are not present in the train data. The " - "model performance may be degraded for these labels after " - f"training: {_format_labels(missing_labels)}." - ) - if gold_train_data["n_cats_multilabel"] > 0: - # Note: you should never get here because you run into E895 on - # initialization first. - msg.warn( - "The train data contains instances without " - "mutually-exclusive classes. Use the component " - "'textcat_multilabel' instead of 'textcat'." - ) - if gold_dev_data["n_cats_multilabel"] > 0: - msg.fail( - "Train/dev mismatch: the dev data contains instances " - "without mutually-exclusive classes while the train data " - "contains only instances with mutually-exclusive classes." 
- ) - - if "textcat_multilabel" in factory_names: - msg.divider("Text Classification (Multilabel)") - labels = _get_labels_from_model(nlp, "textcat_multilabel") - msg.info(f"Text Classification: {len(labels)} label(s)") - msg.text(f"Labels: {_format_labels(labels)}", show=verbose) - labels_with_counts = _format_labels( - gold_train_data["cats"].most_common(), counts=True - ) - msg.text(f"Labels in train data: {labels_with_counts}", show=verbose) - missing_labels = labels - set(gold_train_data["cats"].keys()) + missing_labels = labels - set(gold_train_data["cats"]) if missing_labels: msg.warn( "Some model labels are not present in the train data. The " @@ -325,17 +292,76 @@ def debug_data( f"training: {_format_labels(missing_labels)}." ) if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]): - msg.fail( - f"The train and dev labels are not the same. " + msg.warn( + "Potential train/dev mismatch: the train and dev labels are " + "not the same. " f"Train labels: {_format_labels(gold_train_data['cats'])}. " f"Dev labels: {_format_labels(gold_dev_data['cats'])}." ) + if len(labels) < 2: + msg.fail( + "The model does not have enough labels. 'textcat' requires at " + "least two labels due to mutually-exclusive classes, e.g. " + "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary " + "classification task." + ) + if ( + gold_train_data["n_cats_bad_values"] > 0 + or gold_dev_data["n_cats_bad_values"] > 0 + ): + msg.fail( + "Unsupported values for cats: the supported values are " + "1.0/True and 0.0/False." + ) + if gold_train_data["n_cats_multilabel"] > 0: + # Note: you should never get here because you run into E895 on + # initialization first. + msg.fail( + "The train data contains instances without mutually-exclusive " + "classes. Use the component 'textcat_multilabel' instead of " + "'textcat'." + ) + if gold_dev_data["n_cats_multilabel"] > 0: + msg.fail( + "The dev data contains instances without mutually-exclusive " + "classes. Use the component 'textcat_multilabel' instead of " + "'textcat'." + ) + + if "textcat_multilabel" in factory_names: + msg.divider("Text Classification (Multilabel)") + labels = _get_labels_from_model(nlp, "textcat_multilabel") + msg.info(f"Text Classification: {len(labels)} label(s)") + msg.text(f"Labels: {_format_labels(labels)}", show=verbose) + missing_labels = labels - set(gold_train_data["cats"]) + if missing_labels: + msg.warn( + "Some model labels are not present in the train data. The " + "model performance may be degraded for these labels after " + f"training: {_format_labels(missing_labels)}." + ) + if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]): + msg.warn( + "Potential train/dev mismatch: the train and dev labels are " + "not the same. " + f"Train labels: {_format_labels(gold_train_data['cats'])}. " + f"Dev labels: {_format_labels(gold_dev_data['cats'])}." + ) + if ( + gold_train_data["n_cats_bad_values"] > 0 + or gold_dev_data["n_cats_bad_values"] > 0 + ): + msg.fail( + "Unsupported values for cats: the supported values are " + "1.0/True and 0.0/False." + ) if gold_train_data["n_cats_multilabel"] > 0: if gold_dev_data["n_cats_multilabel"] == 0: msg.warn( "Potential train/dev mismatch: the train data contains " "instances without mutually-exclusive classes while the " - "dev data does not." + "dev data contains only instances with mutually-exclusive " + "classes." 
) else: msg.warn( @@ -556,6 +582,7 @@ def _compile_gold( "n_nonproj": 0, "n_cycles": 0, "n_cats_multilabel": 0, + "n_cats_bad_values": 0, "texts": set(), } for eg in examples: @@ -599,7 +626,9 @@ def _compile_gold( data["ner"]["-"] += 1 if "textcat" in factory_names or "textcat_multilabel" in factory_names: data["cats"].update(gold.cats) - if list(gold.cats.values()).count(1.0) != 1: + if any(val not in (0, 1) for val in gold.cats.values()): + data["n_cats_bad_values"] += 1 + if list(gold.cats.values()).count(1) != 1: data["n_cats_multilabel"] += 1 if "tagger" in factory_names: tags = eg.get_aligned("TAG", as_string=True) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 54c09c850..190094d81 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -1,10 +1,11 @@ -from typing import Dict, Any, Optional, Iterable +from typing import Dict, Any, Optional from pathlib import Path +import itertools from spacy.training import Example from spacy.util import resolve_dot_names from wasabi import msg -from thinc.api import fix_random_seed, set_dropout_rate, Adam +from thinc.api import fix_random_seed, set_dropout_rate from thinc.api import Model, data_validation, set_gpu_allocator import typer @@ -73,23 +74,24 @@ def debug_model_cli( msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) pipe = nlp.get_pipe(component) - if not hasattr(pipe, "model"): - msg.fail( - f"The component '{component}' does not specify an object that holds a Model.", - exits=1, - ) - model = pipe.model - debug_model(config, T, nlp, model, print_settings=print_settings) + + debug_model(config, T, nlp, pipe, print_settings=print_settings) def debug_model( config, resolved_train_config, nlp, - model: Model, + pipe, *, print_settings: Optional[Dict[str, Any]] = None, ): + if not hasattr(pipe, "model"): + msg.fail( + f"The component '{pipe}' does not specify an object that holds a Model.", + exits=1, + ) + model = pipe.model if not isinstance(model, Model): msg.fail( f"Requires a Thinc Model to be analysed, but found {type(model)} instead.", @@ -105,8 +107,6 @@ def debug_model( _print_model(model, print_settings) # STEP 1: Initializing the model and printing again - X = _get_docs() - # The output vector might differ from the official type of the output layer with data_validation(False): try: dot_names = [resolved_train_config["train_corpus"]] @@ -114,15 +114,17 @@ def debug_model( (train_corpus,) = resolve_dot_names(config, dot_names) nlp.initialize(lambda: train_corpus(nlp)) msg.info("Initialized the model with the training corpus.") + examples = list(itertools.islice(train_corpus(nlp), 5)) except ValueError: try: _set_output_dim(nO=7, model=model) with show_validation_error(): - nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X]) + examples = [Example.from_dict(x, {}) for x in _get_docs()] + nlp.initialize(lambda: examples) msg.info("Initialized the model with dummy data.") except Exception: msg.fail( - "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.", + "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.", exits=1, ) @@ -131,28 +133,26 @@ def debug_model( _print_model(model, print_settings) # STEP 2: Updating the model and printing again - optimizer = Adam(0.001) set_dropout_rate(model, 0.2) - # ugly hack to deal with Tok2Vec listeners - tok2vec = None - if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener": - tok2vec = 
nlp.get_pipe("tok2vec") - goldY = None + # ugly hack to deal with Tok2Vec/Transformer listeners + upstream_component = None + if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name: + upstream_component = nlp.get_pipe("tok2vec") + if ( + model.has_ref("tok2vec") + and "transformer-listener" in model.get_ref("tok2vec").name + ): + upstream_component = nlp.get_pipe("transformer") for e in range(3): - if tok2vec: - tok2vec.update([Example.from_dict(x, {}) for x in X]) - Y, get_dX = model.begin_update(X) - if goldY is None: - goldY = _simulate_gold(Y) - dY = get_gradient(goldY, Y, model.ops) - get_dX(dY) - model.finish_update(optimizer) + if upstream_component: + upstream_component.update(examples) + pipe.update(examples) if print_settings.get("print_after_training"): msg.divider(f"STEP 2 - after training") _print_model(model, print_settings) # STEP 3: the final prediction - prediction = model.predict(X) + prediction = model.predict([ex.predicted for ex in examples]) if print_settings.get("print_prediction"): msg.divider(f"STEP 3 - prediction") msg.info(str(prediction)) @@ -160,19 +160,6 @@ def debug_model( msg.good(f"Succesfully ended analysis - model looks good.") -def get_gradient(goldY, Y, ops): - return ops.asarray(Y) - ops.asarray(goldY) - - -def _simulate_gold(element, counter=1): - if isinstance(element, Iterable): - for i in range(len(element)): - element[i] = _simulate_gold(element[i], counter + i) - return element - else: - return 1 / counter - - def _sentences(): return [ "Apple is looking at buying U.K. startup for $1 billion", @@ -209,11 +196,7 @@ def _print_model(model, print_settings): if dimensions: for name in node.dim_names: - if node.has_dim(name): - msg.info(f" - dim {name}: {node.get_dim(name)}") - else: - msg.info(f" - dim {name}: {node.has_dim(name)}") - + msg.info(f" - dim {name}: {node.maybe_get_dim(name)}") if parameters: for name in node.param_names: if node.has_param(name): diff --git a/spacy/cli/download.py b/spacy/cli/download.py index d09d5147a..4ea9a8f0e 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -6,7 +6,7 @@ import typer from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX from .. 
import about -from ..util import is_package, get_base_version, run_command +from ..util import is_package, get_minor_version, run_command from ..errors import OLD_MODEL_SHORTCUTS @@ -74,7 +74,7 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) - def get_compatibility() -> dict: - version = get_base_version(about.__version__) + version = get_minor_version(about.__version__) r = requests.get(about.__compatibility__) if r.status_code != 200: msg.fail( diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 368af8d49..378911a20 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Dict +from typing import Optional, List, Dict, Any, Union from wasabi import Printer from pathlib import Path import re @@ -60,10 +60,11 @@ def evaluate( displacy_path: Optional[Path] = None, displacy_limit: int = 25, silent: bool = True, -) -> Scorer: + spans_key: str = "sc", +) -> Dict[str, Any]: msg = Printer(no_print=silent, pretty=not silent) fix_random_seed() - setup_gpu(use_gpu) + setup_gpu(use_gpu, silent=silent) data_path = util.ensure_path(data_path) output_path = util.ensure_path(output) displacy_path = util.ensure_path(displacy_path) @@ -90,6 +91,9 @@ def evaluate( "SENT P": "sents_p", "SENT R": "sents_r", "SENT F": "sents_f", + "SPAN P": f"spans_{spans_key}_p", + "SPAN R": f"spans_{spans_key}_r", + "SPAN F": f"spans_{spans_key}_f", "SPEED": "speed", } results = {} @@ -108,27 +112,7 @@ def evaluate( data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] msg.table(results, title="Results") - - if "morph_per_feat" in scores: - if scores["morph_per_feat"]: - print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat") - data["morph_per_feat"] = scores["morph_per_feat"] - if "dep_las_per_type" in scores: - if scores["dep_las_per_type"]: - print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type") - data["dep_las_per_type"] = scores["dep_las_per_type"] - if "ents_per_type" in scores: - if scores["ents_per_type"]: - print_prf_per_type(msg, scores["ents_per_type"], "NER", "type") - data["ents_per_type"] = scores["ents_per_type"] - if "cats_f_per_type" in scores: - if scores["cats_f_per_type"]: - print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label") - data["cats_f_per_type"] = scores["cats_f_per_type"] - if "cats_auc_per_type" in scores: - if scores["cats_auc_per_type"]: - print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) - data["cats_auc_per_type"] = scores["cats_auc_per_type"] + data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] @@ -151,6 +135,43 @@ def evaluate( return data +def handle_scores_per_type( + scores: Union[Scorer, Dict[str, Any]], + data: Dict[str, Any] = {}, + *, + spans_key: str = "sc", + silent: bool = False, +) -> Dict[str, Any]: + msg = Printer(no_print=silent, pretty=not silent) + if "morph_per_feat" in scores: + if scores["morph_per_feat"]: + print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat") + data["morph_per_feat"] = scores["morph_per_feat"] + if "dep_las_per_type" in scores: + if scores["dep_las_per_type"]: + print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type") + data["dep_las_per_type"] = scores["dep_las_per_type"] + if "ents_per_type" in scores: + if scores["ents_per_type"]: + print_prf_per_type(msg, scores["ents_per_type"], "NER", "type") + data["ents_per_type"] = scores["ents_per_type"] + 
if f"spans_{spans_key}_per_type" in scores: + if scores[f"spans_{spans_key}_per_type"]: + print_prf_per_type( + msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type" + ) + data[f"spans_{spans_key}_per_type"] = scores[f"spans_{spans_key}_per_type"] + if "cats_f_per_type" in scores: + if scores["cats_f_per_type"]: + print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label") + data["cats_f_per_type"] = scores["cats_f_per_type"] + if "cats_auc_per_type" in scores: + if scores["cats_auc_per_type"]: + print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) + data["cats_auc_per_type"] = scores["cats_auc_per_type"] + return scores + + def render_parses( docs: List[Doc], output_path: Path, diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 7c262d84d..2a920cdda 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -108,6 +108,10 @@ def init_labels_cli( config = util.load_config(config_path, overrides=overrides) with show_validation_error(hint_fill=False): nlp = init_nlp(config, use_gpu=use_gpu) + _init_labels(nlp, output_path) + + +def _init_labels(nlp, output_path): for name, component in nlp.pipeline: if getattr(component, "label_data", None) is not None: output_file = output_path / f"{name}.json" diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 58e191f65..342baa8ab 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,7 +1,7 @@ from typing import Optional, Union, Any, Dict, List, Tuple import shutil from pathlib import Path -from wasabi import Printer, get_raw_input +from wasabi import Printer, MarkdownRenderer, get_raw_input import srsly import sys @@ -18,7 +18,7 @@ def package_cli( output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"), meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), - create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), + create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"), name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."), @@ -133,7 +133,15 @@ def package( for file_name in FILENAMES_DOCS: file_path = package_path / model_name_v / file_name if file_path.exists(): - shutil.move(str(file_path), str(main_path)) + shutil.copy(str(file_path), str(main_path)) + readme_path = main_path / "README.md" + if not readme_path.exists(): + readme = generate_readme(meta) + create_file(readme_path, readme) + create_file(package_path / model_name_v / "README.md", readme) + msg.good("Generated README.md from meta.json") + else: + msg.info("Using existing README.md from pipeline directory") imports = [] for code_path in code_paths: imports.append(code_path.stem) @@ -197,8 +205,9 @@ def get_meta( "url": "", "license": "MIT", } - meta.update(existing_meta) nlp = util.load_model_from_path(Path(model_path)) + meta.update(nlp.meta) + meta.update(existing_meta) meta["spacy_version"] = util.get_model_version_range(about.__version__) meta["vectors"] = { "width": nlp.vocab.vectors_length, @@ -234,6 
+243,113 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any] return meta +def generate_readme(meta: Dict[str, Any]) -> str: + """ + Generate a Markdown-formatted README text from a model meta.json. Used + within the GitHub release notes and as content for README.md file added + to model packages. + """ + md = MarkdownRenderer() + lang = meta["lang"] + name = f"{lang}_{meta['name']}" + version = meta["version"] + pipeline = ", ".join([md.code(p) for p in meta.get("pipeline", [])]) + components = ", ".join([md.code(p) for p in meta.get("components", [])]) + vecs = meta.get("vectors", {}) + vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({ vecs.get('width', 0)} dimensions)" + author = meta.get("author") or "n/a" + notes = meta.get("notes", "") + license_name = meta.get("license") + sources = _format_sources(meta.get("sources")) + description = meta.get("description") + label_scheme = _format_label_scheme(meta.get("labels")) + accuracy = _format_accuracy(meta.get("performance")) + table_data = [ + (md.bold("Name"), md.code(name)), + (md.bold("Version"), md.code(version)), + (md.bold("spaCy"), md.code(meta["spacy_version"])), + (md.bold("Default Pipeline"), pipeline), + (md.bold("Components"), components), + (md.bold("Vectors"), vectors), + (md.bold("Sources"), sources or "n/a"), + (md.bold("License"), md.code(license_name) if license_name else "n/a"), + (md.bold("Author"), md.link(author, meta["url"]) if "url" in meta else author), + ] + # Put together Markdown body + if description: + md.add(description) + md.add(md.table(table_data, ["Feature", "Description"])) + if label_scheme: + md.add(md.title(3, "Label Scheme")) + md.add(label_scheme) + if accuracy: + md.add(md.title(3, "Accuracy")) + md.add(accuracy) + if notes: + md.add(notes) + return md.text + + +def _format_sources(data: Any) -> str: + if not data or not isinstance(data, list): + return "n/a" + sources = [] + for source in data: + if not isinstance(source, dict): + source = {"name": source} + name = source.get("name") + if not name: + continue + url = source.get("url") + author = source.get("author") + result = name if not url else "[{}]({})".format(name, url) + if author: + result += " ({})".format(author) + sources.append(result) + return "
".join(sources) + + +def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str: + if not data: + return "" + md = MarkdownRenderer() + scalars = [(k, v) for k, v in data.items() if isinstance(v, (int, float))] + scores = [ + (md.code(acc.upper()), f"{score*100:.2f}") + for acc, score in scalars + if acc not in exclude + ] + md.add(md.table(scores, ["Type", "Score"])) + return md.text + + +def _format_label_scheme(data: Dict[str, Any]) -> str: + if not data: + return "" + md = MarkdownRenderer() + n_labels = 0 + n_pipes = 0 + label_data = [] + for pipe, labels in data.items(): + if not labels: + continue + col1 = md.bold(md.code(pipe)) + col2 = ", ".join( + [md.code(label.replace("|", "\\|")) for label in labels] + ) # noqa: W605 + label_data.append((col1, col2)) + n_labels += len(labels) + n_pipes += 1 + if not label_data: + return "" + label_info = f"View label scheme ({n_labels} labels for {n_pipes} components)" + md.add("
") + md.add(f"{label_info}") + md.add(md.table(label_data, ["Component", "Labels"])) + md.add("
") + return md.text + + TEMPLATE_SETUP = """ #!/usr/bin/env python import io @@ -248,6 +364,13 @@ def load_meta(fp): return json.load(f) +def load_readme(fp): + if path.exists(fp): + with io.open(fp, encoding='utf8') as f: + return f.read() + return "" + + def list_files(data_dir): output = [] for root, _, filenames in walk(data_dir): @@ -273,6 +396,8 @@ def setup_package(): root = path.abspath(path.dirname(__file__)) meta_path = path.join(root, 'meta.json') meta = load_meta(meta_path) + readme_path = path.join(root, 'README.md') + readme = load_readme(readme_path) model_name = str(meta['lang'] + '_' + meta['name']) model_dir = path.join(model_name, model_name + '-' + meta['version']) @@ -282,6 +407,7 @@ def setup_package(): setup( name=model_name, description=meta.get('description'), + long_description=readme, author=meta.get('author'), author_email=meta.get('email'), url=meta.get('url'), @@ -303,6 +429,8 @@ if __name__ == '__main__': TEMPLATE_MANIFEST = """ include meta.json include LICENSE +include LICENSES_SOURCES +include README.md """.strip() diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 1f8fc99cc..fe3ce0dad 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -95,6 +95,13 @@ def verify_cli_args(config_path, output_dir, resume_path, epoch_resume): "then the new directory will be created for you.", ) if resume_path is not None: + if resume_path.is_dir(): + # This is necessary because Windows gives a Permission Denied when we + # try to open the directory later, which is confusing. See #7878 + msg.fail( + "--resume-path should be a weights file, but {resume_path} is a directory.", + exits=True, + ) model_name = re.search(r"model\d+\.bin", str(resume_path)) if not model_name and not epoch_resume: msg.fail( diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 5339d2a21..ececc2507 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -212,6 +212,9 @@ def check_rerun( strict_version (bool): RETURNS (bool): Whether to re-run the command. 
""" + # Always rerun if no-skip is set + if command.get("no_skip", False): + return True lock_path = project_dir / PROJECT_LOCK if not lock_path.exists(): # We don't have a lockfile, run command return True diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0d422318b..339fb1e96 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -151,14 +151,14 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" [components.textcat.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false {% else -%} [components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -182,14 +182,14 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" [components.textcat_multilabel.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false {% else -%} [components.textcat_multilabel.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -316,14 +316,14 @@ nO = null width = ${components.tok2vec.model.encode.width} [components.textcat.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false {% else -%} [components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -344,14 +344,14 @@ nO = null width = ${components.tok2vec.model.encode.width} [components.textcat_multilabel.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false {% else -%} [components.textcat_multilabel.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -418,7 +418,7 @@ compound = 1.001 [initialize] {% if use_transformer or optimize == "efficiency" or not word_vectors -%} -vectors = null +vectors = ${paths.vectors} {% else -%} vectors = "{{ word_vectors }}" {% endif -%} diff --git a/spacy/cli/train.py b/spacy/cli/train.py index dc5b332d7..2932edd3b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -28,7 +28,7 @@ def train_cli( """ Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from other formats, use the `spacy convert` command. The - config file includes all settings and hyperparameters used during traing. + config file includes all settings and hyperparameters used during training. To override settings in the config, e.g. settings that point to local paths or that you want to experiment with, you can override them as command line options. For instance, --training.batch_size 128 overrides diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 67fc16383..a727e380e 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -3,10 +3,11 @@ from pathlib import Path import sys import requests from wasabi import msg, Printer +import warnings from ._util import app from .. 
import about -from ..util import get_package_version, get_installed_models, get_base_version +from ..util import get_package_version, get_installed_models, get_minor_version from ..util import get_package_path, get_model_meta, is_compatible_version @@ -24,7 +25,7 @@ def validate_cli(): def validate() -> None: model_pkgs, compat = get_model_pkgs() - spacy_version = get_base_version(about.__version__) + spacy_version = get_minor_version(about.__version__) current_compat = compat.get(spacy_version, {}) if not current_compat: msg.warn(f"No compatible packages found for v{spacy_version} of spaCy") @@ -44,8 +45,8 @@ def validate() -> None: comp = msg.text("", color="green", icon="good", no_print=True) version = msg.text(data["version"], color="green", no_print=True) else: - version = msg.text(data["version"], color="red", no_print=True) - comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" + version = msg.text(data["version"], color="yellow", no_print=True) + comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}" rows.append((data["name"], data["spacy"], version, comp)) msg.table(rows, header=header) else: @@ -78,7 +79,9 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]: msg.good("Loaded compatibility table") compat = r.json()["spacy"] all_models = set() - installed_models = get_installed_models() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="\\[W09[45]") + installed_models = get_installed_models() for spacy_v, models in dict(compat).items(): all_models.update(models.keys()) for model, model_vs in models.items(): @@ -92,7 +95,9 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]: spacy_version = about.__version__ else: model_path = get_package_path(package) - model_meta = get_model_meta(model_path) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="\\[W09[45]") + model_meta = get_model_meta(model_path) spacy_version = model_meta.get("spacy_version", "n/a") is_compat = is_compatible_version(about.__version__, spacy_version) pkgs[pkg_name] = { diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index aa61fb9f7..78b83f2e5 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -120,7 +120,9 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: doc (Doc): Document do parse. RETURNS (dict): Generated dependency parse keyed by words and arcs. """ - doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data", "user_hooks"])) + doc = Doc(orig_doc.vocab).from_bytes( + orig_doc.to_bytes(exclude=["user_data", "user_hooks"]) + ) if not doc.has_annotation("DEP"): warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): diff --git a/spacy/errors.py b/spacy/errors.py index 2e8cc4494..5651ab0fa 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -150,12 +150,12 @@ class Warnings: "released, because the model may say it's compatible when it's " 'not. Consider changing the "spacy_version" in your meta.json to a ' "version range, with a lower and upper pin. For example: {example}") - W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is " - "incompatible with the current version ({current}). This may lead " - "to unexpected results or runtime errors. To resolve this, " - "download a newer compatible model or retrain your custom model " - "with the current spaCy version. 
For more details and available " - "updates, run: python -m spacy validate") + W095 = ("Model '{model}' ({model_version}) was trained with spaCy " + "{version} and may not be 100% compatible with the current version " + "({current}). If you see errors or degraded performance, download " + "a newer compatible model or retrain your custom model with the " + "current spaCy version. For more details and available updates, " + "run: python -m spacy validate") W096 = ("The method `nlp.disable_pipes` is now deprecated - use " "`nlp.select_pipes` instead.") W100 = ("Skipping unsupported morphological feature(s): '{feature}'. " @@ -406,21 +406,10 @@ class Errors: E125 = ("Unexpected value: {value}") E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. " "This is likely a bug in spaCy, so feel free to open an issue.") - E129 = ("Cannot write the label of an existing Span object because a Span " - "is a read-only view of the underlying Token objects stored in the " - "Doc. Instead, create a new Span object and specify the `label` " - "keyword argument, for example:\nfrom spacy.tokens import Span\n" - "span = Span(doc, start={start}, end={end}, label='{label}')") E130 = ("You are running a narrow unicode build, which is incompatible " "with spacy >= 2.1.0. To fix this, reinstall Python and use a wide " "unicode build instead. You can also rebuild Python and set the " "`--enable-unicode=ucs4 flag`.") - E131 = ("Cannot write the kb_id of an existing Span object because a Span " - "is a read-only view of the underlying Token objects stored in " - "the Doc. Instead, create a new Span object and specify the " - "`kb_id` keyword argument, for example:\nfrom spacy.tokens " - "import Span\nspan = Span(doc, start={start}, end={end}, " - "label='{label}', kb_id='{kb_id}')") E132 = ("The vectors for entities and probabilities for alias '{alias}' " "should have equal length, but found {entities_length} and " "{probabilities_length} respectively.") @@ -532,6 +521,24 @@ class Errors: E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x + E867 = ("The 'textcat' component requires at least two labels because it " + "uses mutually exclusive classes where exactly one label is True " + "for each doc. For binary classification tasks, you can use two " + "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you " + "can use the 'textcat_multilabel' component with one label.") + E868 = ("Found a conflicting gold annotation in a reference document, " + "with the following char-based span occurring both in the gold ents " + "as well as in the negative spans: {span}.") + E869 = ("The notation '{label}' is not supported anymore. To annotate " + "negative NER samples, use `doc.spans[key]` instead, and " + "specify the key as 'incorrect_spans_key' when constructing " + "the NER component.") + E870 = ("Could not serialize the DocBin because it is too large. Consider " + "splitting up your documents into several doc bins and serializing " + "each separately. spacy.Corpus.v1 will search recursively for all " + "*.spacy files if you provide a directory instead of a filename as " + "the 'path'.") + E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}") E872 = ("Unable to copy tokenizer from base model due to different " 'tokenizer settings: current tokenizer config "{curr_config}" ' 'vs. base model "{base_config}"') @@ -851,6 +858,15 @@ class Errors: "DependencyMatcher token patterns. 
The token pattern in " "RIGHT_ATTR should return matches that are each exactly one token " "long. Invalid pattern:\n{node}") + E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency " + "parses. If no dependency labels are available, provide " + "placeholder deps such as `deps=[\"dep\"]*len(heads)`.") + E1018 = ("Knowledge base for component '{name}' is not set. " + "Make sure either `nel.initialize` or `nel.set_kb` " + "is called with a `kb_loader` function.") + E1019 = ("`noun_chunks` requires the pos tagging, which requires a " + "statistical model to be installed and loaded. For more info, see " + "the documentation:\nhttps://spacy.io/usage/models") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/glossary.py b/spacy/glossary.py index c4a6a5c45..0dc075ca7 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -58,7 +58,7 @@ GLOSSARY = { "FW": "foreign word", "HYPH": "punctuation mark, hyphen", "IN": "conjunction, subordinating or preposition", - "JJ": "adjective", + "JJ": "adjective (English), other noun-modifier (Chinese)", "JJR": "adjective, comparative", "JJS": "adjective, superlative", "LS": "list item marker", @@ -88,7 +88,7 @@ GLOSSARY = { "WP": "wh-pronoun, personal", "WP$": "wh-pronoun, possessive", "WRB": "wh-adverb", - "SP": "space", + "SP": "space (English), sentence-final particle (Chinese)", "ADD": "email", "NFP": "superfluous punctuation", "GW": "additional word in multi-word expression", @@ -152,6 +152,40 @@ GLOSSARY = { "VVIZU": 'infinitive with "zu", full', "VVPP": "perfect participle, full", "XY": "non-word containing non-letter", + # POS Tags (Chinese) + # OntoNotes / Chinese Penn Treebank + # https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports + "AD": "adverb", + "AS": "aspect marker", + "BA": "把 in ba-construction", + # "CD": "cardinal number", + "CS": "subordinating conjunction", + "DEC": "的 in a relative clause", + "DEG": "associative 的", + "DER": "得 in V-de const. 
and V-de-R", + "DEV": "地 before VP", + "ETC": "for words 等, 等等", + # "FW": "foreign words" + "IJ": "interjection", + # "JJ": "other noun-modifier", + "LB": "被 in long bei-const", + "LC": "localizer", + "M": "measure word", + "MSP": "other particle", + # "NN": "common noun", + "NR": "proper noun", + "NT": "temporal noun", + "OD": "ordinal number", + "ON": "onomatopoeia", + "P": "preposition excluding 把 and 被", + "PN": "pronoun", + "PU": "punctuation", + "SB": "被 in short bei-const", + # "SP": "sentence-final particle", + "VA": "predicative adjective", + "VC": "是 (copula)", + "VE": "有 as the main verb", + "VV": "other verb", # Noun chunks "NP": "noun phrase", "PP": "prepositional phrase", diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 4d02b89d0..d8514b54c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -93,6 +93,15 @@ cdef class KnowledgeBase: self.vocab = vocab self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) + def initialize_entities(self, int64_t nr_entities): + self._entry_index = PreshMap(nr_entities + 1) + self._entries = entry_vec(nr_entities + 1) + self._vectors_table = float_matrix(nr_entities + 1) + + def initialize_aliases(self, int64_t nr_aliases): + self._alias_index = PreshMap(nr_aliases + 1) + self._aliases_table = alias_vec(nr_aliases + 1) + @property def entity_vector_length(self): """RETURNS (uint64): length of the entity vectors""" @@ -144,8 +153,7 @@ cdef class KnowledgeBase: raise ValueError(Errors.E140) nr_entities = len(set(entity_list)) - self._entry_index = PreshMap(nr_entities+1) - self._entries = entry_vec(nr_entities+1) + self.initialize_entities(nr_entities) i = 0 cdef KBEntryC entry @@ -325,6 +333,102 @@ cdef class KnowledgeBase: return 0.0 + def to_bytes(self, **kwargs): + """Serialize the current state to a binary string. + """ + def serialize_header(): + header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length) + return srsly.json_dumps(header) + + def serialize_entries(): + i = 1 + tuples = [] + for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): + entry = self._entries[entry_index] + assert entry.entity_hash == entry_hash + assert entry_index == i + tuples.append((entry.entity_hash, entry.freq, entry.vector_index)) + i = i + 1 + return srsly.json_dumps(tuples) + + def serialize_aliases(): + i = 1 + headers = [] + indices_lists = [] + probs_lists = [] + for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]): + alias = self._aliases_table[alias_index] + assert alias_index == i + candidate_length = len(alias.entry_indices) + headers.append((alias_hash, candidate_length)) + indices_lists.append(alias.entry_indices) + probs_lists.append(alias.probs) + i = i + 1 + headers_dump = srsly.json_dumps(headers) + indices_dump = srsly.json_dumps(indices_lists) + probs_dump = srsly.json_dumps(probs_lists) + return srsly.json_dumps((headers_dump, indices_dump, probs_dump)) + + serializers = { + "header": serialize_header, + "entity_vectors": lambda: srsly.json_dumps(self._vectors_table), + "entries": serialize_entries, + "aliases": serialize_aliases, + } + return util.to_bytes(serializers, []) + + def from_bytes(self, bytes_data, *, exclude=tuple()): + """Load state from a binary string. 
+ """ + def deserialize_header(b): + header = srsly.json_loads(b) + nr_entities = header[0] + nr_aliases = header[1] + entity_vector_length = header[2] + self.initialize_entities(nr_entities) + self.initialize_aliases(nr_aliases) + self.entity_vector_length = entity_vector_length + + def deserialize_vectors(b): + self._vectors_table = srsly.json_loads(b) + + def deserialize_entries(b): + cdef KBEntryC entry + tuples = srsly.json_loads(b) + i = 1 + for (entity_hash, freq, vector_index) in tuples: + entry.entity_hash = entity_hash + entry.freq = freq + entry.vector_index = vector_index + entry.feats_row = -1 # Features table currently not implemented + self._entries[i] = entry + self._entry_index[entity_hash] = i + i += 1 + + def deserialize_aliases(b): + cdef AliasC alias + i = 1 + all_data = srsly.json_loads(b) + headers = srsly.json_loads(all_data[0]) + indices = srsly.json_loads(all_data[1]) + probs = srsly.json_loads(all_data[2]) + for header, indices, probs in zip(headers, indices, probs): + alias_hash, candidate_length = header + alias.entry_indices = indices + alias.probs = probs + self._aliases_table[i] = alias + self._alias_index[alias_hash] = i + i += 1 + + setters = { + "header": deserialize_header, + "entity_vectors": deserialize_vectors, + "entries": deserialize_entries, + "aliases": deserialize_aliases, + } + util.from_bytes(bytes_data, setters, exclude) + return self + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): path = ensure_path(path) if not path.exists(): @@ -404,10 +508,8 @@ cdef class KnowledgeBase: cdef int64_t entity_vector_length reader.read_header(&nr_entities, &entity_vector_length) + self.initialize_entities(nr_entities) self.entity_vector_length = entity_vector_length - self._entry_index = PreshMap(nr_entities+1) - self._entries = entry_vec(nr_entities+1) - self._vectors_table = float_matrix(nr_entities+1) # STEP 1: load entity vectors cdef int i = 0 @@ -445,8 +547,7 @@ cdef class KnowledgeBase: # STEP 3: load aliases cdef int64_t nr_aliases reader.read_alias_length(&nr_aliases) - self._alias_index = PreshMap(nr_aliases+1) - self._aliases_table = alias_vec(nr_aliases+1) + self.initialize_aliases(nr_aliases) cdef int64_t nr_candidates cdef vector[int64_t] entry_indices diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py new file mode 100644 index 000000000..2937e2ecf --- /dev/null +++ b/spacy/lang/az/__init__.py @@ -0,0 +1,16 @@ +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from ...language import Language + + +class AzerbaijaniDefaults(Language.Defaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + +class Azerbaijani(Language): + lang = "az" + Defaults = AzerbaijaniDefaults + + +__all__ = ["Azerbaijani"] diff --git a/spacy/lang/az/examples.py b/spacy/lang/az/examples.py new file mode 100644 index 000000000..f3331a8cb --- /dev/null +++ b/spacy/lang/az/examples.py @@ -0,0 +1,18 @@ +""" +Example sentences to test spaCy and its language models. 
+>>> from spacy.lang.az.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Bu bir cümlədir.", + "Necəsən?", + "Qarabağ ordeni vətən müharibəsində qələbə münasibəti ilə təsis edilmişdir.", + "Məktəbimizə Bakıdan bir tarix müəllimi gəlmişdi.", + "Atılan növbəti mərmilər lap yaxınlıqda partladı.", + "Sinqapur koronavirus baxımından ən təhlükəsiz ölkələr sırasındadır.", + "Marsda ilk sınaq uçuşu həyata keçirilib.", + "SSRİ dağılandan bəri 5 sahil dövləti Xəzərin statusunu müəyyən edə bilməyiblər.", + "Videoda beyninə xüsusi çip yerləşdirilmiş meymun əks olunub.", +] diff --git a/spacy/lang/az/lex_attrs.py b/spacy/lang/az/lex_attrs.py new file mode 100644 index 000000000..73a5e2762 --- /dev/null +++ b/spacy/lang/az/lex_attrs.py @@ -0,0 +1,89 @@ +from ...attrs import LIKE_NUM + + +# Eleven, twelve etc. are written separate: on bir, on iki + +_num_words = [ + "bir", + "iki", + "üç", + "dörd", + "beş", + "altı", + "yeddi", + "səkkiz", + "doqquz", + "on", + "iyirmi", + "otuz", + "qırx", + "əlli", + "altmış", + "yetmiş", + "səksən", + "doxsan", + "yüz", + "min", + "milyon", + "milyard", + "trilyon", + "kvadrilyon", + "kentilyon", +] + + +_ordinal_words = [ + "birinci", + "ikinci", + "üçüncü", + "dördüncü", + "beşinci", + "altıncı", + "yedinci", + "səkkizinci", + "doqquzuncu", + "onuncu", + "iyirminci", + "otuzuncu", + "qırxıncı", + "əllinci", + "altmışıncı", + "yetmişinci", + "səksəninci", + "doxsanıncı", + "yüzüncü", + "mininci", + "milyonuncu", + "milyardıncı", + "trilyonuncu", + "kvadrilyonuncu", + "kentilyonuncu", +] + +_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü") + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = text.lower() + # Check cardinal number + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: + return True + if text_lower.endswith(_ordinal_endings): + if text_lower[:-3].isdigit() or text_lower[:-4].isdigit(): + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/az/stop_words.py b/spacy/lang/az/stop_words.py new file mode 100644 index 000000000..2114939ba --- /dev/null +++ b/spacy/lang/az/stop_words.py @@ -0,0 +1,145 @@ +# Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py +STOP_WORDS = set( + """ +amma +arasında +artıq +ay +az +bax +belə +beş +bilər +bir +biraz +biri +birşey +biz +bizim +bizlər +bu +buna +bundan +bunların +bunu +bunun +buradan +bütün +bəli +bəlkə +bəy +bəzi +bəzən +daha +dedi +deyil +dir +düz +də +dək +dən +dəqiqə +edir +edən +elə +et +etdi +etmə +etmək +faiz +gilə +görə +ha +haqqında +harada +heç +hə +həm +həmin +həmişə +hər +idi +il +ildə +ilk +ilə +in +indi +istifadə +isə +ki +kim +kimi +kimə +lakin +lap +mirşey +məhz +mən +mənə +niyə +nə +nəhayət +o +obirisi +of +olan +olar +olaraq +oldu +olduğu +olmadı +olmaz +olmuşdur +olsun +olur +on +ona +ondan +onlar +onlardan +onların +onsuzda +onu +onun +oradan +qarşı +qədər +saat +sadəcə +saniyə +siz +sizin +sizlər +sonra +səhv +sən +sənin +sənə +təəssüf +var +və +xan +xanım +xeyr +ya +yalnız +yaxşı +yeddi +yenə +yox +yoxdur +yoxsa +yəni +zaman +çox +çünki +öz +özü +üçün +əgər +əlbəttə +ən +əslində +""".split() +) diff --git a/spacy/lang/bg/lex_attrs.py b/spacy/lang/bg/lex_attrs.py index 
62b69d6cc..bba3c74cd 100644 --- a/spacy/lang/bg/lex_attrs.py +++ b/spacy/lang/bg/lex_attrs.py @@ -22,13 +22,13 @@ _num_words = [ "тринадесет", "тринайсет", "четиринадесет", - "четиринайсет" + "четиринайсет", "петнадесет", - "петнайсет" + "петнайсет", "шестнадесет", "шестнайсет", "седемнадесет", - "седемнайсет" + "седемнайсет", "осемнадесет", "осемнайсет", "деветнадесет", @@ -36,7 +36,7 @@ _num_words = [ "двадесет", "двайсет", "тридесет", - "трийсет" + "трийсет", "четиридесет", "четиресет", "петдесет", diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py index defa00ef7..0b7487c64 100644 --- a/spacy/lang/bg/tokenizer_exceptions.py +++ b/spacy/lang/bg/tokenizer_exceptions.py @@ -58,7 +58,6 @@ _abbr_dot_exc = [ {ORTH: "стр.", NORM: "страница"}, {ORTH: "ул.", NORM: "улица"}, {ORTH: "чл.", NORM: "член"}, - ] for abbr in _abbr_dot_exc: diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 6fbc45817..9e5441a4f 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -260,7 +260,10 @@ _units = ( "кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб" "كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب" ) -_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴" +_currency = ( + r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ " + r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿" +) # These expressions contain various unicode variations, including characters # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py index a049601dc..631848af4 100644 --- a/spacy/lang/el/lemmatizer.py +++ b/spacy/lang/el/lemmatizer.py @@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer): forms.extend(oov_forms) if not forms: forms.append(string) - forms = list(set(forms)) + forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/en/lex_attrs.py b/spacy/lang/en/lex_attrs.py index fcc7c6bf2..b630a317d 100644 --- a/spacy/lang/en/lex_attrs.py +++ b/spacy/lang/en/lex_attrs.py @@ -35,7 +35,7 @@ def like_num(text: str) -> bool: # Check ordinal number if text_lower in _ordinal_words: return True - if text_lower.endswith("th"): + if text_lower.endswith(("st", "nd", "rd", "th")): if text_lower[:-2].isdigit(): return True return False diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index 22d710cb0..465333b0a 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -1,5 +1,5 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...symbols import ORTH +from ...symbols import ORTH, NORM from ...util import update_exc @@ -79,5 +79,34 @@ for exc_data in [ ]: _exc[exc_data[ORTH]] = [exc_data] +# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141 +conj_contraction_bases = [ + ("ett", "että"), + ("jott", "jotta"), + ("kosk", "koska"), + ("mutt", "mutta"), + ("vaikk", "vaikka"), + ("ehk", "ehkä"), + ("miks", "miksi"), + ("siks", "siksi"), + ("joll", "jos"), + ("ell", "jos"), +] +conj_contraction_negations = [ + ("en", "en"), + ("et", "et"), + ("ei", "ei"), + ("emme", "emme"), + ("ette", "ette"), + ("eivat", "eivät"), + ("eivät", "eivät"), +] +for (base_lower, base_norm) in conj_contraction_bases: + for base in [base_lower, base_lower.title()]: + for (suffix, suffix_norm) in conj_contraction_negations: + _exc[base + suffix] = [ + {ORTH: base, NORM: base_norm}, + {ORTH: suffix, NORM: suffix_norm}, + ] 
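The loop above adds one tokenizer exception per fused conjunction+negation form (ettei, mutten, ellemme, ...), so each contraction is split into two tokens whose NORM values recover the full conjunction and the negation verb. A minimal usage sketch, under the assumption that a blank Finnish pipeline picks up these exceptions (illustrative only, not part of the diff):

import spacy

nlp = spacy.blank("fi")
doc = nlp("Sanoin, ettei se haittaa.")
# "ettei" is split into "ett" + "ei"; token.norm_ restores "että" and "ei"
print([(t.text, t.norm_) for t in doc])
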
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index ab1f2f4a7..b32ee3d71 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,30 +1,31 @@ STOP_WORDS = set( """ a à â abord afin ah ai aie ainsi ait allaient allons -alors anterieur anterieure anterieures apres après as assez attendu au -aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront +alors anterieur anterieure anterieures antérieur antérieure antérieures +apres après as assez attendu au +aupres auquel aura auraient aurait auront aussi autre autrement autres autrui aux auxquelles auxquels avaient avais avait avant avec avoir avons ayant bas basee bat -c' c’ ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui -celui-ci celui-là cent cependant certain certaine certaines certains certes ces +c' c’ ça car ce ceci cela celle celle-ci celle-la celle-là celles celles-ci celles-la celles-là +celui celui-ci celui-la celui-là cent cependant certain certaine certaines certains certes ces cet cette ceux ceux-ci ceux-là chacun chacune chaque chez ci cinq cinquantaine cinquante cinquantième cinquième combien comme comment compris concernant -d' d’ da dans de debout dedans dehors deja delà depuis derriere +d' d’ da dans de debout dedans dehors deja dejà delà depuis derriere derrière des desormais desquelles desquels dessous dessus deux deuxième -deuxièmement devant devers devra different differentes differents différent +deuxièmement devant devers devra different differente differentes differents différent différente différentes différents dire directe directement dit dite dits divers diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont -douze douzième du duquel durant dès désormais +douze douzième du duquel durant dès déja déjà désormais -effet egale egalement egales eh elle elle-même elles elles-mêmes en encore +effet egalement eh elle elle-meme elle-même elles elles-memes elles-mêmes en encore enfin entre envers environ es ès est et etaient étaient etais étais etait était -etant étant etc été etre être eu eux eux-mêmes exactement excepté +etant étant etc etre être eu eux eux-mêmes exactement excepté également -fais faisaient faisant fait façon feront font +fais faisaient faisant fait facon façon feront font gens @@ -36,45 +37,48 @@ j' j’ je jusqu jusque juste l' l’ la laisser laquelle le lequel les lesquelles lesquels leur leurs longtemps lors lorsque lui lui-meme lui-même là lès -m' m’ ma maint maintenant mais malgre me meme memes merci mes mien +m' m’ ma maint maintenant mais malgre malgré me meme memes merci mes mien mienne miennes miens mille moi moi-meme moi-même moindres moins mon même mêmes n' n’ na ne neanmoins neuvième ni nombreuses nombreux nos notamment -notre nous nous-mêmes nouvea nul néanmoins nôtre nôtres +notre nous nous-mêmes nouveau nul néanmoins nôtre nôtres -o ô on ont onze onzième ore ou ouias oust outre +o ô on ont onze onzième or ou ouias ouste outre ouvert ouverte ouverts où -par parce parfois parle parlent parler parmi parseme partant +par parce parfois parle parlent parler parmi partant pas pendant pense permet personne peu peut peuvent peux plus -plusieurs plutôt possible possibles pour pourquoi -pourrais pourrait pouvait prealable precisement premier première premièrement -pres procedant proche près pu puis puisque +plusieurs plutot plutôt possible possibles pour pourquoi +pourrais pourrait pouvait prealable precisement +premier première 
premièrement +pres procedant proche près préalable précisement pu puis puisque -qu' qu’ quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt +qu' qu’ quand quant quant-à-soi quarante quatorze quatre quatre-vingt quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque quelques quels qui quiconque quinze quoi quoique relative relativement rend rendre restant reste -restent retour revoici revoilà +restent retour revoici revoila revoilà s' s’ sa sait sans sauf se seize selon semblable semblaient semble semblent sent sept septième sera seraient serait seront ses seul seule -seulement si sien sienne siennes siens sinon six sixième soi soi-même soit -soixante son sont sous souvent specifique specifiques stop +seulement seuls seules si sien sienne siennes siens sinon six sixième soi soi-meme soi-même soit +soixante son sont sous souvent specifique specifiques spécifique spécifiques stop suffisant suffisante suffit suis suit suivant suivante suivantes suivants suivre sur surtout t' t’ ta tant te tel telle tellement telles tels tenant tend tenir tente -tes tien tienne tiennes tiens toi toi-même ton touchant toujours tous -tout toute toutes treize trente tres trois troisième troisièmement +tes tien tienne tiennes tiens toi toi-meme toi-même ton touchant toujours tous +tout toute toutes treize trente tres trois troisième troisièmement très tu té un une unes uns -va vais vas vers via vingt voici voilà vont vos -votre vous vous-mêmes vu vé vôtre vôtres +va vais vas vers via vingt voici voila voilà vont vos +votre votres vous vous-mêmes vu vé vôtre vôtres + +y """.split() ) diff --git a/spacy/lang/grc/__init__.py b/spacy/lang/grc/__init__.py new file mode 100644 index 000000000..e29252da9 --- /dev/null +++ b/spacy/lang/grc/__init__.py @@ -0,0 +1,18 @@ +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from ...language import Language + + +class AncientGreekDefaults(Language.Defaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + +class AncientGreek(Language): + lang = "grc" + Defaults = AncientGreekDefaults + + +__all__ = ["AncientGreek"] diff --git a/spacy/lang/grc/examples.py b/spacy/lang/grc/examples.py new file mode 100644 index 000000000..9c0bcb265 --- /dev/null +++ b/spacy/lang/grc/examples.py @@ -0,0 +1,17 @@ +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.grc.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·", + "εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.", + "ὃ μὲν δὴ ἀπόστολος ἐς τὴν Μίλητον ἦν.", + "Θρασύβουλος δὲ σαφέως προπεπυσμένος πάντα λόγον καὶ εἰδὼς τὰ Ἀλυάττης μέλλοι ποιήσειν μηχανᾶται τοιάδε.", + "φιλόπαις δ' ἦν ἐκμανῶς καὶ Ἀλέξανδρος ὁ βασιλεύς.", + "Ἀντίγονος ὁ βασιλεὺς ἐπεκώμαζε τῷ Ζήνωνι", + "αὐτὰρ ὃ δεύτατος ἦλθεν ἄναξ ἀνδρῶν Ἀγαμέμνων ἕλκος ἔχων", +] diff --git a/spacy/lang/grc/lex_attrs.py b/spacy/lang/grc/lex_attrs.py new file mode 100644 index 000000000..0ab15e6fd --- /dev/null +++ b/spacy/lang/grc/lex_attrs.py @@ -0,0 +1,314 @@ +from ...attrs import LIKE_NUM + + +_num_words = [ + # CARDINALS + "εἷς", + "ἑνός", + "ἑνί", + "ἕνα", + "μία", + "μιᾶς", + "μιᾷ", + "μίαν", + "ἕν", + "δύο", + "δυοῖν", + "τρεῖς", + "τριῶν", + "τρισί", + "τρία", + "τέτταρες", + "τεττάρων", + "τέτταρσι", + "τέτταρα", + "τέτταρας", + "πέντε", + "ἕξ", + "ἑπτά", + "ὀκτώ", + "ἐννέα", + "δέκα", + "ἕνδεκα", + "δώδεκα", + "πεντεκαίδεκα", + "ἑκκαίδεκα", + "ἑπτακαίδεκα", + "ὀκτωκαίδεκα", + "ἐννεακαίδεκα", + "εἴκοσι", + "τριάκοντα", + "τετταράκοντα", + "πεντήκοντα", + "ἑξήκοντα", + "ἑβδομήκοντα", + "ὀγδοήκοντα", + "ἐνενήκοντα", + "ἑκατόν", + "διακόσιοι", + "διακοσίων", + "διακοσιᾶν", + "διακοσίους", + "διακοσίοις", + "διακόσια", + "διακόσιαι", + "διακοσίαις", + "διακοσίαισι", + "διηκόσιοι", + "διηκοσίων", + "διηκοσιέων", + "διακοσίας", + "διηκόσια", + "διηκόσιαι", + "διηκοσίας", + "τριακόσιοι", + "τριακοσίων", + "τριακοσιᾶν", + "τριακοσίους", + "τριακοσίοις", + "τριακόσια", + "τριακόσιαι", + "τριακοσίαις", + "τριακοσίαισι", + "τριακοσιέων", + "τριακοσίας", + "τριηκόσια", + "τριηκοσίας", + "τριηκόσιοι", + "τριηκοσίοισιν", + "τριηκοσίους", + "τριηκοσίων", + "τετρακόσιοι", + "τετρακοσίων", + "τετρακοσιᾶν", + "τετρακοσίους", + "τετρακοσίοις", + "τετρακόσια", + "τετρακόσιαι", + "τετρακοσίαις", + "τετρακοσίαισι", + "τετρακοσιέων", + "τετρακοσίας", + "πεντακόσιοι", + "πεντακοσίων", + "πεντακοσιᾶν", + "πεντακοσίους", + "πεντακοσίοις", + "πεντακόσια", + "πεντακόσιαι", + "πεντακοσίαις", + "πεντακοσίαισι", + "πεντακοσιέων", + "πεντακοσίας", + "ἑξακόσιοι", + "ἑξακοσίων", + "ἑξακοσιᾶν", + "ἑξακοσίους", + "ἑξακοσίοις", + "ἑξακόσια", + "ἑξακόσιαι", + "ἑξακοσίαις", + "ἑξακοσίαισι", + "ἑξακοσιέων", + "ἑξακοσίας", + "ἑπτακόσιοι", + "ἑπτακοσίων", + "ἑπτακοσιᾶν", + "ἑπτακοσίους", + "ἑπτακοσίοις", + "ἑπτακόσια", + "ἑπτακόσιαι", + "ἑπτακοσίαις", + "ἑπτακοσίαισι", + "ἑπτακοσιέων", + "ἑπτακοσίας", + "ὀκτακόσιοι", + "ὀκτακοσίων", + "ὀκτακοσιᾶν", + "ὀκτακοσίους", + "ὀκτακοσίοις", + "ὀκτακόσια", + "ὀκτακόσιαι", + "ὀκτακοσίαις", + "ὀκτακοσίαισι", + "ὀκτακοσιέων", + "ὀκτακοσίας", + "ἐνακόσιοι", + "ἐνακοσίων", + "ἐνακοσιᾶν", + "ἐνακοσίους", + "ἐνακοσίοις", + "ἐνακόσια", + "ἐνακόσιαι", + "ἐνακοσίαις", + "ἐνακοσίαισι", + "ἐνακοσιέων", + "ἐνακοσίας", + "χίλιοι", + "χιλίων", + "χιλιῶν", + "χιλίους", + "χιλίοις", + "χίλιαι", + "χιλίας", + "χιλίαις", + "χίλια", + "χίλι", + "δισχίλιοι", + "δισχιλίων", + "δισχιλιῶν", + "δισχιλίους", + "δισχιλίοις", + "δισχίλιαι", + "δισχιλίας", + "δισχιλίαις", + "δισχίλια", + "δισχίλι", + "τρισχίλιοι", + "τρισχιλίων", + "τρισχιλιῶν", + "τρισχιλίους", + "τρισχιλίοις", + "τρισχίλιαι", + "τρισχιλίας", + "τρισχιλίαις", + "τρισχίλια", + "τρισχίλι", + "μύριοι", + "μύριοί", + "μυρίων", + "μυρίοις", + "μυρίους", + "μύριαι", + "μυρίαις", + "μυρίας", + "μύρια", + "δισμύριοι", + "δισμύριοί", + "δισμυρίων", + 
"δισμυρίοις", + "δισμυρίους", + "δισμύριαι", + "δισμυρίαις", + "δισμυρίας", + "δισμύρια", + "δεκακισμύριοι", + "δεκακισμύριοί", + "δεκακισμυρίων", + "δεκακισμυρίοις", + "δεκακισμυρίους", + "δεκακισμύριαι", + "δεκακισμυρίαις", + "δεκακισμυρίας", + "δεκακισμύρια", + # ANCIENT GREEK NUMBERS (1-100) + "α", + "β", + "γ", + "δ", + "ε", + "ϛ", + "ζ", + "η", + "θ", + "ι", + "ια", + "ιβ", + "ιγ", + "ιδ", + "ιε", + "ιϛ", + "ιζ", + "ιη", + "ιθ", + "κ", + "κα", + "κβ", + "κγ", + "κδ", + "κε", + "κϛ", + "κζ", + "κη", + "κθ", + "λ", + "λα", + "λβ", + "λγ", + "λδ", + "λε", + "λϛ", + "λζ", + "λη", + "λθ", + "μ", + "μα", + "μβ", + "μγ", + "μδ", + "με", + "μϛ", + "μζ", + "μη", + "μθ", + "ν", + "να", + "νβ", + "νγ", + "νδ", + "νε", + "νϛ", + "νζ", + "νη", + "νθ", + "ξ", + "ξα", + "ξβ", + "ξγ", + "ξδ", + "ξε", + "ξϛ", + "ξζ", + "ξη", + "ξθ", + "ο", + "οα", + "οβ", + "ογ", + "οδ", + "οε", + "οϛ", + "οζ", + "οη", + "οθ", + "π", + "πα", + "πβ", + "πγ", + "πδ", + "πε", + "πϛ", + "πζ", + "πη", + "πθ", + "ϟ", + "ϟα", + "ϟβ", + "ϟγ", + "ϟδ", + "ϟε", + "ϟϛ", + "ϟζ", + "ϟη", + "ϟθ", + "ρ", +] + + +def like_num(text): + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/grc/stop_words.py b/spacy/lang/grc/stop_words.py new file mode 100644 index 000000000..cbb766a8c --- /dev/null +++ b/spacy/lang/grc/stop_words.py @@ -0,0 +1,61 @@ +STOP_WORDS = set( + """ +αὐτῷ αὐτοῦ αὐτῆς αὐτόν αὐτὸν αὐτῶν αὐτὸς αὐτὸ αὐτό αὐτός αὐτὴν αὐτοῖς αὐτοὺς αὔτ' αὐτὰ αὐτῇ αὐτὴ +αὐτὼ αὑταὶ καὐτὸς αὐτά αὑτός αὐτοῖσι αὐτοῖσιν αὑτὸς αὐτήν αὐτοῖσί αὐτοί αὐτοὶ αὐτοῖο αὐτάων αὐτὰς +αὐτέων αὐτώ αὐτάς αὐτούς αὐτή αὐταί αὐταὶ αὐτῇσιν τὠυτῷ τὠυτὸ ταὐτὰ ταύτῃ αὐτῇσι αὐτῇς αὐταῖς αὐτᾶς αὐτὰν ταὐτὸν + +γε γ' γέ γὰρ γάρ δαῖτα δαιτὸς δαιτὶ δαὶ δαιτί δαῖτ' δαΐδας δαΐδων δἰ διὰ διά δὲ δ' δέ δὴ δή εἰ εἴ κεἰ κεἴ αἴ αἲ εἲ αἰ + +ἐστί ἐστιν ὢν ἦν ἐστὶν ὦσιν εἶναι ὄντι εἰσιν ἐστι ὄντα οὖσαν ἦσαν ἔστι ὄντας ἐστὲ εἰσὶ εἶ ὤν ἦ οὖσαι ἔσται ἐσμὲν ἐστ' ἐστίν ἔστ' ὦ ἔσει ἦμεν εἰμι εἰσὶν ἦσθ' +ἐστὶ ᾖ οὖσ' ἔστιν εἰμὶ εἴμ' ἐσθ' ᾖς στί εἴην εἶναί οὖσα κἄστ' εἴη ἦσθα εἰμ' ἔστω ὄντ' ἔσθ' ἔμμεναι ἔω ἐὼν ἐσσι ἔσσεται ἐστὸν ἔσαν ἔστων ἐόντα ἦεν ἐοῦσαν ἔην +ἔσσομαι εἰσί ἐστόν ἔσκεν ἐόντ' ἐών ἔσσεσθ' εἰσ' ἐόντες ἐόντε ἐσσεῖται εἰμεν ἔασιν ἔσκε ἔμεναι ἔσεσθαι ἔῃ εἰμὲν εἰσι ἐόντας ἔστε εἰς ἦτε εἰμί ἔσσεαι ἔμμεν +ἐοῦσα ἔμεν ᾖσιν ἐστε ἐόντι εἶεν ἔσσονται ἔησθα ἔσεσθε ἐσσί ἐοῦσ' ἔασι ἔα ἦα ἐόν ἔσσεσθαι ἔσομαι ἔσκον εἴης ἔωσιν εἴησαν ἐὸν ἐουσέων ἔσσῃ ἐούσης ἔσονται +ἐούσας ἐόντων ἐόντος ἐσομένην ἔστωσαν ἔωσι ἔας ἐοῦσαι ἣν εἰσίν ἤστην ὄντες ὄντων οὔσας οὔσαις ὄντος οὖσι οὔσης ἔσῃ ὂν ἐσμεν ἐσμέν οὖσιν ἐσομένους ἐσσόμεσθα + +ἒς ἐς ἔς ἐν κεἰς εἲς κἀν ἔν κατὰ κατ' καθ' κατά κάτα κὰπ κὰκ κὰδ κὰρ κάρ κὰγ κὰμ καὶ καί μετὰ μεθ' μετ' μέτα μετά μέθ' μέτ' μὲν μέν μὴ + +μή μη οὐκ οὒ οὐ οὐχ οὐχὶ κοὐ κοὐχ οὔ κοὐκ οὐχί οὐκὶ οὐδὲν οὐδεὶς οὐδέν κοὐδεὶς κοὐδὲν οὐδένα οὐδενὸς οὐδέν' οὐδενός οὐδενὶ +οὐδεμία οὐδείς οὐδεμίαν οὐδὲ οὐδ' κοὐδ' οὐδέ οὔτε οὔθ' οὔτέ τε οὔτ' οὕτως οὕτω οὕτῶ χοὔτως οὖν ὦν ὧν τοῦτο τοῦθ' τοῦτον τούτῳ +τούτοις ταύτας αὕτη ταῦτα οὗτος ταύτης ταύτην τούτων ταῦτ' τοῦτ' τούτου αὗται τούτους τοῦτό ταῦτά τούτοισι χαὔτη ταῦθ' χοὖτοι +τούτοισιν οὗτός οὗτοι τούτω τουτέων τοῦτὸν οὗτοί τοῦτου οὗτοὶ ταύτῃσι ταύταις ταυτὶ παρὰ παρ' πάρα παρά πὰρ παραὶ πάρ' περὶ +πέρι περί πρὸς πρός ποτ' ποτὶ προτὶ προτί πότι + +σὸς σήν σὴν σὸν σόν σὰ σῶν σοῖσιν σός σῆς σῷ σαῖς σῇ σοῖς σοῦ σ' σὰν σά σὴ σὰς +σᾷ σοὺς σούς σοῖσι σῇς σῇσι σή σῇσιν σοὶ σου ὑμεῖς σὲ σύ σοι ὑμᾶς ὑμῶν ὑμῖν σε +σέ σὺ σέθεν σοί ὑμὶν σφῷν ὑμίν τοι τοὶ σφὼ ὔμμ' σφῶϊ σεῖο τ' σφῶϊν 
ὔμμιν σέο σευ σεῦ +ὔμμι ὑμέων τύνη ὑμείων τοί ὔμμες σεο τέ τεοῖο ὑμέας σὺν ξὺν σύν + +θ' τί τι τις τινες τινα τινος τινὸς τινὶ τινῶν τίς τίνες τινὰς τιν' τῳ του τίνα τοῦ τῷ τινί τινά τίνος τινι τινας τινὰ τινων +τίν' τευ τέο τινές τεο τινὲς τεῷ τέῳ τινός τεῳ τισὶ + +τοιαῦτα τοιοῦτον τοιοῦθ' τοιοῦτος τοιαύτην τοιαῦτ' τοιούτου τοιαῦθ' τοιαύτῃ τοιούτοις τοιαῦται τοιαῦτά τοιαύτη τοιοῦτοι τοιούτων τοιούτοισι +τοιοῦτο τοιούτους τοιούτῳ τοιαύτης τοιαύταις τοιαύτας τοιοῦτός τίνι τοῖσι τίνων τέων τέοισί τὰ τῇ τώ τὼ + +ἀλλὰ ἀλλ' ἀλλά ἀπ' ἀπὸ κἀπ' ἀφ' τἀπὸ κἀφ' ἄπο ἀπό τὠπὸ τἀπ' ἄλλων ἄλλῳ ἄλλη ἄλλης ἄλλους ἄλλοις ἄλλον ἄλλο ἄλλου τἄλλα ἄλλα +ἄλλᾳ ἄλλοισιν τἄλλ' ἄλλ' ἄλλος ἄλλοισι κἄλλ' ἄλλοι ἄλλῃσι ἄλλόν ἄλλην ἄλλά ἄλλαι ἄλλοισίν ὧλλοι ἄλλῃ ἄλλας ἀλλέων τἆλλα ἄλλως +ἀλλάων ἄλλαις τἆλλ' + +ἂν ἄν κἂν τἂν ἃν κεν κ' κέν κέ κε χ' ἄρα τἄρα ἄρ' τἄρ' ἄρ ῥα ῥά ῥ τὰρ ἄρά ἂρ + +ἡμᾶς με ἐγὼ ἐμὲ μοι κἀγὼ ἡμῶν ἡμεῖς ἐμοὶ ἔγωγ' ἁμοὶ ἡμῖν μ' ἔγωγέ ἐγώ ἐμοί ἐμοῦ κἀμοῦ ἔμ' κἀμὲ ἡμὶν μου ἐμέ ἔγωγε νῷν νὼ χἠμεῖς ἁμὲ κἀγώ κἀμοὶ χἠμᾶς +ἁγὼ ἡμίν κἄμ' ἔμοιγ' μοί τοὐμὲ ἄμμε ἐγὼν ἐμεῦ ἐμεῖο μευ ἔμοιγε ἄμμι μέ ἡμέας νῶϊ ἄμμιν ἧμιν ἐγών νῶΐ ἐμέθεν ἥμιν ἄμμες νῶι ἡμείων ἄμμ' ἡμέων ἐμέο +ἐκ ἔκ ἐξ κἀκ κ ἃκ κἀξ ἔξ εξ Ἐκ τἀμὰ ἐμοῖς τοὐμόν ἐμᾶς τοὐμὸν ἐμῶν ἐμὸς ἐμῆς ἐμῷ τὠμῷ ἐμὸν τἄμ' ἐμὴ ἐμὰς ἐμαῖς ἐμὴν ἐμόν ἐμὰ ἐμός ἐμοὺς ἐμῇ ἐμᾷ +οὑμὸς ἐμοῖν οὑμός κἀμὸν ἐμαὶ ἐμή ἐμάς ἐμοῖσι ἐμοῖσιν ἐμῇσιν ἐμῇσι ἐμῇς ἐμήν + +ἔνι ἐνὶ εἰνὶ εἰν ἐμ ἐπὶ ἐπ' ἔπι ἐφ' κἀπὶ τἀπὶ ἐπί ἔφ' ἔπ' ἐὰν ἢν ἐάν ἤν ἄνπερ + +αὑτοῖς αὑτὸν αὑτῷ ἑαυτοῦ αὑτόν αὑτῆς αὑτῶν αὑτοῦ αὑτὴν αὑτοῖν χαὐτοῦ αὑταῖς ἑωυτοῦ ἑωυτῇ ἑωυτὸν ἐωυτῷ ἑωυτῆς ἑωυτόν ἑωυτῷ +ἑωυτάς ἑωυτῶν ἑωυτοὺς ἑωυτοῖσι ἑαυτῇ ἑαυτούς αὑτοὺς ἑαυτῶν ἑαυτοὺς ἑαυτὸν ἑαυτῷ ἑαυτοῖς ἑαυτὴν ἑαυτῆς + +ἔτι ἔτ' ἔθ' κἄτι ἢ ἤ ἠέ ἠὲ ἦε ἦέ ἡ τοὺς τὴν τὸ τῶν τὸν ὁ ἁ οἱ τοῖς ταῖς τῆς τὰς αἱ τό τὰν τᾶς τοῖσιν αἳ χὠ τήν τά τοῖν τάς ὅ +χοἰ ἣ ἥ χἠ τάν τᾶν ὃ οἳ οἵ τοῖο τόν τοῖιν τούς τάων ταὶ τῇς τῇσι τῇσιν αἵ τοῖό τοῖσίν ὅττί ταί Τὴν τῆ τῶ τάδε ὅδε τοῦδε τόδε τόνδ' +τάδ' τῆσδε τῷδε ὅδ' τῶνδ' τῇδ' τοῦδέ τῶνδε τόνδε τόδ' τοῦδ' τάσδε τήνδε τάσδ' τήνδ' ταῖσδέ τῇδε τῆσδ' τάνδ' τῷδ' τάνδε ἅδε τοῖσδ' ἥδ' +τᾷδέ τοῖσδε τούσδ' ἥδε τούσδε τώδ' ἅδ' οἵδ' τῶνδέ οἵδε τᾷδε τοῖσδεσσι τώδε τῇδέ τοῖσιδε αἵδε τοῦδὲ τῆδ' αἵδ' τοῖσδεσι ὃν ἃ ὃς ᾧ οὗ ἅπερ +οὓς ἧς οἷς ἅσπερ ᾗ ἅ χὦνπερ ὣ αἷς ᾇ ὅς ἥπερ ἃς ὅσπερ ὅνπερ ὧνπερ ᾧπερ ὅν αἷν οἷσι ἇς ἅς ὥ οὕς ἥν οἷσιν ἕης ὅου ᾗς οἷσί οἷσίν τοῖσί ᾗσιν οἵπερ αἷσπερ +ὅστις ἥτις ὅτου ὅτοισι ἥντιν' ὅτῳ ὅντιν' ὅττι ἅσσά ὅτεῳ ὅτις ὅτιν' ὅτευ ἥντινα αἵτινές ὅντινα ἅσσα ᾧτινι οἵτινες ὅτι ἅτις ὅτ' ὑμὴ +ὑμήν ὑμὸν ὑπὲρ ὕπερ ὑπέρτερον ὑπεὶρ ὑπέρτατος ὑπὸ ὑπ' ὑφ' ὕπο ὑπαὶ ὑπό ὕπ' ὕφ' + + ὣς ὡς ὥς ὧς ὥστ' ὥστε ὥσθ' ὤ ὢ + + """.split() +) diff --git a/spacy/lang/grc/tokenizer_exceptions.py b/spacy/lang/grc/tokenizer_exceptions.py new file mode 100644 index 000000000..230a58fd2 --- /dev/null +++ b/spacy/lang/grc/tokenizer_exceptions.py @@ -0,0 +1,115 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...symbols import ORTH, NORM +from ...util import update_exc + +_exc = {} + +for token in ["᾽Απ'", "᾽ΑΠ'", "ἀφ'", "᾽Αφ", "ἀπὸ"]: + _exc[token] = [{ORTH: token, NORM: "από"}] + +for token in ["᾽Αλλ'", "ἀλλ'", "ἀλλὰ"]: + _exc[token] = [{ORTH: token, NORM: "ἀλλά"}] + +for token in ["παρ'", "Παρ'", "παρὰ", "παρ"]: + _exc[token] = [{ORTH: token, NORM: "παρά"}] + +for token in ["καθ'", "Καθ'", "κατ'", "Κατ'", "κατὰ"]: + _exc[token] = [{ORTH: token, NORM: "κατά"}] + +for token in ["Ἐπ'", "ἐπ'", "ἐπὶ", "Εφ'", "εφ'"]: + _exc[token] = [{ORTH: token, NORM: "επί"}] + +for token in ["Δι'", "δι'", "διὰ"]: + _exc[token] = [{ORTH: token, NORM: 
"διά"}] + +for token in ["Ὑπ'", "ὑπ'", "ὑφ'"]: + _exc[token] = [{ORTH: token, NORM: "ὑπό"}] + +for token in ["Μετ'", "μετ'", "μεθ'", "μετὰ"]: + _exc[token] = [{ORTH: token, NORM: "μετά"}] + +for token in ["Μ'", "μ'", "μέ", "μὲ"]: + _exc[token] = [{ORTH: token, NORM: "με"}] + +for token in ["Σ'", "σ'", "σέ", "σὲ"]: + _exc[token] = [{ORTH: token, NORM: "σε"}] + +for token in ["Τ'", "τ'", "τέ", "τὲ"]: + _exc[token] = [{ORTH: token, NORM: "τε"}] + +for token in ["Δ'", "δ'", "δὲ"]: + _exc[token] = [{ORTH: token, NORM: "δέ"}] + + +_other_exc = { + "μὲν": [{ORTH: "μὲν", NORM: "μέν"}], + "μὴν": [{ORTH: "μὴν", NORM: "μήν"}], + "τὴν": [{ORTH: "τὴν", NORM: "τήν"}], + "τὸν": [{ORTH: "τὸν", NORM: "τόν"}], + "καὶ": [{ORTH: "καὶ", NORM: "καί"}], + "καὐτός": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτός"}], + "καὐτὸς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "αὐτὸς", NORM: "αὐτός"}], + "κοὐ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "οὐ"}], + "χἡ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἡ"}], + "χοἱ": [{ORTH: "χ", NORM: "καί"}, {ORTH: "οἱ"}], + "χἱκετεύετε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ἱκετεύετε"}], + "κἀν": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀν", NORM: "ἐν"}], + "κἀγὼ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γὼ", NORM: "ἐγώ"}], + "κἀγώ": [{ORTH: "κἀ", NORM: "καί"}, {ORTH: "γώ", NORM: "ἐγώ"}], + "ἁγώ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γώ", NORM: "ἐγώ"}], + "ἁγὼ": [{ORTH: "ἁ", NORM: "ἃ"}, {ORTH: "γὼ", NORM: "ἐγώ"}], + "ἐγᾦδα": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦδα", NORM: "οἶδα"}], + "ἐγᾦμαι": [{ORTH: "ἐγ", NORM: "ἐγώ"}, {ORTH: "ᾦμαι", NORM: "οἶμαι"}], + "κἀς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ἀς", NORM: "ἐς"}], + "κᾆτα": [{ORTH: "κ", NORM: "καί"}, {ORTH: "ᾆτα", NORM: "εἶτα"}], + "κεἰ": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰ"}], + "κεἰς": [{ORTH: "κ", NORM: "καί"}, {ORTH: "εἰς"}], + "χὤτε": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτε", NORM: "ὅτε"}], + "χὤπως": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤπως", NORM: "ὅπως"}], + "χὤτι": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤτι", NORM: "ὅτι"}], + "χὤταν": [{ORTH: "χ", NORM: "καί"}, {ORTH: "ὤταν", NORM: "ὅταν"}], + "οὑμός": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μός", NORM: "ἐμός"}], + "οὑμὸς": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "μὸς", NORM: "ἐμός"}], + "οὑμοί": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοί", NORM: "ἐμoί"}], + "οὑμοὶ": [{ORTH: "οὑ", NORM: "οἱ"}, {ORTH: "μοὶ", NORM: "ἐμoί"}], + "σοὔστι": [{ORTH: "σοὔ", NORM: "σοί"}, {ORTH: "στι", NORM: "ἐστι"}], + "σοὐστί": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στί", NORM: "ἐστί"}], + "σοὐστὶ": [{ORTH: "σοὐ", NORM: "σοί"}, {ORTH: "στὶ", NORM: "ἐστί"}], + "μοὖστι": [{ORTH: "μοὖ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}], + "μοὔστι": [{ORTH: "μοὔ", NORM: "μοί"}, {ORTH: "στι", NORM: "ἐστι"}], + "τοὔνομα": [{ORTH: "τοὔ", NORM: "τό"}, {ORTH: "νομα", NORM: "ὄνομα"}], + "οὑν": [{ORTH: "οὑ", NORM: "ὁ"}, {ORTH: "ν", NORM: "ἐν"}], + "ὦνερ": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νερ", NORM: "ἄνερ"}], + "ὦνδρες": [{ORTH: "ὦ", NORM: "ὦ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}], + "προὔχων": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χων", NORM: "ἔχων"}], + "προὔχοντα": [{ORTH: "προὔ", NORM: "πρό"}, {ORTH: "χοντα", NORM: "ἔχοντα"}], + "ὥνεκα": [{ORTH: "ὥ", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}], + "θοἰμάτιον": [{ORTH: "θο", NORM: "τό"}, {ORTH: "ἰμάτιον"}], + "ὥνεκα": [{ORTH: "ὥ", NORM: "οὗ"}, {ORTH: "νεκα", NORM: "ἕνεκα"}], + "τὠληθές": [{ORTH: "τὠ", NORM: "τὸ"}, {ORTH: "ληθές", NORM: "ἀληθές"}], + "θἡμέρᾳ": [{ORTH: "θ", NORM: "τῇ"}, {ORTH: "ἡμέρᾳ"}], + "ἅνθρωπος": [{ORTH: "ἅ", NORM: "ὁ"}, {ORTH: "νθρωπος", NORM: "ἄνθρωπος"}], + "τἄλλα": [{ORTH: "τ", NORM: "τὰ"}, {ORTH: 
"ἄλλα"}], + "τἆλλα": [{ORTH: "τἆ", NORM: "τὰ"}, {ORTH: "λλα", NORM: "ἄλλα"}], + "ἁνήρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νήρ", NORM: "ἀνήρ"}], + "ἁνὴρ": [{ORTH: "ἁ", NORM: "ὁ"}, {ORTH: "νὴρ", NORM: "ἀνήρ"}], + "ἅνδρες": [{ORTH: "ἅ", NORM: "οἱ"}, {ORTH: "νδρες", NORM: "ἄνδρες"}], + "ἁγαθαί": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαί", NORM: "ἀγαθαί"}], + "ἁγαθαὶ": [{ORTH: "ἁ", NORM: "αἱ"}, {ORTH: "γαθαὶ", NORM: "ἀγαθαί"}], + "ἁλήθεια": [{ORTH: "ἁ", NORM: "ἡ"}, {ORTH: "λήθεια", NORM: "ἀλήθεια"}], + "τἀνδρός": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρός"}], + "τἀνδρὸς": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "ἀνδρὸς", NORM: "ἀνδρός"}], + "τἀνδρί": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρί"}], + "τἀνδρὶ": [{ORTH: "τ", NORM: "τῷ"}, {ORTH: "ἀνδρὶ", NORM: "ἀνδρί"}], + "αὑτός": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τός", NORM: "αὐτός"}], + "αὑτὸς": [{ORTH: "αὑ", NORM: "ὁ"}, {ORTH: "τὸς", NORM: "αὐτός"}], + "ταὐτοῦ": [{ORTH: "τ", NORM: "τοῦ"}, {ORTH: "αὐτοῦ"}], +} + +_exc.update(_other_exc) + +_exc_data = {} + +_exc.update(_exc_data) + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 25cbaa651..672a8698e 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -1,7 +1,11 @@ +from typing import Optional +from thinc.api import Model + from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language +from .lemmatizer import ItalianLemmatizer class ItalianDefaults(Language.Defaults): @@ -16,4 +20,16 @@ class Italian(Language): Defaults = ItalianDefaults +@Italian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + + __all__ = ["Italian"] diff --git a/spacy/lang/it/lemmatizer.py b/spacy/lang/it/lemmatizer.py new file mode 100644 index 000000000..e44e64e3a --- /dev/null +++ b/spacy/lang/it/lemmatizer.py @@ -0,0 +1,132 @@ +from typing import List, Dict, Tuple + +from ...pipeline import Lemmatizer +from ...tokens import Token + + +class ItalianLemmatizer(Lemmatizer): + """This lemmatizer was adapted from the Polish one (version of April 2021). + It implements lookup lemmatization based on the morphological lexicon + morph-it (Baroni and Zanchetta). 
The table lemma_lookup with non-POS-aware + entries is used as a backup for words that aren't handled by morph-it.""" + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "pos_lookup": + required = [ + "lemma_lookup_num", + "lemma_lookup_det", + "lemma_lookup_adp", + "lemma_lookup_adj", + "lemma_lookup_noun", + "lemma_lookup_pron", + "lemma_lookup_verb", + "lemma_lookup_aux", + "lemma_lookup_adv", + "lemma_lookup_other", + "lemma_lookup", + ] + return (required, []) + else: + return super().get_lookups_config(mode) + + def pos_lookup_lemmatize(self, token: Token) -> List[str]: + string = token.text + univ_pos = token.pos_ + morphology = token.morph.to_dict() + lookup_pos = univ_pos.lower() + if univ_pos == "PROPN": + lookup_pos = "noun" + elif univ_pos == "PART": + lookup_pos = "pron" + lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {}) + if univ_pos == "NOUN": + return self.lemmatize_noun(string, morphology, lookup_table) + else: + if univ_pos != "PROPN": + string = string.lower() + if univ_pos == "DET": + return self.lemmatize_det(string, morphology, lookup_table) + elif univ_pos == "PRON": + return self.lemmatize_pron(string, morphology, lookup_table) + elif univ_pos == "ADP": + return self.lemmatize_adp(string, morphology, lookup_table) + elif univ_pos == "ADJ": + return self.lemmatize_adj(string, morphology, lookup_table) + else: + lemma = lookup_table.get(string, "") + if not lemma: + lookup_table = self.lookups.get_table("lemma_lookup_other") + lemma = lookup_table.get(string, "") + if not lemma: + lookup_table = self.lookups.get_table( + "lemma_lookup" + ) # "legacy" lookup table + lemma = lookup_table.get(string, string.lower()) + return [lemma] + + def lemmatize_det( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + if string in [ + "l'", + "lo", + "la", + "i", + "gli", + "le", + ]: + return ["il"] + if string in ["un'", "un", "una"]: + return ["uno"] + return [lookup_table.get(string, string)] + + def lemmatize_pron( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + if string in [ + "l'", + "li", + "la", + "gli", + "le", + ]: + return ["lo"] + if string in ["un'", "un", "una"]: + return ["uno"] + lemma = lookup_table.get(string, string) + if lemma == "alcun": + lemma = "alcuno" + elif lemma == "qualcun": + lemma = "qualcuno" + return [lemma] + + def lemmatize_adp( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + if string == "d'": + return ["di"] + return [lookup_table.get(string, string)] + + def lemmatize_adj( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + lemma = lookup_table.get(string, string) + if lemma == "alcun": + lemma = "alcuno" + elif lemma == "qualcun": + lemma = "qualcuno" + return [lemma] + + def lemmatize_noun( + self, string: str, morphology: dict, lookup_table: Dict[str, str] + ) -> List[str]: + # this method is case-sensitive, in order to work + # for incorrectly tagged proper names + if string != string.lower(): + if string.lower() in lookup_table: + return [lookup_table[string.lower()]] + elif string in lookup_table: + return [lookup_table[string]] + return [string.lower()] + return [lookup_table.get(string, string)] diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 87c2929bf..42883863b 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -25,7 
+25,7 @@ for orth in [ "artt.", "att.", "avv.", - "Avv." + "Avv.", "by-pass", "c.d.", "c/c", diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index 9b800029c..8f2933670 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -27,7 +27,7 @@ _infixes = ( + [ r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), - r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 7fff3c3d2..5e95b4a8b 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,12 +1,14 @@ from typing import Optional + from thinc.api import Model -from .stop_words import STOP_WORDS +from .lemmatizer import DutchLemmatizer from .lex_attrs import LEX_ATTRS -from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES -from .lemmatizer import DutchLemmatizer +from .stop_words import STOP_WORDS +from .syntax_iterators import SYNTAX_ITERATORS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import Language @@ -16,6 +18,7 @@ class DutchDefaults(Language.Defaults): infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS stop_words = STOP_WORDS diff --git a/spacy/lang/nl/syntax_iterators.py b/spacy/lang/nl/syntax_iterators.py new file mode 100644 index 000000000..1959ba6e2 --- /dev/null +++ b/spacy/lang/nl/syntax_iterators.py @@ -0,0 +1,72 @@ +from typing import Union, Iterator + +from ...symbols import NOUN, PRON +from ...errors import Errors +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: + """ + Detect base noun phrases from a dependency parse. Works on Doc and Span. + The definition is inspired by https://www.nltk.org/book/ch07.html + Consider : [Noun + determinant / adjective] and also [Pronoun] + """ + # fmt: off + # labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on + doc = doclike.doc # Ensure works on both Doc and Span. + + # Check for dependencies: POS, DEP + if not doc.has_annotation("POS"): + raise ValueError(Errors.E1019) + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + + # See UD tags: https://universaldependencies.org/u/dep/index.html + # amod = adjectival modifier + # nmod:poss = possessive nominal modifier + # nummod = numeric modifier + # det = determiner + # det:poss = possessive determiner + noun_deps = [ + doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"] + ] + + # nsubj = nominal subject + # nsubj:pass = passive nominal subject + pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]] + + # Label NP for the Span to identify it as Noun-Phrase + span_label = doc.vocab.strings.add("NP") + + # Only NOUNS and PRONOUNS matter + for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)): + # For NOUNS + # Pick children from syntactic parse (only those with certain dependencies) + if word.pos == NOUN: + # Some debugging. 
It happens that VERBS are POS-TAGGED as NOUNS + # We check if the word has a "nsubj", if it's the case, we eliminate it + nsubjs = filter( + lambda x: x.dep == doc.vocab.strings["nsubj"], word.children + ) + next_word = next(nsubjs, None) + if next_word is not None: + # We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN + continue + + children = filter(lambda x: x.dep in noun_deps, word.children) + children_i = [c.i for c in children] + [word.i] + + start_span = min(children_i) + end_span = max(children_i) + 1 + yield start_span, end_span, span_label + + # PRONOUNS only if it is the subject of a verb + elif word.pos == PRON: + if word.dep in pronoun_deps: + start_span = word.i + end_span = word.i + 1 + yield start_span, end_span, span_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index c337b9bc3..399cd174c 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -12,8 +12,6 @@ PUNCT_RULES = {"«": '"', "»": '"'} class RussianLemmatizer(Lemmatizer): - _morph = None - def __init__( self, vocab: Vocab, @@ -23,15 +21,16 @@ class RussianLemmatizer(Lemmatizer): mode: str = "pymorphy2", overwrite: bool = False, ) -> None: - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Russian lemmatizer requires the pymorphy2 library: " - 'try to fix it with "pip install pymorphy2"' - ) from None - if RussianLemmatizer._morph is None: - RussianLemmatizer._morph = MorphAnalyzer() + if mode == "pymorphy2": + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian lemmatizer mode 'pymorphy2' requires the " + "pymorphy2 library. Install it with: pip install pymorphy2" + ) from None + if getattr(self, "_morph", None) is None: + self._morph = MorphAnalyzer() super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) def pymorphy2_lemmatize(self, token: Token) -> List[str]: diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 960302513..e41db911f 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -35,8 +35,8 @@ URL_PATTERN = ( # host & domain names # mods: match is case-sensitive, so include [A-Z] r"(?:" # noqa: E131 - r"(?:" - r"[A-Za-z0-9\u00a1-\uffff]" + r"(?:" # noqa: E131 + r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131 r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}" r")?" r"[A-Za-z0-9\u00a1-\uffff]\." diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 0b4435a21..1fb030e06 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -7,8 +7,6 @@ from ...vocab import Vocab class UkrainianLemmatizer(RussianLemmatizer): - _morph = None - def __init__( self, vocab: Vocab, @@ -18,14 +16,15 @@ class UkrainianLemmatizer(RussianLemmatizer): mode: str = "pymorphy2", overwrite: bool = False, ) -> None: - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Ukrainian lemmatizer requires the pymorphy2 library and " - "dictionaries: try to fix it with " - '"pip install pymorphy2 pymorphy2-dicts-uk"' - ) from None - if UkrainianLemmatizer._morph is None: - UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") + if mode == "pymorphy2": + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Ukrainian lemmatizer mode 'pymorphy2' requires the " + "pymorphy2 library and dictionaries. 
Install them with: " + "pip install pymorphy2 pymorphy2-dicts-uk" + ) from None + if getattr(self, "_morph", None) is None: + self._morph = MorphAnalyzer(lang="uk") super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) diff --git a/spacy/language.py b/spacy/language.py index 7786089a5..589dca2bf 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,4 +1,5 @@ -from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern +from typing import Iterator, Optional, Any, Dict, Callable, Iterable, TypeVar +from typing import Union, List, Pattern, overload from typing import Tuple from dataclasses import dataclass import random @@ -13,6 +14,7 @@ import srsly import multiprocessing as mp from itertools import chain, cycle from timeit import default_timer as timer +import traceback from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab @@ -433,9 +435,9 @@ class Language: default_config (Dict[str, Any]): Default configuration, describing the default values of the factory arguments. assigns (Iterable[str]): Doc/Token attributes assigned by this component, - e.g. "token.ent_id". Used for pipeline analyis. + e.g. "token.ent_id". Used for pipeline analysis. requires (Iterable[str]): Doc/Token attributes required by this component, - e.g. "token.ent_id". Used for pipeline analyis. + e.g. "token.ent_id". Used for pipeline analysis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. default_score_weights (Dict[str, float]): The scores to report during @@ -518,9 +520,9 @@ class Language: name (str): The name of the component factory. assigns (Iterable[str]): Doc/Token attributes assigned by this component, - e.g. "token.ent_id". Used for pipeline analyis. + e.g. "token.ent_id". Used for pipeline analysis. requires (Iterable[str]): Doc/Token attributes required by this component, - e.g. "token.ent_id". Used for pipeline analyis. + e.g. "token.ent_id". Used for pipeline analysis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. func (Optional[Callable]): Factory function if not used as a decorator. 
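For reference, the assigns/requires metadata documented above is what nlp.analyze_pipes() reads when it checks a pipeline for missing or conflicting annotations. A minimal sketch of a stateless component that declares what it sets — the component name and the custom extension attribute are illustrative assumptions, not part of this changeset:

    from spacy.language import Language
    from spacy.tokens import Doc, Token

    # Register a custom token attribute before the component uses it.
    Token.set_extension("is_acronym", default=False)

    @Language.component("mark_acronyms", assigns=["token._.is_acronym"])
    def mark_acronyms(doc: Doc) -> Doc:
        # Flag all-uppercase tokens longer than one character.
        for token in doc:
            token._.is_acronym = token.is_upper and len(token) > 1
        return doc

    # Usage (assuming an existing nlp object):
    #   nlp.add_pipe("mark_acronyms")
    #   nlp.analyze_pipes(pretty=True)  # reports the declared "token._.is_acronym" assignment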
@@ -686,11 +688,13 @@ class Language: if not isinstance(source, Language): raise ValueError(Errors.E945.format(name=source_name, source=type(source))) # Check vectors, with faster checks first - if self.vocab.vectors.shape != source.vocab.vectors.shape or \ - self.vocab.vectors.key2row != source.vocab.vectors.key2row or \ - self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes(): + if ( + self.vocab.vectors.shape != source.vocab.vectors.shape + or self.vocab.vectors.key2row != source.vocab.vectors.key2row + or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes() + ): warnings.warn(Warnings.W113.format(name=source_name)) - if not source_name in source.component_names: + if source_name not in source.component_names: raise KeyError( Errors.E944.format( name=source_name, @@ -868,14 +872,14 @@ class Language: DOCS: https://spacy.io/api/language#replace_pipe """ - if name not in self.pipe_names: + if name not in self.component_names: raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) if hasattr(factory_name, "__call__"): err = Errors.E968.format(component=repr(factory_name), name=name) raise ValueError(err) # We need to delegate to Language.add_pipe here instead of just writing # to Language.pipeline to make sure the configs are handled correctly - pipe_index = self.pipe_names.index(name) + pipe_index = self.component_names.index(name) self.remove_pipe(name) if not len(self._components) or pipe_index == len(self._components): # we have no components to insert before/after, or we're replacing the last component @@ -931,6 +935,7 @@ class Language: # because factory may be used for something else self._pipe_meta.pop(name) self._pipe_configs.pop(name) + self.meta.get("_sourced_vectors_hashes", {}).pop(name, None) # Make sure name is removed from the [initialize] config if name in self._config["initialize"]["components"]: self._config["initialize"]["components"].pop(name) @@ -1427,7 +1432,22 @@ class Language: except StopIteration: pass + _AnyContext = TypeVar("_AnyContext") + + @overload def pipe( + self, + texts: Iterable[Tuple[str, _AnyContext]], + *, + as_tuples: bool = ..., + batch_size: Optional[int] = ..., + disable: Iterable[str] = ..., + component_cfg: Optional[Dict[str, Dict[str, Any]]] = ..., + n_process: int = ..., + ) -> Iterator[Tuple[Doc, _AnyContext]]: + ... + + def pipe( # noqa: F811 self, texts: Iterable[str], *, @@ -1436,7 +1456,7 @@ class Language: disable: Iterable[str] = SimpleFrozenList(), component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, n_process: int = 1, - ): + ) -> Iterator[Doc]: """Process texts as a stream, and yield `Doc` objects in order. texts (Iterable[str]): A sequence of texts to process. @@ -1538,11 +1558,21 @@ class Language: # Cycle channels not to break the order of docs. # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable. 
- byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch)) - docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs) + byte_tuples = chain.from_iterable( + recv.recv() for recv in cycle(bytedocs_recv_ch) + ) try: - for i, (_, doc) in enumerate(zip(raw_texts, docs), 1): - yield doc + for i, (_, (byte_doc, byte_error)) in enumerate( + zip(raw_texts, byte_tuples), 1 + ): + if byte_doc is not None: + doc = Doc(self.vocab).from_bytes(byte_doc) + yield doc + elif byte_error is not None: + error = srsly.msgpack_loads(byte_error) + self.default_error_handler( + None, None, None, ValueError(Errors.E871.format(error=error)) + ) if i % batch_size == 0: # tell `sender` that one batch was consumed. sender.step() @@ -1667,6 +1697,8 @@ class Language: # If components are loaded from a source (existing models), we cache # them here so they're only loaded once source_nlps = {} + source_nlp_vectors_hashes = {} + nlp.meta["_sourced_vectors_hashes"] = {} for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: opts = ", ".join(pipeline.keys()) @@ -1691,17 +1723,33 @@ class Language: else: model = pipe_cfg["source"] if model not in source_nlps: - # We only need the components here and we need to init - # model with the same vocab as the current nlp object - source_nlps[model] = util.load_model(model, vocab=nlp.vocab) + # We only need the components here and we intentionally + # do not load the model with the same vocab because + # this would cause the vectors to be copied into the + # current nlp object (all the strings will be added in + # create_pipe_from_source) + source_nlps[model] = util.load_model(model) source_name = pipe_cfg.get("component", pipe_name) listeners_replaced = False if "replace_listeners" in pipe_cfg: for name, proc in source_nlps[model].pipeline: if source_name in getattr(proc, "listening_components", []): - source_nlps[model].replace_listeners(name, source_name, pipe_cfg["replace_listeners"]) + source_nlps[model].replace_listeners( + name, source_name, pipe_cfg["replace_listeners"] + ) listeners_replaced = True - nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="\\[W113\\]") + nlp.add_pipe( + source_name, source=source_nlps[model], name=pipe_name + ) + if model not in source_nlp_vectors_hashes: + source_nlp_vectors_hashes[model] = hash( + source_nlps[model].vocab.vectors.to_bytes() + ) + nlp.meta["_sourced_vectors_hashes"][ + pipe_name + ] = source_nlp_vectors_hashes[model] # Delete from cache if listeners were replaced if listeners_replaced: del source_nlps[model] @@ -1719,12 +1767,16 @@ class Language: for name, proc in nlp.pipeline: # Remove listeners not in the pipeline listener_names = getattr(proc, "listening_components", []) - unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names] + unused_listener_names = [ + ll for ll in listener_names if ll not in nlp.pipe_names + ] for listener_name in unused_listener_names: for listener in proc.listener_map.get(listener_name, []): proc.remove_listener(listener, listener_name) - for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer + for listener in getattr( + proc, "listening_components", [] + ): # e.g. tok2vec/transformer # If it's a component sourced from another pipeline, we check if # the tok2vec listeners should be replaced with standalone tok2vec # models (e.g. 
so component can be frozen without its performance @@ -1781,6 +1833,7 @@ class Language: raise ValueError(err) tok2vec = self.get_pipe(tok2vec_name) tok2vec_cfg = self.get_pipe_config(tok2vec_name) + tok2vec_model = tok2vec.model if ( not hasattr(tok2vec, "model") or not hasattr(tok2vec, "listener_map") @@ -1789,6 +1842,7 @@ class Language: ): raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec))) pipe_listeners = tok2vec.listener_map.get(pipe_name, []) + pipe = self.get_pipe(pipe_name) pipe_cfg = self._pipe_configs[pipe_name] if listeners: util.logger.debug(f"Replacing listeners of component '{pipe_name}'") @@ -1803,7 +1857,6 @@ class Language: n_listeners=len(pipe_listeners), ) raise ValueError(err) - pipe = self.get_pipe(pipe_name) # Update the config accordingly by copying the tok2vec model to all # sections defined in the listener paths for listener_path in listeners: @@ -1815,10 +1868,19 @@ class Language: name=pipe_name, tok2vec=tok2vec_name, path=listener_path ) raise ValueError(err) - util.set_dot_to_object(pipe_cfg, listener_path, tok2vec_cfg["model"]) + new_config = tok2vec_cfg["model"] + if "replace_listener_cfg" in tok2vec_model.attrs: + replace_func = tok2vec_model.attrs["replace_listener_cfg"] + new_config = replace_func( + tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"] + ) + util.set_dot_to_object(pipe_cfg, listener_path, new_config) # Go over the listener layers and replace them for listener in pipe_listeners: - util.replace_model_node(pipe.model, listener, tok2vec.model.copy()) + new_model = tok2vec_model.copy() + if "replace_listener" in tok2vec_model.attrs: + new_model = tok2vec_model.attrs["replace_listener"](new_model) + util.replace_model_node(pipe.model, listener, new_model) tok2vec.remove_listener(listener, pipe_name) def to_disk( @@ -1850,7 +1912,11 @@ class Language: util.to_disk(path, serializers, exclude) def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() + self, + path: Union[str, Path], + *, + exclude: Iterable[str] = SimpleFrozenList(), + overrides: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the @@ -1879,7 +1945,7 @@ class Language: deserializers = {} if Path(path / "config.cfg").exists(): deserializers["config.cfg"] = lambda p: self.config.from_disk( - p, interpolate=False + p, interpolate=False, overrides=overrides ) deserializers["meta.json"] = deserialize_meta deserializers["vocab"] = deserialize_vocab @@ -2036,12 +2102,19 @@ def _apply_pipes( """ Underscore.load_state(underscore_state) while True: - texts = receiver.get() - docs = (make_doc(text) for text in texts) - for pipe in pipes: - docs = pipe(docs) - # Connection does not accept unpickable objects, so send list. - sender.send([doc.to_bytes() for doc in docs]) + try: + texts = receiver.get() + docs = (make_doc(text) for text in texts) + for pipe in pipes: + docs = pipe(docs) + # Connection does not accept unpickable objects, so send list. 
+ byte_docs = [(doc.to_bytes(), None) for doc in docs] + padding = [(None, None)] * (len(texts) - len(byte_docs)) + sender.send(byte_docs + padding) + except Exception: + error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))] + padding = [(None, None)] * (len(texts) - 1) + sender.send(error_msg + padding) class _Sender: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index c8e0f2965..3564b6e42 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -163,7 +163,7 @@ cdef class Lexeme: self.vocab.set_vector(self.c.orth, vector) property rank: - """RETURNS (str): Sequential ID of the lexemes's lexical type, used + """RETURNS (str): Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors.""" def __get__(self): return self.c.id @@ -205,7 +205,7 @@ cdef class Lexeme: self.c.lower = x property norm: - """RETURNS (uint64): The lexemes's norm, i.e. a normalised form of the + """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the lexeme text. """ def __get__(self): @@ -288,7 +288,7 @@ cdef class Lexeme: self.c.lower = self.vocab.strings.add(x) property norm_: - """RETURNS (str): The lexemes's norm, i.e. a normalised form of the + """RETURNS (str): The lexeme's norm, i.e. a normalised form of the lexeme text. """ def __get__(self): diff --git a/spacy/lookups.py b/spacy/lookups.py index 76535d1de..025afa04b 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, List, Union, Optional +from typing import Any, List, Union, Optional from pathlib import Path import srsly from preshed.bloom import BloomFilter @@ -12,18 +12,16 @@ from .strings import get_string_id UNSET = object() -def load_lookups( - lang: str, tables: List[str], strict: bool = True -) -> Optional[Dict[str, Any]]: +def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups": """Load the data from the spacy-lookups-data package for a given language, - if available. Returns an empty dict if there's no data or if the package + if available. Returns an empty `Lookups` container if there's no data or if the package is not installed. lang (str): The language code (corresponds to entry point exposed by the spacy-lookups-data package). tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"] strict (bool): Whether to raise an error if a table doesn't exist. - RETURNS (Dict[str, Any]): The lookups, keyed by table name. + RETURNS (Lookups): The lookups container containing the loaded tables. """ # TODO: import spacy_lookups_data instead of going via entry points here? 
lookups = Lookups() diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 6fd8bdb03..7b1cfb633 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -290,7 +290,13 @@ cdef class Matcher: if on_match is not None: on_match(self, doc, i, final_matches) if as_spans: - return [Span(doc, start, end, label=key) for key, start, end in final_matches] + spans = [] + for key, start, end in final_matches: + if isinstance(doclike, Span): + start += doclike.start + end += doclike.start + spans.append(Span(doc, start, end, label=key)) + return spans elif with_alignments: # convert alignments List[Dict[str, int]] --> List[int] final_matches = [] diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index f5e5cd8ad..b99de2d2b 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -1,6 +1,9 @@ from thinc.api import Model, normal_init +from ..util import registry + +@registry.layers("spacy.PrecomputableAffine.v1") def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): model = Model( "precomputable_affine", diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py index 7e1cce884..c1c2929fd 100644 --- a/spacy/ml/extract_ngrams.py +++ b/spacy/ml/extract_ngrams.py @@ -1,8 +1,10 @@ from thinc.api import Model +from ..util import registry from ..attrs import LOWER +@registry.layers("spacy.extract_ngrams.v1") def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model: model = Model("extract_ngrams", forward) model.attrs["ngram_size"] = ngram_size diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py new file mode 100644 index 000000000..8afd1a3cc --- /dev/null +++ b/spacy/ml/extract_spans.py @@ -0,0 +1,60 @@ +from typing import Tuple, Callable +from thinc.api import Model, to_numpy +from thinc.types import Ragged, Ints1d + +from ..util import registry + + +@registry.layers("spacy.extract_spans.v1") +def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]: + """Extract spans from a sequence of source arrays, as specified by an array + of (start, end) indices. The output is a ragged array of the + extracted spans. + """ + return Model( + "extract_spans", forward, layers=[], refs={}, attrs={}, dims={}, init=init + ) + + +def init(model, X=None, Y=None): + pass + + +def forward( + model: Model, source_spans: Tuple[Ragged, Ragged], is_train: bool +) -> Tuple[Ragged, Callable]: + """Get subsequences from source vectors.""" + ops = model.ops + X, spans = source_spans + assert spans.dataXd.ndim == 2 + indices = _get_span_indices(ops, spans, X.lengths) + Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0]) + x_shape = X.dataXd.shape + x_lengths = X.lengths + + def backprop_windows(dY: Ragged) -> Tuple[Ragged, Ragged]: + dX = Ragged(ops.alloc2f(*x_shape), x_lengths) + ops.scatter_add(dX.dataXd, indices, dY.dataXd) + return (dX, spans) + + return Y, backprop_windows + + +def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d: + """Construct a flat array that has the indices we want to extract from the + source data. For instance, if we want the spans (5, 9), (8, 10) the + indices will be [5, 6, 7, 8, 8, 9]. 
+ """ + spans, lengths = _ensure_cpu(spans, lengths) + indices = [] + offset = 0 + for i, length in enumerate(lengths): + spans_i = spans[i].dataXd + offset + for j in range(spans_i.shape[0]): + indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) + offset += length + return ops.flatten(indices) + + +def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]: + return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths)) diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index f03237019..9b7628f0e 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,6 +1,7 @@ from .entity_linker import * # noqa from .multi_task import * # noqa from .parser import * # noqa +from .spancat import * # noqa from .tagger import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 21e1c53b9..645b67c62 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -6,12 +6,13 @@ from thinc.api import Model, Maxout, Linear from ...util import registry from ...kb import KnowledgeBase, Candidate, get_candidates from ...vocab import Vocab +from ...tokens import Span @registry.architectures("spacy.EntityLinker.v1") def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: with Model.define_operators({">>": chain, "**": clone}): - token_width = tok2vec.get_dim("nO") + token_width = tok2vec.maybe_get_dim("nO") output_layer = Linear(nO=nO, nI=token_width) model = ( tok2vec @@ -44,5 +45,5 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]: @registry.misc("spacy.CandidateGenerator.v1") -def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]: +def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: return get_candidates diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index cbfa59eea..97bef2d0e 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -3,7 +3,7 @@ from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Mode from thinc.api import MultiSoftmax, list2array from thinc.api import to_categorical, CosineDistance, L2Distance -from ...util import registry +from ...util import registry, OOV_RANK from ...errors import Errors from ...attrs import ID @@ -13,7 +13,7 @@ from functools import partial if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from ...vocab import Vocab # noqa: F401 - from ...tokens import Doc # noqa: F401 + from ...tokens.doc import Doc # noqa: F401 @registry.architectures("spacy.PretrainVectors.v1") @@ -70,6 +70,7 @@ def get_vectors_loss(ops, docs, prediction, distance): # and look them up all at once. This prevents data copying. 
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] + target[ids == OOV_RANK] = 0 d_target, loss = distance(prediction, target) return loss, d_target @@ -205,7 +206,7 @@ def _apply_mask( docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15 ) -> Tuple[numpy.ndarray, List["Doc"]]: # This needs to be here to avoid circular imports - from ...tokens import Doc # noqa: F811 + from ...tokens.doc import Doc # noqa: F811 N = sum(len(doc) for doc in docs) mask = numpy.random.uniform(0.0, 1.0, (N,)) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 861094209..80751a695 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -10,48 +10,7 @@ from ..tb_framework import TransitionModel from ...tokens import Doc -@registry.architectures("spacy.TransitionBasedParser.v1") -def transition_parser_v1( - tok2vec: Model[List[Doc], List[Floats2d]], - state_type: Literal["parser", "ner"], - extra_state_tokens: bool, - hidden_width: int, - maxout_pieces: int, - use_upper: bool = True, - nO: Optional[int] = None, -) -> Model: - return build_tb_parser_model( - tok2vec, - state_type, - extra_state_tokens, - hidden_width, - maxout_pieces, - use_upper, - nO, - ) - - @registry.architectures("spacy.TransitionBasedParser.v2") -def transition_parser_v2( - tok2vec: Model[List[Doc], List[Floats2d]], - state_type: Literal["parser", "ner"], - extra_state_tokens: bool, - hidden_width: int, - maxout_pieces: int, - use_upper: bool, - nO: Optional[int] = None, -) -> Model: - return build_tb_parser_model( - tok2vec, - state_type, - extra_state_tokens, - hidden_width, - maxout_pieces, - use_upper, - nO, - ) - - def build_tb_parser_model( tok2vec: Model[List[Doc], List[Floats2d]], state_type: Literal["parser", "ner"], diff --git a/spacy/ml/models/spancat.py b/spacy/ml/models/spancat.py new file mode 100644 index 000000000..5c49fef40 --- /dev/null +++ b/spacy/ml/models/spancat.py @@ -0,0 +1,54 @@ +from typing import List, Tuple +from thinc.api import Model, with_getitem, chain, list2ragged, Logistic +from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init +from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last +from thinc.types import Ragged, Floats2d + +from ...util import registry +from ...tokens import Doc +from ..extract_spans import extract_spans + + +@registry.layers.register("spacy.LinearLogistic.v1") +def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]: + """An output layer for multi-label classification. It uses a linear layer + followed by a logistic activation. + """ + return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic()) + + +@registry.layers.register("spacy.mean_max_reducer.v1") +def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]: + """Reduce sequences by concatenating their mean and max pooled vectors, + and then combine the concatenated vectors with a hidden layer. 
+ """ + return chain( + concatenate(reduce_last(), reduce_first(), reduce_mean(), reduce_max()), + Maxout(nO=hidden_size, normalize=True, dropout=0.0), + ) + + +@registry.architectures.register("spacy.SpanCategorizer.v1") +def build_spancat_model( + tok2vec: Model[List[Doc], List[Floats2d]], + reducer: Model[Ragged, Floats2d], + scorer: Model[Floats2d, Floats2d], +) -> Model[Tuple[List[Doc], Ragged], Floats2d]: + """Build a span categorizer model, given a token-to-vector model, a + reducer model to map the sequence of vectors for each span down to a single + vector, and a scorer model to map the vectors to probabilities. + + tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model. + reducer (Model[Ragged, Floats2d]): The reducer model. + scorer (Model[Floats2d, Floats2d]): The scorer model. + """ + model = chain( + with_getitem(0, chain(tok2vec, list2ragged())), + extract_spans(), + reducer, + scorer, + ) + model.set_ref("tok2vec", tok2vec) + model.set_ref("reducer", reducer) + model.set_ref("scorer", scorer) + return model diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index a1855c5a0..e3f6e944a 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,11 +1,13 @@ +from functools import partial from typing import Optional, List from thinc.types import Floats2d from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum -from thinc.api import with_cpu, Relu, residual, LayerNorm +from thinc.api import with_cpu, Relu, residual, LayerNorm, resizable from thinc.layers.chain import init as init_chain +from thinc.layers.resizable import resize_model, resize_linear_weighted from ...attrs import ORTH from ...util import registry @@ -15,7 +17,10 @@ from ...tokens import Doc from .tok2vec import get_tok2vec_width -@registry.architectures("spacy.TextCatCNN.v1") +NEG_VALUE = -5000 + + +@registry.architectures("spacy.TextCatCNN.v2") def build_simple_cnn_text_classifier( tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None ) -> Model[List[Doc], Floats2d]: @@ -25,38 +30,75 @@ def build_simple_cnn_text_classifier( outputs sum to 1. If exclusive_classes=False, a logistic non-linearity is applied instead, so that outputs are in the range [0, 1]. 
""" + fill_defaults = {"b": 0, "W": 0} with Model.define_operators({">>": chain}): cnn = tok2vec >> list2ragged() >> reduce_mean() + nI = tok2vec.maybe_get_dim("nO") if exclusive_classes: - output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = cnn >> output_layer - model.set_ref("output_layer", output_layer) + output_layer = Softmax(nO=nO, nI=nI) + fill_defaults["b"] = NEG_VALUE + resizable_layer = resizable( + output_layer, + resize_layer=partial( + resize_linear_weighted, fill_defaults=fill_defaults + ), + ) + model = cnn >> resizable_layer else: - linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) - model = cnn >> linear_layer >> Logistic() - model.set_ref("output_layer", linear_layer) + output_layer = Linear(nO=nO, nI=nI) + resizable_layer = resizable( + output_layer, + resize_layer=partial( + resize_linear_weighted, fill_defaults=fill_defaults + ), + ) + model = cnn >> resizable_layer >> Logistic() + model.set_ref("output_layer", output_layer) + model.attrs["resize_output"] = partial( + resize_and_set_ref, + resizable_layer=resizable_layer, + ) model.set_ref("tok2vec", tok2vec) model.set_dim("nO", nO) model.attrs["multi_label"] = not exclusive_classes return model -@registry.architectures("spacy.TextCatBOW.v1") +def resize_and_set_ref(model, new_nO, resizable_layer): + resizable_layer = resize_model(resizable_layer, new_nO) + model.set_ref("output_layer", resizable_layer.layers[0]) + model.set_dim("nO", new_nO, force=True) + return model + + +@registry.architectures("spacy.TextCatBOW.v2") def build_bow_text_classifier( exclusive_classes: bool, ngram_size: int, no_output_layer: bool, nO: Optional[int] = None, ) -> Model[List[Doc], Floats2d]: + fill_defaults = {"b": 0, "W": 0} with Model.define_operators({">>": chain}): - sparse_linear = SparseLinear(nO) - model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear - model = with_cpu(model, model.ops) + sparse_linear = SparseLinear(nO=nO) + output_layer = None if not no_output_layer: + fill_defaults["b"] = NEG_VALUE output_layer = softmax_activation() if exclusive_classes else Logistic() + resizable_layer = resizable( + sparse_linear, + resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults), + ) + model = extract_ngrams(ngram_size, attr=ORTH) >> resizable_layer + model = with_cpu(model, model.ops) + if output_layer: model = model >> with_cpu(output_layer, output_layer.ops) + model.set_dim("nO", nO) model.set_ref("output_layer", sparse_linear) model.attrs["multi_label"] = not exclusive_classes + model.attrs["resize_output"] = partial( + resize_and_set_ref, resizable_layer=resizable_layer + ) return model @@ -69,9 +111,7 @@ def build_text_classifier_v2( exclusive_classes = not linear_model.attrs["multi_label"] with Model.define_operators({">>": chain, "|": concatenate}): width = tok2vec.maybe_get_dim("nO") - attention_layer = ParametricAttention( - width - ) # TODO: benchmark performance difference of this layer + attention_layer = ParametricAttention(width) maxout_layer = Maxout(nO=width, nI=width) norm_layer = LayerNorm(nI=width) cnn_model = ( diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 4ab5830cd..ab4a969e2 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,7 +1,9 @@ from thinc.api import Model, noop from .parser_model import ParserStepModel +from ..util import registry +@registry.layers("spacy.TransitionModel.v1") def TransitionModel( tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set() ): @@ -15,7 +17,7 @@ def 
TransitionModel( return Model( name="parser_model", forward=forward, - dims={"nI": tok2vec.get_dim("nI") if tok2vec.has_dim("nI") else None}, + dims={"nI": tok2vec.maybe_get_dim("nI")}, layers=[tok2vec, lower, upper], refs={"tok2vec": tok2vec, "lower": lower, "upper": upper}, init=init, diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 1fa53a556..7b483724c 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -11,6 +11,7 @@ from .senter import SentenceRecognizer from .sentencizer import Sentencizer from .tagger import Tagger from .textcat import TextCategorizer +from .spancat import SpanCategorizer from .textcat_multilabel import MultiLabel_TextCategorizer from .tok2vec import Tok2Vec from .functions import merge_entities, merge_noun_chunks, merge_subtokens @@ -27,6 +28,7 @@ __all__ = [ "Pipe", "SentenceRecognizer", "Sentencizer", + "SpanCategorizer", "Tagger", "TextCategorizer", "Tok2Vec", diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index dd747c08e..3edeff19a 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -1,3 +1,5 @@ +import os +import random from libc.stdint cimport int32_t from cymem.cymem cimport Pool @@ -6,10 +8,11 @@ from thinc.extra.search cimport Beam from ...tokens.doc cimport Doc from ...tokens.span import Span +from ...tokens.span cimport Span from ...typedefs cimport weight_t, attr_t from ...lexeme cimport Lexeme from ...attrs cimport IS_SPACE -from ...structs cimport TokenC +from ...structs cimport TokenC, SpanC from ...training.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC @@ -25,7 +28,6 @@ cdef enum: LAST UNIT OUT - ISNT N_MOVES @@ -36,39 +38,62 @@ MOVE_NAMES[IN] = 'I' MOVE_NAMES[LAST] = 'L' MOVE_NAMES[UNIT] = 'U' MOVE_NAMES[OUT] = 'O' -MOVE_NAMES[ISNT] = 'x' cdef struct GoldNERStateC: Transition* ner + SpanC* negs int32_t length + int32_t nr_neg cdef class BiluoGold: cdef Pool mem cdef GoldNERStateC c - def __init__(self, BiluoPushDown moves, StateClass stcls, Example example): + def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key): self.mem = Pool() - self.c = create_gold_state(self.mem, moves, stcls.c, example) + self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key) def update(self, StateClass stcls): update_gold_state(&self.c, stcls.c) - cdef GoldNERStateC create_gold_state( Pool mem, BiluoPushDown moves, const StateC* stcls, - Example example + Example example, + neg_key ) except *: cdef GoldNERStateC gs + cdef Span neg + if neg_key is not None: + negs = example.get_aligned_spans_y2x( + example.y.spans.get(neg_key, []), + allow_overlap=True + ) + else: + negs = [] assert example.x.length > 0 gs.ner = mem.alloc(example.x.length, sizeof(Transition)) - ner_tags = example.get_aligned_ner() + gs.negs = mem.alloc(len(negs), sizeof(SpanC)) + gs.nr_neg = len(negs) + ner_ents, ner_tags = example.get_aligned_ents_and_ner() for i, ner_tag in enumerate(ner_tags): gs.ner[i] = moves.lookup_transition(ner_tag) + + # Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label. 
+ neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs} + for pos_span in ner_ents: + if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples: + raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_))) + + # In order to handle negative samples, we need to maintain the full + # (start, end, label) triple. If we break it down to the 'isnt B-LOC' + # thing, we'll get blocked if there's an incorrect prefix. + for i, neg in enumerate(negs): + gs.negs[i] = neg.c return gs @@ -156,21 +181,16 @@ cdef class BiluoPushDown(TransitionSystem): cdef attr_t label if name == '-' or name == '' or name is None: return Transition(clas=0, move=MISSING, label=0, score=0) - elif name == '!O': - return Transition(clas=0, move=ISNT, label=0, score=0) elif '-' in name: move_str, label_str = name.split('-', 1) - # Hacky way to denote 'not this entity' + # Deprecated, hacky way to denote 'not this entity' if label_str.startswith('!'): - label_str = label_str[1:] - move_str = 'x' + raise ValueError(Errors.E869.format(label=name)) label = self.strings.add(label_str) else: move_str = name label = 0 move = MOVE_NAMES.index(move_str) - if move == ISNT: - return Transition(clas=0, move=ISNT, label=label, score=0) for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] @@ -220,7 +240,7 @@ cdef class BiluoPushDown(TransitionSystem): label_id = label_name if action == OUT and label_id != 0: return None - if action == MISSING or action == ISNT: + if action == MISSING: return None # Check we're not creating a move we already have, so that this is # idempotent @@ -247,7 +267,7 @@ cdef class BiluoPushDown(TransitionSystem): for i in range(state.c._ents.size()): ent = state.c._ents.at(i) if ent.start != -1 and ent.end != -1: - ents.append(Span(doc, ent.start, ent.end, label=ent.label)) + ents.append(Span(doc, ent.start, ent.end, label=ent.label, kb_id=doc.c[ent.start].ent_kb_id)) doc.set_ents(ents, default="unmodified") # Set non-blocked tokens to O for i in range(doc.length): @@ -270,9 +290,23 @@ cdef class BiluoPushDown(TransitionSystem): return parses def init_gold(self, StateClass state, Example example): - return BiluoGold(self, state, example) + return BiluoGold(self, state, example, self.neg_key) def has_gold(self, Example eg, start=0, end=None): + # We get x and y referring to X, we want to check relative to Y, + # the reference + y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]]) + if not y_spans: + y_spans = [eg.y[:]] + y_span = y_spans[0] + start = y_span.start + end = y_span.end + neg_key = self.neg_key + if neg_key is not None: + # If we have any negative samples, count that as having annotation. 
+ for span in eg.y.spans.get(neg_key, []): + if span.start >= start and span.end <= end: + return True for word in eg.y[start:end]: if word.ent_iob != 0: return True @@ -306,8 +340,6 @@ cdef class BiluoPushDown(TransitionSystem): n_gold += costs[i] <= 0 else: costs[i] = 9000 - if n_gold < 1: - raise ValueError cdef class Missing: @@ -373,23 +405,33 @@ cdef class Begin: @staticmethod cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: gold = _gold - cdef int g_act = gold.ner[s.B(0)].move - cdef attr_t g_tag = gold.ner[s.B(0)].label + b0 = s.B(0) + cdef int cost = 0 + cdef int g_act = gold.ner[b0].move + cdef attr_t g_tag = gold.ner[b0].label if g_act == MISSING: - return 0 + pass elif g_act == BEGIN: # B, Gold B --> Label match - return label != g_tag - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return label == g_tag + cost += label != g_tag else: # B, Gold I --> False (P) # B, Gold L --> False (P) # B, Gold O --> False (P) # B, Gold U --> False (P) - return 1 + cost += 1 + if s.buffer_length() < 3: + # Handle negatives. In general we can't really do much to block + # B, because we don't know whether the whole entity is going to + # be correct or not. However, we can at least tell whether we're + # going to be opening an entity where there's only one possible + # L. + for span in gold.negs[:gold.nr_neg]: + if span.label == label and span.start == b0: + cost += 1 + break + return cost cdef class In: @@ -462,9 +504,6 @@ cdef class In: elif g_act == UNIT: # I, Gold U --> True iff next tag == O return next_act != OUT - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return 0 else: return 1 @@ -504,32 +543,41 @@ cdef class Last: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: gold = _gold move = LAST + b0 = s.B(0) + ent_start = s.E(0) - cdef int g_act = gold.ner[s.B(0)].move - cdef attr_t g_tag = gold.ner[s.B(0)].label + cdef int g_act = gold.ner[b0].move + cdef attr_t g_tag = gold.ner[b0].label + + cdef int cost = 0 if g_act == MISSING: - return 0 + pass elif g_act == BEGIN: # L, Gold B --> True - return 0 + pass elif g_act == IN: # L, Gold I --> True iff this entity sunk - return not _entity_is_sunk(s, gold.ner) + cost += not _entity_is_sunk(s, gold.ner) elif g_act == LAST: # L, Gold L --> True - return 0 + pass elif g_act == OUT: # L, Gold O --> True - return 0 + pass elif g_act == UNIT: # L, Gold U --> True - return 0 - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return 0 + pass else: - return 1 + cost += 1 + # If we have negative-example entities, integrate them into the objective, + # by marking actions that close an entity that we know is incorrect + # as costly. + for span in gold.negs[:gold.nr_neg]: + if span.label == label and (span.end-1) == b0 and span.start == ent_start: + cost += 1 + break + return cost cdef class Unit: @@ -573,21 +621,29 @@ cdef class Unit: gold = _gold cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label + cdef int cost = 0 if g_act == MISSING: - return 0 + pass elif g_act == UNIT: # U, Gold U --> True iff tag match - return label != g_tag - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return label == g_tag + cost += label != g_tag else: # U, Gold B --> False # U, Gold I --> False # U, Gold L --> False # U, Gold O --> False - return 1 + cost += 1 + # If we have negative-example entities, integrate them into the objective. 
+ # This is fairly straight-forward for U- entities, as we have a single + # action + cdef int b0 = s.B(0) + for span in gold.negs[:gold.nr_neg]: + if span.label == label and span.start == b0 and span.end == (b0+1): + cost += 1 + break + return cost + cdef class Out: @@ -613,25 +669,24 @@ cdef class Out: gold = _gold cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label - - if g_act == ISNT and g_tag == 0: - return 1 - elif g_act == MISSING or g_act == ISNT: - return 0 + cdef weight_t cost = 0 + if g_act == MISSING: + pass elif g_act == BEGIN: # O, Gold B --> False - return 1 + cost += 1 elif g_act == IN: # O, Gold I --> True - return 0 + pass elif g_act == LAST: # O, Gold L --> True - return 0 + pass elif g_act == OUT: # O, Gold O --> True - return 0 + pass elif g_act == UNIT: # O, Gold U --> False - return 1 + cost += 1 else: - return 1 + cost += 1 + return cost diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd index eed347b98..52ebd2b8e 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pxd +++ b/spacy/pipeline/_parser_internals/transition_system.pxd @@ -41,6 +41,7 @@ cdef class TransitionSystem: cdef public attr_t root_label cdef public freqs cdef public object labels + cdef public object cfg cdef init_state_t init_beam_state cdef del_state_t del_beam_state diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 9e6f847eb..18eb745a9 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -33,7 +33,14 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1: cdef class TransitionSystem: - def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None): + def __init__( + self, + StringStore string_table, + labels_by_action=None, + min_freq=None, + incorrect_spans_key=None + ): + self.cfg = {"neg_key": incorrect_spans_key} self.mem = Pool() self.strings = string_table self.n_moves = 0 @@ -49,8 +56,13 @@ cdef class TransitionSystem: self.del_beam_state = _del_state def __reduce__(self): + # TODO: This loses the 'cfg' return (self.__class__, (self.strings, self.labels), None, None) + @property + def neg_key(self): + return self.cfg.get("neg_key") + def init_batch(self, docs): cdef StateClass state states = [] @@ -220,16 +232,21 @@ cdef class TransitionSystem: transitions = [] serializers = { 'moves': lambda: srsly.json_dumps(self.labels), - 'strings': lambda: self.strings.to_bytes() + 'strings': lambda: self.strings.to_bytes(), + 'cfg': lambda: self.cfg } return util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, exclude=tuple()): + # We're adding a new field, 'cfg', here and we don't want to break + # previous models that don't have it. 
+ msg = srsly.msgpack_loads(bytes_data) labels = {} - deserializers = { - 'moves': lambda b: labels.update(srsly.json_loads(b)), - 'strings': lambda b: self.strings.from_bytes(b) - } - msg = util.from_bytes(bytes_data, deserializers, exclude) + if 'moves' not in exclude: + labels.update(srsly.json_loads(msg['moves'])) + if 'strings' not in exclude: + self.strings.from_bytes(msg['strings']) + if 'cfg' not in exclude and 'cfg' in msg: + self.cfg.update(msg['cfg']) self.initialize_actions(labels) return self diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 79ec9c993..a6efd5906 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -106,7 +106,7 @@ class AttributeRuler(Pipe): def match(self, doc: Doc): matches = self.matcher(doc, allow_missing=True) - # Sort by the attribute ID, so that later rules have precendence + # Sort by the attribute ID, so that later rules have precedence matches = [ (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches ] diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 37f09ce3a..be23ab0dd 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -3,6 +3,7 @@ from collections import defaultdict from typing import Optional, Iterable from thinc.api import Model, Config +from ._parser_internals.transition_system import TransitionSystem from .transition_parser cimport Parser from ._parser_internals.arc_eager cimport ArcEager @@ -59,7 +60,7 @@ def make_parser( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int @@ -85,13 +86,13 @@ def make_parser( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (List[str]): A list of transition names. Inferred from the data if not - provided. - update_with_oracle_cut_size (int): - During training, cut long sequences into shorter segments by creating - intermediate states based on the gold-standard history. The model is - not very sensitive to this parameter, so you usually won't need to change - it. 100 is a good default. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. learn_tokens (bool): Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. min_action_freq (int): The minimum frequency of labelled actions to retain. @@ -112,6 +113,9 @@ def make_parser( beam_width=1, beam_density=0.0, beam_update_prob=0.0, + # At some point in the future we can try to implement support for + # partial annotations, perhaps only in the beam objective. 
+ incorrect_spans_key=None ) @Language.factory( @@ -140,7 +144,7 @@ def make_beam_parser( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int, @@ -165,8 +169,13 @@ def make_beam_parser( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (List[str]): A list of transition names. Inferred from the data if not - provided. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. beam_width (int): The number of candidate analyses to maintain. beam_density (float): The minimum ratio between the scores of the first and last candidates in the beam. This allows the parser to avoid exploring @@ -195,7 +204,10 @@ def make_beam_parser( beam_update_prob=beam_update_prob, multitasks=[], learn_tokens=learn_tokens, - min_action_freq=min_action_freq + min_action_freq=min_action_freq, + # At some point in the future we can try to implement support for + # partial annotations, perhaps only in the beam objective. + incorrect_spans_key=None ) @@ -206,6 +218,39 @@ cdef class DependencyParser(Parser): """ TransitionSystem = ArcEager + def __init__( + self, + vocab, + model, + name="parser", + moves=None, + *, + update_with_oracle_cut_size=100, + min_action_freq=30, + learn_tokens=False, + beam_width=1, + beam_density=0.0, + beam_update_prob=0.0, + multitasks=tuple(), + incorrect_spans_key=None, + ): + """Create a DependencyParser. + """ + super().__init__( + vocab, + model, + name, + moves, + update_with_oracle_cut_size=update_with_oracle_cut_size, + min_action_freq=min_action_freq, + learn_tokens=learn_tokens, + beam_width=beam_width, + beam_density=beam_density, + beam_update_prob=beam_update_prob, + multitasks=multitasks, + incorrect_spans_key=incorrect_spans_key, + ) + @property def postprocesses(self): output = [nonproj.deprojectivize] diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 21d5e9db1..ba7e71f15 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -9,7 +9,7 @@ import warnings from ..kb import KnowledgeBase, Candidate from ..ml import empty_kb -from ..tokens import Doc +from ..tokens import Doc, Span from .pipe import deserialize_config from .trainable_pipe import TrainablePipe from ..language import Language @@ -67,7 +67,7 @@ def make_entity_linker( incl_prior: bool, incl_context: bool, entity_vector_length: int, - get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]], + get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], ): """Construct an EntityLinker component. @@ -114,7 +114,7 @@ class EntityLinker(TrainablePipe): incl_prior: bool, incl_context: bool, entity_vector_length: int, - get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]], + get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], ) -> None: """Initialize an entity linker. 
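As context for the get_candidates signature above, a hypothetical custom candidate generator with the same shape as spacy.CandidateGenerator.v1 — the registry name and the lowercasing behaviour are illustrative assumptions, not part of this changeset:

    from typing import Callable, Iterable
    from spacy.kb import Candidate, KnowledgeBase
    from spacy.tokens import Span
    from spacy.util import registry

    @registry.misc("lowercase_candidates.v1")
    def create_lowercase_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
        def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
            # Look up KB aliases for the lowercased mention text instead of the verbatim text.
            return kb.get_alias_candidates(mention.text.lower())
        return get_candidates

A pipeline config could then point [components.entity_linker.get_candidates] at @misc = "lowercase_candidates.v1" instead of the default generator, assuming the standard registry-based config mechanism.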
@@ -127,7 +127,7 @@ class EntityLinker(TrainablePipe): incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_context (bool): Whether or not to include the local context in the model. entity_vector_length (int): Size of encoding vectors in the KB. - get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that + get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. DOCS: https://spacy.io/api/entitylinker#init @@ -142,7 +142,7 @@ class EntityLinker(TrainablePipe): self.get_candidates = get_candidates self.cfg = {} self.distance = CosineDistance(normalize=False) - # how many neightbour sentences to take into account + # how many neighbour sentences to take into account # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. self.kb = empty_kb(entity_vector_length)(self.vocab) @@ -156,6 +156,8 @@ class EntityLinker(TrainablePipe): def validate_kb(self) -> None: # Raise an error if the knowledge base is not initialized. + if self.kb is None: + raise ValueError(Errors.E1018.format(name=self.name)) if len(self.kb) == 0: raise ValueError(Errors.E139.format(name=self.name)) @@ -305,11 +307,9 @@ class EntityLinker(TrainablePipe): sent = ent.sent sent_index = sentences.index(sent) assert sent_index >= 0 - # get n_neightbour sentences, clipped to the length of the document + # get n_neighbour sentences, clipped to the length of the document start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min( - len(sentences) - 1, sent_index + self.n_sents - ) + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) start_token = sentences[start_sentence].start end_token = sentences[end_sentence].end sent_doc = doc[start_token:end_token].as_doc() @@ -335,22 +335,16 @@ class EntityLinker(TrainablePipe): else: random.shuffle(candidates) # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray( - [c.prior_prob for c in candidates] - ) + prior_probs = xp.asarray([c.prior_prob for c in candidates]) if not self.incl_prior: - prior_probs = xp.asarray( - [0.0 for _ in candidates] - ) + prior_probs = xp.asarray([0.0 for _ in candidates]) scores = prior_probs # add in similarity from the context if self.incl_context: entity_encodings = xp.asarray( [c.entity_vector for c in candidates] ) - entity_norm = xp.linalg.norm( - entity_encodings, axis=1 - ) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) if len(entity_encodings) != len(prior_probs): raise RuntimeError( Errors.E147.format( @@ -359,14 +353,12 @@ class EntityLinker(TrainablePipe): ) ) # cosine similarity - sims = xp.dot( - entity_encodings, sentence_encoding_t - ) / (sentence_norm * entity_norm) + sims = xp.dot(entity_encodings, sentence_encoding_t) / ( + sentence_norm * entity_norm + ) if sims.shape != prior_probs.shape: raise ValueError(Errors.E161) - scores = ( - prior_probs + sims - (prior_probs * sims) - ) + scores = prior_probs + sims - (prior_probs * sims) # TODO: thresholding best_index = scores.argmax().item() best_candidate = candidates[best_index] @@ -408,6 +400,48 @@ class EntityLinker(TrainablePipe): validate_examples(examples, "EntityLinker.score") return Scorer.score_links(examples, negative_labels=[self.NIL]) + def to_bytes(self, *, exclude=tuple()): + """Serialize the pipe to a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. 
+ RETURNS (bytes): The serialized object. + + DOCS: https://spacy.io/api/entitylinker#to_bytes + """ + self._validate_serialization_attrs() + serialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["vocab"] = self.vocab.to_bytes + serialize["kb"] = self.kb.to_bytes + serialize["model"] = self.model.to_bytes + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, *, exclude=tuple()): + """Load the pipe from a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (TrainablePipe): The loaded object. + + DOCS: https://spacy.io/api/entitylinker#from_bytes + """ + self._validate_serialization_attrs() + + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) from None + + deserialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize["kb"] = lambda b: self.kb.from_bytes(b) + deserialize["model"] = load_model + util.from_bytes(bytes_data, deserialize, exclude) + return self + def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 78269f180..1dea8fba0 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -141,7 +141,9 @@ class EntityRuler(Pipe): def match(self, doc: Doc): self._require_patterns() - matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="\\[W036") + matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = set( [(m_id, start, end) for m_id, start, end in matches if start != end] ) @@ -275,9 +277,7 @@ class EntityRuler(Pipe): if self == pipe: current_index = i break - subsequent_pipes = [ - pipe for pipe in self.nlp.pipe_names[current_index + 1 :] - ] + subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]] except ValueError: subsequent_pipes = [] with self.nlp.select_pipes(disable=subsequent_pipes): @@ -298,7 +298,7 @@ class EntityRuler(Pipe): self.nlp.pipe(phrase_pattern_texts), phrase_pattern_ids, ): - phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id} + phrase_pattern = {"label": label, "pattern": pattern} if ent_id: phrase_pattern["id"] = ent_id phrase_patterns.append(phrase_pattern) diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 0b9b0d324..f4ae4b787 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -3,6 +3,7 @@ from collections import defaultdict from typing import Optional, Iterable from thinc.api import Model, Config +from ._parser_internals.transition_system import TransitionSystem from .transition_parser cimport Parser from ._parser_internals.ner cimport BiluoPushDown @@ -40,6 +41,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "moves": None, "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, + "incorrect_spans_key": None }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, @@ -48,8 +50,9 @@ def make_ner( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, + incorrect_spans_key: Optional[str]=None ): """Create a transition-based 
EntityRecognizer component. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -67,13 +70,16 @@ def make_ner( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (list[str]): A list of transition names. Inferred from the data if not - provided. - update_with_oracle_cut_size (int): - During training, cut long sequences into shorter segments by creating - intermediate states based on the gold-standard history. The model is - not very sensitive to this parameter, so you usually won't need to change - it. 100 is a good default. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. + incorrect_spans_key (Optional[str]): Identifies spans that are known + to be incorrect entity annotations. The incorrect entity annotations + can be stored in the span group, under this key. """ return EntityRecognizer( nlp.vocab, @@ -81,9 +87,8 @@ def make_ner( name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, + incorrect_spans_key=incorrect_spans_key, multitasks=[], - min_action_freq=1, - learn_tokens=False, beam_width=1, beam_density=0.0, beam_update_prob=0.0, @@ -98,7 +103,8 @@ def make_ner( "model": DEFAULT_NER_MODEL, "beam_density": 0.01, "beam_update_prob": 0.5, - "beam_width": 32 + "beam_width": 32, + "incorrect_spans_key": None }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, ) @@ -106,11 +112,12 @@ def make_beam_ner( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, beam_width: int, beam_density: float, beam_update_prob: float, + incorrect_spans_key: Optional[str]=None ): """Create a transition-based EntityRecognizer component that uses beam-search. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -128,13 +135,13 @@ def make_beam_ner( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (list[str]): A list of transition names. Inferred from the data if not - provided. - update_with_oracle_cut_size (int): - During training, cut long sequences into shorter segments by creating - intermediate states based on the gold-standard history. The model is - not very sensitive to this parameter, so you usually won't need to change - it. 100 is a good default. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. 
beam_width (int): The number of candidate analyses to maintain. beam_density (float): The minimum ratio between the scores of the first and last candidates in the beam. This allows the parser to avoid exploring @@ -144,6 +151,8 @@ def make_beam_ner( beam_update_prob (float): The chance of making a beam update, instead of a greedy update. Greedy updates are an approximation for the beam updates, and are faster to compute. + incorrect_spans_key (Optional[str]): Optional key into span groups of + entities known to be non-entities. """ return EntityRecognizer( nlp.vocab, @@ -152,11 +161,10 @@ def make_beam_ner( moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, multitasks=[], - min_action_freq=1, - learn_tokens=False, beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, + incorrect_spans_key=incorrect_spans_key ) @@ -167,6 +175,37 @@ cdef class EntityRecognizer(Parser): """ TransitionSystem = BiluoPushDown + def __init__( + self, + vocab, + model, + name="ner", + moves=None, + *, + update_with_oracle_cut_size=100, + beam_width=1, + beam_density=0.0, + beam_update_prob=0.0, + multitasks=tuple(), + incorrect_spans_key=None, + ): + """Create an EntityRecognizer. + """ + super().__init__( + vocab, + model, + name, + moves, + update_with_oracle_cut_size=update_with_oracle_cut_size, + min_action_freq=1, # not relevant for NER + learn_tokens=False, # not relevant for NER + beam_width=beam_width, + beam_density=beam_density, + beam_update_prob=beam_update_prob, + multitasks=multitasks, + incorrect_spans_key=incorrect_spans_key, + ) + def add_multitask_objective(self, mt_component): """Register another component as a multi-task objective. Experimental.""" self._multitasks.append(mt_component) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py new file mode 100644 index 000000000..8d1be06c3 --- /dev/null +++ b/spacy/pipeline/spancat.py @@ -0,0 +1,423 @@ +import numpy +from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any +from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops +from thinc.api import Optimizer +from thinc.types import Ragged, Ints2d, Floats2d + +from ..scorer import Scorer +from ..language import Language +from .trainable_pipe import TrainablePipe +from ..tokens import Doc, SpanGroup, Span +from ..vocab import Vocab +from ..training import Example, validate_examples +from ..errors import Errors +from ..util import registry + + +spancat_default_config = """ +[model] +@architectures = "spacy.SpanCategorizer.v1" +scorer = {"@layers": "spacy.LinearLogistic.v1"} + +[model.reducer] +@layers = spacy.mean_max_reducer.v1 +hidden_size = 128 + +[model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 96 +rows = [5000, 2000, 1000, 1000] +attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"] +include_static_vectors = false + +[model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${model.tok2vec.embed.width} +window_size = 1 +maxout_pieces = 3 +depth = 4 +""" + +DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"] + + +@registry.misc("spacy.ngram_suggester.v1") +def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]: + """Suggest all spans of the given lengths. Spans are returned as a ragged + array of integers. 
The array has two columns, indicating the start and end + position.""" + + def ngram_suggester(docs: List[Doc], *, ops: Optional[Ops] = None) -> Ragged: + if ops is None: + ops = get_current_ops() + spans = [] + lengths = [] + for doc in docs: + starts = ops.xp.arange(len(doc), dtype="i") + starts = starts.reshape((-1, 1)) + length = 0 + for size in sizes: + if size <= len(doc): + starts_size = starts[: len(doc) - (size - 1)] + spans.append(ops.xp.hstack((starts_size, starts_size + size))) + length += spans[-1].shape[0] + if spans: + assert spans[-1].ndim == 2, spans[-1].shape + lengths.append(length) + if len(spans) > 0: + output = Ragged(ops.xp.vstack(spans), ops.asarray(lengths, dtype="i")) + else: + output = Ragged(ops.xp.zeros((0, 0)), ops.asarray(lengths, dtype="i")) + + assert output.dataXd.ndim == 2 + return output + + return ngram_suggester + + +@registry.misc("spacy.ngram_range_suggester.v1") +def build_ngram_range_suggester( + min_size: int, max_size: int +) -> Callable[[List[Doc]], Ragged]: + """Suggest all spans of the given lengths between a given min and max value - both inclusive. + Spans are returned as a ragged array of integers. The array has two columns, + indicating the start and end position.""" + sizes = range(min_size, max_size + 1) + return build_ngram_suggester(sizes) + + +@Language.factory( + "spancat", + assigns=["doc.spans"], + default_config={ + "threshold": 0.5, + "spans_key": "sc", + "max_positive": None, + "model": DEFAULT_SPANCAT_MODEL, + "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + }, + default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, +) +def make_spancat( + nlp: Language, + name: str, + suggester: Callable[[List[Doc]], Ragged], + model: Model[Tuple[List[Doc], Ragged], Floats2d], + spans_key: str, + threshold: float = 0.5, + max_positive: Optional[int] = None, +) -> "SpanCategorizer": + """Create a SpanCategorizer component. The span categorizer consists of two + parts: a suggester function that proposes candidate spans, and a labeller + model that predicts one or more labels for each span. + + suggester (Callable[List[Doc], Ragged]): A function that suggests spans. + Spans are returned as a ragged array with two integer columns, for the + start and end positions. + model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that + is given a list of documents and (start, end) indices representing + candidate span offsets. The model predicts a probability for each category + for each span. + spans_key (str): Key of the doc.spans dict to save the spans under. During + initialization and training, the component will look for spans on the + reference document under the same key. + threshold (float): Minimum probability to consider a prediction positive. + Spans with a positive prediction will be saved on the Doc. Defaults to + 0.5. + max_positive (Optional[int]): Maximum number of labels to consider positive + per span. Defaults to None, indicating no limit. + """ + return SpanCategorizer( + nlp.vocab, + suggester=suggester, + model=model, + spans_key=spans_key, + threshold=threshold, + max_positive=max_positive, + name=name, + ) + + +class SpanCategorizer(TrainablePipe): + """Pipeline component to label spans of text. 
+ + DOCS: https://spacy.io/api/spancategorizer + """ + + def __init__( + self, + vocab: Vocab, + model: Model[Tuple[List[Doc], Ragged], Floats2d], + suggester: Callable[[List[Doc]], Ragged], + name: str = "spancat", + *, + spans_key: str = "spans", + threshold: float = 0.5, + max_positive: Optional[int] = None, + ) -> None: + """Initialize the span categorizer. + + DOCS: https://spacy.io/api/spancategorizer#init + """ + self.cfg = { + "labels": [], + "spans_key": spans_key, + "threshold": threshold, + "max_positive": max_positive, + } + self.vocab = vocab + self.suggester = suggester + self.model = model + self.name = name + + @property + def key(self) -> str: + """Key of the doc.spans dict to save the spans under. During + initialization and training, the component will look for spans on the + reference document under the same key. + """ + return self.cfg["spans_key"] + + def add_label(self, label: str) -> int: + """Add a new label to the pipe. + + label (str): The label to add. + RETURNS (int): 0 if label is already present, otherwise 1. + + DOCS: https://spacy.io/api/spancategorizer#add_label + """ + if not isinstance(label, str): + raise ValueError(Errors.E187) + if label in self.labels: + return 0 + self._allow_extra_label() + self.cfg["labels"].append(label) + self.vocab.strings.add(label) + return 1 + + @property + def labels(self) -> Tuple[str]: + """RETURNS (Tuple[str]): The labels currently added to the component. + + DOCS: https://spacy.io/api/spancategorizer#labels + """ + return tuple(self.cfg["labels"]) + + @property + def label_data(self) -> List[str]: + """RETURNS (List[str]): Information about the component's labels. + + DOCS: https://spacy.io/api/spancategorizer#label_data + """ + return list(self.labels) + + def predict(self, docs: Iterable[Doc]): + """Apply the pipeline's model to a batch of docs, without modifying them. + + docs (Iterable[Doc]): The documents to predict. + RETURNS: The models prediction for each document. + + DOCS: https://spacy.io/api/spancategorizer#predict + """ + indices = self.suggester(docs, ops=self.model.ops) + scores = self.model.predict((docs, indices)) + return (indices, scores) + + def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: + """Modify a batch of Doc objects, using pre-computed scores. + + docs (Iterable[Doc]): The documents to modify. + scores: The scores to set, produced by SpanCategorizer.predict. + + DOCS: https://spacy.io/api/spancategorizer#set_annotations + """ + labels = self.labels + indices, scores = indices_scores + offset = 0 + for i, doc in enumerate(docs): + indices_i = indices[i].dataXd + doc.spans[self.key] = self._make_span_group( + doc, indices_i, scores[offset : offset + indices.lengths[i]], labels + ) + offset += indices.lengths[i] + + def update( + self, + examples: Iterable[Example], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ) -> Dict[str, float]: + """Learn from a batch of documents and gold-standard information, + updating the pipe's model. Delegates to predict and get_loss. + + examples (Iterable[Example]): A batch of Example objects. + drop (float): The dropout rate. + sgd (thinc.api.Optimizer): The optimizer. + losses (Dict[str, float]): Optional record of the loss during training. + Updated using the component name as the key. + RETURNS (Dict[str, float]): The updated losses dictionary. 
+ + DOCS: https://spacy.io/api/spancategorizer#update + """ + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + validate_examples(examples, "SpanCategorizer.update") + self._validate_categories(examples) + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): + # Handle cases where there are no tokens in any docs. + return losses + docs = [eg.predicted for eg in examples] + spans = self.suggester(docs, ops=self.model.ops) + if spans.lengths.sum() == 0: + return losses + set_dropout_rate(self.model, drop) + scores, backprop_scores = self.model.begin_update((docs, spans)) + loss, d_scores = self.get_loss(examples, (spans, scores)) + backprop_scores(d_scores) + if sgd is not None: + self.finish_update(sgd) + losses[self.name] += loss + return losses + + def get_loss( + self, examples: Iterable[Example], spans_scores: Tuple[Ragged, Ragged] + ) -> Tuple[float, float]: + """Find the loss and gradient of loss for the batch of documents and + their predicted scores. + + examples (Iterable[Examples]): The batch of examples. + spans_scores: Scores representing the model's predictions. + RETURNS (Tuple[float, float]): The loss and the gradient. + + DOCS: https://spacy.io/api/spancategorizer#get_loss + """ + spans, scores = spans_scores + spans = Ragged( + self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths) + ) + label_map = {label: i for i, label in enumerate(self.labels)} + target = numpy.zeros(scores.shape, dtype=scores.dtype) + offset = 0 + for i, eg in enumerate(examples): + # Map (start, end) offset of spans to the row in the d_scores array, + # so that we can adjust the gradient for predictions that were + # in the gold standard. + spans_index = {} + spans_i = spans[i].dataXd + for j in range(spans.lengths[i]): + start = int(spans_i[j, 0]) + end = int(spans_i[j, 1]) + spans_index[(start, end)] = offset + j + for gold_span in self._get_aligned_spans(eg): + key = (gold_span.start, gold_span.end) + if key in spans_index: + row = spans_index[key] + k = label_map[gold_span.label_] + target[row, k] = 1.0 + # The target is a flat array for all docs. Track the position + # we're at within the flat array. + offset += spans.lengths[i] + target = self.model.ops.asarray(target, dtype="f") + # The target will have the values 0 (for untrue predictions) or 1 + # (for true predictions). + # The scores should be in the range [0, 1]. + # If the prediction is 0.9 and it's true, the gradient + # will be -0.1 (0.9 - 1.0). + # If the prediction is 0.9 and it's false, the gradient will be + # 0.9 (0.9 - 0.0) + d_scores = scores - target + loss = float((d_scores ** 2).sum()) + return loss, d_scores + + def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Language = None, + labels: Optional[List[str]] = None, + ) -> None: + """Initialize the pipe for training, using a representative set + of data examples. + + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. + nlp (Language): The current nlp object the component is part of. + labels: The labels to add to the component, typically generated by the + `init labels` command. If no labels are provided, the get_examples + callback is used to extract the labels from the data. 
+ + DOCS: https://spacy.io/api/spancategorizer#initialize + """ + subbatch = [] + if labels is not None: + for label in labels: + self.add_label(label) + for eg in get_examples(): + if labels is None: + for span in eg.reference.spans.get(self.key, []): + self.add_label(span.label_) + if len(subbatch) < 10: + subbatch.append(eg) + self._require_labels() + if subbatch: + docs = [eg.x for eg in subbatch] + spans = self.suggester(docs) + Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels)) + self.model.initialize(X=(docs, spans), Y=Y) + else: + self.model.initialize() + + def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + """Score a batch of examples. + + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. + + DOCS: https://spacy.io/api/spancategorizer#score + """ + validate_examples(examples, "SpanCategorizer.score") + self._validate_categories(examples) + kwargs = dict(kwargs) + attr_prefix = "spans_" + kwargs.setdefault("attr", f"{attr_prefix}{self.key}") + kwargs.setdefault("labels", self.labels) + kwargs.setdefault("multi_label", True) + kwargs.setdefault("threshold", self.cfg["threshold"]) + kwargs.setdefault( + "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], []) + ) + kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans) + return Scorer.score_spans(examples, **kwargs) + + def _validate_categories(self, examples): + # TODO + pass + + def _get_aligned_spans(self, eg: Example): + return eg.get_aligned_spans_y2x(eg.reference.spans.get(self.key, [])) + + def _make_span_group( + self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str] + ) -> SpanGroup: + spans = SpanGroup(doc, name=self.key) + max_positive = self.cfg["max_positive"] + threshold = self.cfg["threshold"] + for i in range(indices.shape[0]): + start = int(indices[i, 0]) + end = int(indices[i, 1]) + positives = [] + for j, score in enumerate(scores[i]): + if score >= threshold: + positives.append((score, start, end, labels[j])) + positives.sort(reverse=True) + if max_positive: + positives = positives[:max_positive] + for score, start, end, label in positives: + spans.append(Span(doc, start, end, label=label)) + return spans diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 938131f6f..fa260bdd6 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -222,7 +222,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#get_loss """ validate_examples(examples, "Tagger.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix="!") # Convert empty tag "" to missing value None so that both misaligned # tokens and tokens with missing annotation have the default missing # value None. 
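Before the textcat changes below, a few notes on the pipeline code added above. The EntityLinker refactor earlier in this diff keeps the candidate score as `prior_probs + sims - (prior_probs * sims)`, i.e. a probabilistic OR of the KB prior and the context similarity. A standalone NumPy check with made-up numbers (not spaCy API) shows how the two signals combine:

```python
import numpy as np

prior_probs = np.asarray([0.7, 0.1, 0.0])  # KB prior per candidate
sims = np.asarray([0.2, 0.9, 0.5])         # cosine similarity to the sentence encoding

# Probabilistic OR: high if either the prior or the context similarity is high.
scores = prior_probs + sims - (prior_probs * sims)
print(scores)                # [0.76 0.91 0.5 ]
print(int(scores.argmax()))  # candidate 1 wins on context despite a weak prior
```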
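For the `spacy.ngram_suggester.v1` function added above, suggested spans come back as a ragged array whose rows are `(start, end)` token offsets. The following standalone sketch uses plain NumPy rather than the registered function (so no Doc or Ops objects are needed) and mirrors the same enumeration, just to make the two-column layout concrete:

```python
import numpy

def ngram_offsets(doc_length, sizes):
    """Enumerate (start, end) offsets for all n-grams of the given sizes,
    mirroring the layout produced by the suggester above."""
    rows = []
    starts = numpy.arange(doc_length).reshape((-1, 1))
    for size in sizes:
        if size <= doc_length:
            starts_size = starts[: doc_length - (size - 1)]
            rows.append(numpy.hstack((starts_size, starts_size + size)))
    return numpy.vstack(rows) if rows else numpy.zeros((0, 2), dtype="i")

# A 4-token doc with sizes [1, 2] yields 4 unigrams and 3 bigrams, 7 rows in total.
print(ngram_offsets(4, [1, 2]))
```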
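Assuming a spaCy build that ships the `spancat` factory introduced above (3.1+), wiring it into a pipeline follows the usual `add_pipe` pattern. This is only a rough sketch with a made-up label, relying on the factory defaults shown above (ngram suggester with sizes [1, 2, 3], threshold 0.5, spans stored under `doc.spans["sc"]`):

```python
import spacy

nlp = spacy.blank("en")
spancat = nlp.add_pipe("spancat")  # uses the default suggester and spans_key "sc"
spancat.add_label("FRUIT")         # hypothetical label, just for illustration
nlp.initialize()

doc = nlp("I like ginger and apples")
print(doc.spans["sc"])             # untrained weights, so typically empty or noisy
```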
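Finally, the gradient bookkeeping in `SpanCategorizer.get_loss` above reduces to `d_scores = scores - target` with a squared-error loss, which is easy to verify by hand. A minimal NumPy check with made-up scores for two spans and two labels:

```python
import numpy

scores = numpy.asarray([[0.9, 0.2],
                        [0.1, 0.8]], dtype="f")  # model output per (span, label)
target = numpy.asarray([[1.0, 0.0],
                        [0.0, 0.0]], dtype="f")  # gold: only span 0 carries label 0

d_scores = scores - target   # -0.1 for the confident true hit, +0.8 for the false positive
loss = float((d_scores ** 2).sum())
print(d_scores)
print(round(loss, 2))        # 0.01 + 0.04 + 0.01 + 0.64 = 0.70
```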
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 1d652a483..0dde5de82 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -35,7 +35,7 @@ maxout_pieces = 3 depth = 2 [model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -44,7 +44,7 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m single_label_bow_config = """ [model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -52,7 +52,7 @@ no_output_layer = false single_label_cnn_config = """ [model] -@architectures = "spacy.TextCatCNN.v1" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = true [model.tok2vec] @@ -298,6 +298,10 @@ class TextCategorizer(TrainablePipe): return 0 self._allow_extra_label() self.cfg["labels"].append(label) + if self.model and "resize_output" in self.model.attrs: + self.model = self.model.attrs["resize_output"]( + self.model, len(self.cfg["labels"]) + ) self.vocab.strings.add(label) return 1 @@ -332,6 +336,8 @@ class TextCategorizer(TrainablePipe): else: for label in labels: self.add_label(label) + if len(self.labels) < 2: + raise ValueError(Errors.E867) if positive_label is not None: if positive_label not in self.labels: err = Errors.E920.format(pos_label=positive_label, labels=self.labels) diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 7267735b4..ba36881af 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -35,7 +35,7 @@ maxout_pieces = 3 depth = 2 [model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -44,7 +44,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod multi_label_bow_config = """ [model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = false ngram_size = 1 no_output_layer = false @@ -52,7 +52,7 @@ no_output_layer = false multi_label_cnn_config = """ [model] -@architectures = "spacy.TextCatCNN.v1" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = false [model.tok2vec] diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 3ee324d50..00d9548a4 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -173,6 +173,7 @@ class Tok2Vec(TrainablePipe): for i in range(len(one_d_tokvecs)): d_tokvecs[i] += one_d_tokvecs[i] losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) + return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] def backprop(one_d_tokvecs): """Callback to actually do the backprop. 
Passed to last listener.""" diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index fe51f38e5..ce1e133a2 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -101,7 +101,8 @@ cdef class TrainablePipe(Pipe): def update(self, examples: Iterable["Example"], - *, drop: float=0.0, + *, + drop: float=0.0, sgd: Optimizer=None, losses: Optional[Dict[str, float]]=None) -> Dict[str, float]: """Learn from a batch of documents and gold-standard information, @@ -213,7 +214,12 @@ cdef class TrainablePipe(Pipe): def _allow_extra_label(self) -> None: """Raise an error if the component can not add any more labels.""" - if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels): + nO = None + if self.model.has_dim("nO"): + nO = self.model.get_dim("nO") + elif self.model.has_ref("output_layer") and self.model.get_ref("output_layer").has_dim("nO"): + nO = self.model.get_ref("output_layer").get_dim("nO") + if nO is not None and nO == len(self.labels): if not self.is_resizable: raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 4de57d311..a495b1bc7 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -29,6 +29,7 @@ from ..training import validate_examples, validate_get_examples from ..errors import Errors, Warnings from .. import util + cdef class Parser(TrainablePipe): """ Base class of the DependencyParser and EntityRecognizer. @@ -48,15 +49,43 @@ cdef class Parser(TrainablePipe): beam_density=0.0, beam_update_prob=0.0, multitasks=tuple(), + incorrect_spans_key=None ): """Create a Parser. vocab (Vocab): The vocabulary object. Must be shared with documents to be processed. The value is set to the `.vocab` attribute. - **cfg: Configuration parameters. Set to the `.cfg` attribute. - If it doesn't include a value for 'moves', a new instance is - created with `self.TransitionSystem()`. This defines how the - parse-state is created, updated and evaluated. + model (Model): The model for the transition-based parser. The model needs + to have a specific substructure of named components --- see the + spacy.ml.tb_framework.TransitionModel for details. + name (str): The name of the pipeline component + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. + min_action_freq (int): The minimum frequency of labelled actions to retain. + Rarer labelled actions have their label backed-off to "dep". While this + primarily affects the label accuracy, it can also affect the attachment + structure, as the labels are used to represent the pseudo-projectivity + transformation. + learn_tokens (bool): Whether to learn to merge subtokens that are split + relative to the gold standard. Experimental. + beam_width (int): The number of candidate analyses to maintain. + beam_density (float): The minimum ratio between the scores of the first and + last candidates in the beam. This allows the parser to avoid exploring + candidates that are too far behind. 
This is mostly intended to improve + efficiency, but it can also improve accuracy as deeper search is not + always better. + beam_update_prob (float): The chance of making a beam update, instead of a + greedy update. Greedy updates are an approximation for the beam updates, + and are faster to compute. + multitasks: additional multi-tasking components. Experimental. + incorrect_spans_key (Optional[str]): Identifies spans that are known + to be incorrect entity annotations. The incorrect entity annotations + can be stored in the span group, under this key. """ self.vocab = vocab self.name = name @@ -68,11 +97,16 @@ cdef class Parser(TrainablePipe): "learn_tokens": learn_tokens, "beam_width": beam_width, "beam_density": beam_density, - "beam_update_prob": beam_update_prob + "beam_update_prob": beam_update_prob, + "incorrect_spans_key": incorrect_spans_key } if moves is None: - # defined by EntityRecognizer as a BiluoPushDown - moves = self.TransitionSystem(self.vocab.strings) + # EntityRecognizer -> BiluoPushDown + # DependencyParser -> ArcEager + moves = self.TransitionSystem( + self.vocab.strings, + incorrect_spans_key=incorrect_spans_key + ) self.moves = moves self.model = model if self.moves.n_moves != 0: @@ -118,6 +152,10 @@ cdef class Parser(TrainablePipe): # Available for subclasses, e.g. to deprojectivize return [] + @property + def incorrect_spans_key(self): + return self.cfg["incorrect_spans_key"] + def add_label(self, label): resized = False for action in self.moves.action_types: @@ -326,7 +364,6 @@ cdef class Parser(TrainablePipe): ) for multitask in self._multitasks: multitask.update(examples, drop=drop, sgd=sgd) - n_examples = len([eg for eg in examples if self.moves.has_gold(eg)]) if n_examples == 0: return losses @@ -554,7 +591,7 @@ cdef class Parser(TrainablePipe): self._resize() self.model.from_bytes(bytes_data) except AttributeError: - raise ValueError(Errors.E149) from None + raise ValueError(Errors.E149) return self def to_bytes(self, exclude=tuple()): diff --git a/spacy/schemas.py b/spacy/schemas.py index 92315399d..992e17d70 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -4,7 +4,7 @@ from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool from pydantic.main import ModelMetaclass -from thinc.api import Optimizer, ConfigValidationError +from thinc.api import Optimizer, ConfigValidationError, Model from thinc.config import Promise from collections import defaultdict import inspect @@ -17,6 +17,7 @@ if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from .language import Language # noqa: F401 from .training import Example # noqa: F401 + from .vocab import Vocab # noqa: F401 # fmt: off @@ -354,7 +355,7 @@ class ConfigSchemaPretrain(BaseModel): batcher: Batcher = Field(..., title="Batcher for the training data") component: str = Field(..., title="Component to find the layer to pretrain") layer: str = Field(..., title="Layer to pretrain. 
Whole model if empty.") - objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.") + objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.") # fmt: on class Config: diff --git a/spacy/scorer.py b/spacy/scorer.py index 25df44f14..f4ccb2269 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -365,7 +365,9 @@ class Scorer: gold_spans.add(gold_span) gold_per_type[span.label_].add(gold_span) pred_per_type = {label: set() for label in labels} - for span in example.get_aligned_spans_x2y(getter(pred_doc, attr), allow_overlap): + for span in example.get_aligned_spans_x2y( + getter(pred_doc, attr), allow_overlap + ): if labeled: pred_span = (span.label_, span.start, span.end - 1) else: @@ -381,10 +383,10 @@ class Scorer: score.score_set(pred_spans, gold_spans) # Assemble final result final_scores = { - f"{attr}_p": None, - f"{attr}_r": None, - f"{attr}_f": None, - } + f"{attr}_p": None, + f"{attr}_r": None, + f"{attr}_f": None, + } if labeled: final_scores[f"{attr}_per_type"] = None if len(score) > 0: @@ -392,7 +394,9 @@ class Scorer: final_scores[f"{attr}_r"] = score.recall final_scores[f"{attr}_f"] = score.fscore if labeled: - final_scores[f"{attr}_per_type"] = {k: v.to_dict() for k, v in score_per_type.items()} + final_scores[f"{attr}_per_type"] = { + k: v.to_dict() for k, v in score_per_type.items() + } return final_scores @staticmethod diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 404783197..a5dedcc87 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -125,6 +125,11 @@ def ga_tokenizer(): return get_lang_class("ga")().tokenizer +@pytest.fixture(scope="session") +def grc_tokenizer(): + return get_lang_class("grc")().tokenizer + + @pytest.fixture(scope="session") def gu_tokenizer(): return get_lang_class("gu")().tokenizer @@ -202,6 +207,11 @@ def ne_tokenizer(): return get_lang_class("ne")().tokenizer +@pytest.fixture(scope="session") +def nl_vocab(): + return get_lang_class("nl")().vocab + + @pytest.fixture(scope="session") def nl_tokenizer(): return get_lang_class("nl")().tokenizer @@ -281,6 +291,13 @@ def uk_tokenizer(): return get_lang_class("uk")().tokenizer +@pytest.fixture +def uk_lemmatizer(): + pytest.importorskip("pymorphy2") + pytest.importorskip("pymorphy2_dicts_uk") + return get_lang_class("uk")().add_pipe("lemmatizer") + + @pytest.fixture(scope="session") def ur_tokenizer(): return get_lang_class("ur")().tokenizer diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index fa0206fdd..231b7c2a8 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -18,14 +18,9 @@ def _ner_example(ner): def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = Doc(en_vocab, words=text) - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(en_vocab, model, **config) + ner = EntityRecognizer(en_vocab, model) ner.initialize(lambda: [_ner_example(ner)]) ner(doc) @@ -40,14 +35,9 @@ def test_ents_reset(en_vocab): """Ensure that resetting doc.ents does not change anything""" text = ["This", "is", "a", "lion"] doc = Doc(en_vocab, words=text) - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": 
DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(en_vocab, model, **config) + ner = EntityRecognizer(en_vocab, model) ner.initialize(lambda: [_ner_example(ner)]) ner(doc) orig_iobs = [t.ent_iob_ for t in doc] diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 0dc6c4866..6989b965f 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -63,3 +63,10 @@ def test_create_from_words_and_text(vocab): words = [" ", " ", "'", "dogs", "'", "\n\n", "run"] text = " 'dogs'\n\nrun " (words, spaces) = util.get_words_and_spaces(words + ["away"], text) + + +def test_create_with_heads_and_no_deps(vocab): + words = "I like ginger".split() + heads = list(range(len(words))) + with pytest.raises(ValueError): + Doc(vocab, words=words, heads=heads) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 358724517..57df87642 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -346,17 +346,25 @@ def test_doc_from_array_morph(en_vocab): @pytest.mark.usefixtures("clean_underscore") def test_doc_api_from_docs(en_tokenizer, de_tokenizer): - en_texts = ["Merging the docs is fun.", "", "They don't think alike."] + en_texts = [ + "Merging the docs is fun.", + "", + "They don't think alike. ", + "Another doc.", + ] en_texts_without_empty = [t for t in en_texts if len(t)] de_text = "Wie war die Frage?" en_docs = [en_tokenizer(text) for text in en_texts] en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]] - span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text]) + en_docs[3].spans["group"] = [en_docs[3][0:1]] + span_group_texts = sorted( + [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] + ) de_doc = de_tokenizer(de_text) Token.set_extension("is_ambiguous", default=False) - en_docs[0][2]._.is_ambiguous = True # docs - en_docs[2][3]._.is_ambiguous = True # think + en_docs[0][2]._.is_ambiguous = True # docs + en_docs[2][3]._.is_ambiguous = True # think assert Doc.from_docs([]) is None assert de_doc is not Doc.from_docs([de_doc]) assert str(de_doc) == str(Doc.from_docs([de_doc])) @@ -366,24 +374,25 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): m_doc = Doc.from_docs(en_docs) assert len(en_texts_without_empty) == len(list(m_doc.sents)) - assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) - assert str(m_doc) == " ".join(en_texts_without_empty) + assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1]) + assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty]) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." 
and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think") - assert m_doc[2]._.is_ambiguous == True + assert m_doc[2]._.is_ambiguous is True assert m_doc[9].idx == think_idx - assert m_doc[9]._.is_ambiguous == True + assert m_doc[9]._.is_ambiguous is True assert not any([t._.is_ambiguous for t in m_doc[3:8]]) assert "group" in m_doc.spans assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) + assert bool(m_doc[11].whitespace_) m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) assert len(en_texts_without_empty) == len(list(m_doc.sents)) - assert len(str(m_doc)) == sum(len(t) for t in en_texts) - assert str(m_doc) == "".join(en_texts) + assert len(m_doc.text) == sum(len(t) for t in en_texts) + assert m_doc.text == "".join(en_texts_without_empty) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and not bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] @@ -392,11 +401,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert m_doc[9].idx == think_idx assert "group" in m_doc.spans assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) + assert bool(m_doc[11].whitespace_) m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) - assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) + assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1]) # space delimiter considered, although spacy attribute was missing - assert str(m_doc) == " ".join(en_texts_without_empty) + assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty]) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] @@ -406,6 +416,19 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert "group" in m_doc.spans assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) + # can merge empty docs + doc = Doc.from_docs([en_tokenizer("")] * 10) + + # empty but set spans keys are preserved + en_docs = [en_tokenizer(text) for text in en_texts] + m_doc = Doc.from_docs(en_docs) + assert "group" not in m_doc.spans + for doc in en_docs: + doc.spans["group"] = [] + m_doc = Doc.from_docs(en_docs) + assert "group" in m_doc.spans + assert len(m_doc.spans["group"]) == 0 + def test_doc_api_from_docs_ents(en_tokenizer): texts = ["Merging the docs is fun.", "They don't think alike."] diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 36fa3c15d..20c302da1 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -108,9 +108,12 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab): words = ["The", "players", "start", "."] lemmas = [t.lower() for t in words] heads = [1, 2, 2, 2] + deps = ["dep"] * len(heads) tags = ["DT", "NN", "VBZ", "."] pos = ["DET", "NOUN", "VERB", "PUNCT"] - doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas) + doc = Doc( + en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas + ) assert len(doc) == 4 assert doc[0].text == "The" assert doc[0].tag_ == "DT" @@ -123,7 +126,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab): assert doc[0].tag_ == "NN" assert doc[0].pos_ == "NOUN" assert doc[0].lemma_ == "the players" - doc = Doc(en_vocab, words=words, tags=tags, pos=pos, 
heads=heads, lemmas=lemmas) + doc = Doc( + en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas + ) assert len(doc) == 4 assert doc[0].text == "The" assert doc[0].tag_ == "DT" @@ -190,8 +195,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer): text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript." heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) with doc.retokenize() as retokenizer: for ent in doc.ents: attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_} @@ -199,8 +205,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer): text = "One test with entities like New York City so the ents list is not void" heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) with doc.retokenize() as retokenizer: for ent in doc.ents: retokenizer.merge(ent) @@ -210,6 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n" heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15] + deps = ["dep"] * len(heads) tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"] ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)] ents = ["O"] * len(heads) @@ -221,7 +229,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer): # fmt: on tokens = en_tokenizer(text) doc = Doc( - tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents + tokens.vocab, + words=[t.text for t in tokens], + heads=heads, + deps=deps, + tags=tags, + ents=ents, ) assert len(doc) == 17 with doc.retokenize() as retokenizer: @@ -471,7 +484,7 @@ def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer): assert len(list(doc.sents)) == 2 with doc.retokenize() as retokenizer: retokenizer.merge(doc[3:6]) - assert doc[3].is_sent_start == None + assert doc[3].is_sent_start is None # merging over a sentence boundary and setting sent_start doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts) diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 6bfd508bc..16df1713d 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -44,7 +44,8 @@ def test_doc_retokenize_split_lemmas(en_vocab): # If lemmas are not set, leave unset words = ["LosAngeles", "start", "."] heads = [1, 2, 2] - doc = Doc(en_vocab, words=words, heads=heads) + deps = ["dep"] * len(heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) with doc.retokenize() as retokenizer: retokenizer.split( doc[0], @@ -57,7 +58,8 @@ def test_doc_retokenize_split_lemmas(en_vocab): # If lemmas are set, use split orth as default lemma words = ["LosAngeles", "start", "."] heads = [1, 2, 2] - doc = Doc(en_vocab, words=words, heads=heads) + deps = ["dep"] * len(heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) for t in doc: t.lemma_ = "a" with doc.retokenize() as retokenizer: diff --git 
a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 6a5689971..6e34f2126 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -1,4 +1,6 @@ import pytest +import numpy +from numpy.testing import assert_array_equal from spacy.attrs import ORTH, LENGTH from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab @@ -14,9 +16,21 @@ def doc(en_tokenizer): heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12] deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", "attr", "punct", "ROOT", "det", "npadvmod", "punct"] + ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O", + "O", "O", "O", "O", "O"] # fmt: on tokens = en_tokenizer(text) - return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + lemmas = [t.text for t in tokens] # this is not correct, just a placeholder + spaces = [bool(t.whitespace_) for t in tokens] + return Doc( + tokens.vocab, + words=[t.text for t in tokens], + spaces=spaces, + heads=heads, + deps=deps, + ents=ents, + lemmas=lemmas, + ) @pytest.fixture @@ -80,7 +94,7 @@ def test_spans_span_sent(doc, doc_not_parsed): """Test span.sent property""" assert len(list(doc.sents)) assert doc[:2].sent.root.text == "is" - assert doc[:2].sent.text == "This is a sentence ." + assert doc[:2].sent.text == "This is a sentence." assert doc[6:7].sent.root.left_edge.text == "This" # test on manual sbd doc_not_parsed[0].is_sent_start = True @@ -118,6 +132,17 @@ def test_spans_lca_matrix(en_tokenizer): assert lca[1, 0] == 1 # slept & dog -> slept assert lca[1, 1] == 1 # slept & slept -> slept + # example from Span API docs + tokens = en_tokenizer("I like New York in Autumn") + doc = Doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[1, 1, 3, 1, 3, 4], + deps=["dep"] * len(tokens), + ) + lca = doc[1:4].get_lca_matrix() + assert_array_equal(lca, numpy.asarray([[0, 0, 0], [0, 1, 2], [0, 2, 2]])) + def test_span_similarity_match(): doc = Doc(Vocab(), words=["a", "b", "a", "b"]) @@ -220,10 +245,21 @@ def test_span_as_doc(doc): assert span_doc is not doc assert span_doc[0].idx == 0 + # partial initial entity is removed + assert len(span_doc.ents) == 0 + + # full entity is preserved + span_doc = doc[2:10].as_doc() + assert len(span_doc.ents) == 1 + + # partial final entity is removed + span_doc = doc[0:5].as_doc() + assert len(span_doc.ents) == 0 + @pytest.mark.usefixtures("clean_underscore") def test_span_as_doc_user_data(doc): - """Test that the user_data can be preserved (but not by default). """ + """Test that the user_data can be preserved (but not by default).""" my_key = "my_info" my_value = 342 doc.user_data[my_key] = my_value @@ -253,20 +289,13 @@ def test_span_string_label_kb_id(doc): assert span.kb_id == doc.vocab.strings["Q342"] -def test_span_label_readonly(doc): +def test_span_attrs_writable(doc): span = Span(doc, 0, 1) - with pytest.raises(NotImplementedError): - span.label_ = "hello" - - -def test_span_kb_id_readonly(doc): - span = Span(doc, 0, 1) - with pytest.raises(NotImplementedError): - span.kb_id_ = "Q342" + span.label_ = "label" + span.kb_id_ = "kb_id" def test_span_ents_property(doc): - """Test span.ents for the """ doc.ents = [ (doc.vocab.strings["PRODUCT"], 0, 1), (doc.vocab.strings["PRODUCT"], 7, 8), @@ -288,7 +317,7 @@ def test_span_ents_property(doc): assert sentences[1].ents[0].start == 7 assert sentences[1].ents[0].end == 8 # Third sentence ents, Also tests end of sentence - assert sentences[2].ents[0].text == "a third ." 
+ assert sentences[2].ents[0].text == "a third." assert sentences[2].ents[0].label_ == "PRODUCT" assert sentences[2].ents[0].start == 11 assert sentences[2].ents[0].end == 14 @@ -341,6 +370,12 @@ def test_span_boundaries(doc): span[5] +def test_span_lemma(doc): + # span lemmas should have the same number of spaces as the span + sp = doc[1:5] + assert len(sp.text.split(" ")) == len(sp.lemma_.split(" ")) + + def test_sent(en_tokenizer): doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.") span = doc[1:3] diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 1e13882c5..5ea0bcff0 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -95,7 +95,8 @@ def test_doc_token_api_ancestors(en_vocab): # the structure of this sentence depends on the English annotation scheme words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."] heads = [2, 2, 2, 4, 2, 6, 4, 6, 2] - doc = Doc(en_vocab, words=words, heads=heads) + deps = ["dep"] * len(heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert [t.text for t in doc[6].ancestors] == ["dog", "saw"] assert [t.text for t in doc[1].ancestors] == ["saw"] assert [t.text for t in doc[2].ancestors] == [] @@ -146,7 +147,7 @@ def test_doc_token_api_head_setter(en_vocab): assert doc[4].left_edge.i == 0 assert doc[2].left_edge.i == 0 # head token must be from the same document - doc2 = Doc(en_vocab, words=words, heads=heads) + doc2 = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads)) with pytest.raises(ValueError): doc[0].head = doc2[0] # test sentence starts when two sentences are joined @@ -254,7 +255,7 @@ def test_token_api_non_conjuncts(en_vocab): def test_missing_head_dep(en_vocab): - """ Check that the Doc constructor and Example.from_dict parse missing information the same""" + """Check that the Doc constructor and Example.from_dict parse missing information the same""" heads = [1, 1, 1, 1, 2, None] # element 5 is missing deps = ["", "ROOT", "dobj", "cc", "conj", None] # element 0 and 5 are missing words = ["I", "like", "London", "and", "Berlin", "."] diff --git a/spacy/tests/lang/bg/test_text.py b/spacy/tests/lang/bg/test_text.py index 3d35ba997..e3a29fe5d 100644 --- a/spacy/tests/lang/bg/test_text.py +++ b/spacy/tests/lang/bg/test_text.py @@ -1,5 +1,5 @@ import pytest -from spacy.lang.bg.lex_attrs import like_num + @pytest.mark.parametrize( "word,match", diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py index 83a75f056..a3c76ab5b 100644 --- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py @@ -5,7 +5,7 @@ import pytest "text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])] ) def test_contractions(ca_tokenizer, text, expected_tokens): - """ Test that the contractions are split into two tokens""" + """Test that the contractions are split into two tokens""" tokens = ca_tokenizer(text) assert len(tokens) == 2 assert [t.text for t in tokens] == expected_tokens diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py index 733e814f7..358f4c0f9 100644 --- a/spacy/tests/lang/en/test_text.py +++ b/spacy/tests/lang/en/test_text.py @@ -56,7 +56,9 @@ def test_lex_attrs_like_number(en_tokenizer, text, match): assert tokens[0].like_num == match -@pytest.mark.parametrize("word", ["third", "Millionth", "100th", "Hundredth"]) +@pytest.mark.parametrize( + "word", ["third", 
"Millionth", "100th", "Hundredth", "23rd", "52nd"] +) def test_en_lex_attrs_like_number_for_ordinal(word): assert like_num(word) diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index ae16c7eea..dc40e18a3 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -36,6 +36,24 @@ ABBREVIATION_INFLECTION_TESTS = [ ("EU:n toimesta tehtiin jotain.", ["EU:n", "toimesta", "tehtiin", "jotain", "."]), ] +CONTRACTION_TESTS = [ + ( + "Päätimme ettemme tule.", + ["Päätimme", "ett", "emme", "tule", "."], + ["päätimme", "että", "emme", "tule", "."], + ), + ( + "Miksei puhuttaisi?", + ["Miks", "ei", "puhuttaisi", "?"], + ["miksi", "ei", "puhuttaisi", "?"], + ), + ( + "He tottelivat vaikkeivat halunneet", + ["He", "tottelivat", "vaikk", "eivat", "halunneet"], + ["he", "tottelivat", "vaikka", "eivät", "halunneet"], + ), +] + @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS) def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens): @@ -56,3 +74,12 @@ def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_toke tokens = fi_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list + + +@pytest.mark.parametrize("text,expected_tokens,expected_norms", CONTRACTION_TESTS) +def test_fi_tokenizer_contractions(fi_tokenizer, text, expected_tokens, expected_norms): + tokens = fi_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + norm_list = [token.norm_ for token in tokens if not token.is_space] + assert expected_tokens == token_list + assert expected_norms == norm_list diff --git a/spacy/tests/lang/grc/__init__.py b/spacy/tests/lang/grc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/grc/test_text.py b/spacy/tests/lang/grc/test_text.py new file mode 100644 index 000000000..5d8317c36 --- /dev/null +++ b/spacy/tests/lang/grc/test_text.py @@ -0,0 +1,23 @@ +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("ι", True), + ("α", True), + ("ϟα", True), + ("ἑκατόν", True), + ("ἐνακόσια", True), + ("δισχίλια", True), + ("μύρια", True), + ("εἷς", True), + ("λόγος", False), + (",", False), + ("λβ", True), + ], +) +def test_lex_attrs_like_number(grc_tokenizer, text, match): + tokens = grc_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/it/test_prefix_suffix_infix.py b/spacy/tests/lang/it/test_prefix_suffix_infix.py index 46f66b5e6..5834f9695 100644 --- a/spacy/tests/lang/it/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/it/test_prefix_suffix_infix.py @@ -5,7 +5,7 @@ import pytest "text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])] ) def test_contractions(it_tokenizer, text, expected_tokens): - """ Test that the contractions are split into two tokens""" + """Test that the contractions are split into two tokens""" tokens = it_tokenizer(text) assert len(tokens) == 2 assert [t.text for t in tokens] == expected_tokens diff --git a/spacy/tests/lang/nl/test_noun_chunks.py b/spacy/tests/lang/nl/test_noun_chunks.py new file mode 100644 index 000000000..73b501e4a --- /dev/null +++ b/spacy/tests/lang/nl/test_noun_chunks.py @@ -0,0 +1,209 @@ +from spacy.tokens import Doc +import pytest + + +@pytest.fixture +def nl_sample(nl_vocab): + # TEXT : + # Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen. 
+ # Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook + # geen avondeten gekocht. + words = [ + "Haar", + "vriend", + "lacht", + "luid", + ".", + "We", + "kregen", + "alweer", + "ruzie", + "toen", + "we", + "de", + "supermarkt", + "ingingen", + ".", + "Aan", + "het", + "begin", + "van", + "de", + "supermarkt", + "is", + "al", + "het", + "fruit", + "en", + "de", + "groentes", + ".", + "Uiteindelijk", + "hebben", + "we", + "dan", + "ook", + "geen", + "avondeten", + "gekocht", + ".", + ] + heads = [ + 1, + 2, + 2, + 2, + 2, + 6, + 6, + 6, + 6, + 13, + 13, + 12, + 13, + 6, + 6, + 17, + 17, + 24, + 20, + 20, + 17, + 24, + 24, + 24, + 24, + 27, + 27, + 24, + 24, + 36, + 36, + 36, + 36, + 36, + 35, + 36, + 36, + 36, + ] + deps = [ + "nmod:poss", + "nsubj", + "ROOT", + "advmod", + "punct", + "nsubj", + "ROOT", + "advmod", + "obj", + "mark", + "nsubj", + "det", + "obj", + "advcl", + "punct", + "case", + "det", + "obl", + "case", + "det", + "nmod", + "cop", + "advmod", + "det", + "ROOT", + "cc", + "det", + "conj", + "punct", + "advmod", + "aux", + "nsubj", + "advmod", + "advmod", + "det", + "obj", + "ROOT", + "punct", + ] + pos = [ + "PRON", + "NOUN", + "VERB", + "ADJ", + "PUNCT", + "PRON", + "VERB", + "ADV", + "NOUN", + "SCONJ", + "PRON", + "DET", + "NOUN", + "NOUN", + "PUNCT", + "ADP", + "DET", + "NOUN", + "ADP", + "DET", + "NOUN", + "AUX", + "ADV", + "DET", + "NOUN", + "CCONJ", + "DET", + "NOUN", + "PUNCT", + "ADJ", + "AUX", + "PRON", + "ADV", + "ADV", + "DET", + "NOUN", + "VERB", + "PUNCT", + ] + return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos) + + +@pytest.fixture +def nl_reference_chunking(): + # Using frog https://github.com/LanguageMachines/frog/ we obtain the following NOUN-PHRASES: + return [ + "haar vriend", + "we", + "ruzie", + "we", + "de supermarkt", + "het begin", + "de supermarkt", + "het fruit", + "de groentes", + "we", + "geen avondeten", + ] + + +def test_need_dep(nl_tokenizer): + """ + Test that noun_chunks raises Value Error for 'nl' language if Doc is not parsed. + """ + txt = "Haar vriend lacht luid." + doc = nl_tokenizer(txt) + + with pytest.raises(ValueError): + list(doc.noun_chunks) + + +def test_chunking(nl_sample, nl_reference_chunking): + """ + Test the noun chunks of a sample text. Uses a sample. + The sample text simulates a Doc object as would be produced by nl_core_news_md. 
+ """ + chunks = [s.text.lower() for s in nl_sample.noun_chunks] + assert chunks == nl_reference_chunking diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 46f1f2bd1..36f4a75e0 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -4,12 +4,13 @@ from spacy.util import get_lang_class # fmt: off # Only include languages with no external dependencies -# excluded: ja, ru, th, uk, vi, zh -LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", - "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", - "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", - "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur", - "yo"] +# excluded: ja, ko, th, vi, zh +LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", + "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi", + "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv", + "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", + "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", + "tr", "tt", "uk", "ur", "xx", "yo"] # fmt: on diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py new file mode 100644 index 000000000..4a787b2a6 --- /dev/null +++ b/spacy/tests/lang/uk/test_lemmatizer.py @@ -0,0 +1,7 @@ +from spacy.tokens import Doc + + +def test_uk_lemmatizer(uk_lemmatizer): + """Check that the default uk lemmatizer runs.""" + doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"]) + uk_lemmatizer(doc) diff --git a/spacy/tests/lang/vi/test_serialize.py b/spacy/tests/lang/vi/test_serialize.py index 3ee5333fb..ed4652df7 100644 --- a/spacy/tests/lang/vi/test_serialize.py +++ b/spacy/tests/lang/vi/test_serialize.py @@ -23,11 +23,11 @@ def test_vi_tokenizer_serialize(vi_tokenizer): nlp_r = Vietnamese() nlp_r.from_bytes(nlp_bytes) assert nlp_bytes == nlp_r.to_bytes() - assert nlp_r.tokenizer.use_pyvi == False + assert nlp_r.tokenizer.use_pyvi is False with make_tempdir() as d: nlp.to_disk(d) nlp_r = Vietnamese() nlp_r.from_disk(d) assert nlp_bytes == nlp_r.to_bytes() - assert nlp_r.tokenizer.use_pyvi == False + assert nlp_r.tokenizer.use_pyvi is False diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index fb9222aaa..0e1eae588 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -354,7 +354,6 @@ def test_dependency_matcher_span_user_data(en_tokenizer): for token in doc: token.head = doc[0] token.dep_ = "a" - get_is_c = lambda token: token.text in ("c",) Token.set_extension("is_c", default=False) doc[2]._.is_c = True pattern = [ diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 4e6b4bfae..e0f655bbe 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -481,6 +481,7 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text): assert len(matches) == 1 +@pytest.mark.filterwarnings("ignore:\\[W036") def test_matcher_valid_callback(en_vocab): """Test that on_match can only be None or callable.""" matcher = Matcher(en_vocab) @@ -522,6 +523,12 @@ def test_matcher_as_spans(matcher): assert matches[1].text == "Java" assert matches[1].label_ == "Java" + matches = matcher(doc[1:], as_spans=True) + assert len(matches) == 1 + assert isinstance(matches[0], Span) + assert matches[0].text == "Java" + 
assert matches[0].label_ == "Java" + def test_matcher_deprecated(matcher): doc = Doc(matcher.vocab, words=["hello", "world"]) diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 9f575fe05..dcbe1ff33 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -180,6 +180,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab): assert texts == ["zero", "one", "two"] +@pytest.mark.filterwarnings("ignore:\\[W036") def test_matcher_remove(): nlp = English() matcher = Matcher(nlp.vocab) @@ -254,13 +255,23 @@ def test_matcher_with_alignments_nongreedy(en_vocab): (0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]), (1, "baab", "b a* b", [[0, 1, 1, 2]]), (2, "aaab", "a a a b", [[0, 1, 2, 3]]), - (3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]), + (3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]), (4, "aaba", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2]]), - (5, "aabaa", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2] ]), + ( + 5, + "aabaa", + "a+ b a+", + [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2]], + ), (6, "aaba", "a+ b a*", [[0, 1], [0, 0, 1], [0, 0, 1, 2], [0, 1, 2]]), (7, "aaaa", "a*", [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0]]), (8, "baab", "b a* b b*", [[0, 1, 1, 2]]), - (9, "aabb", "a* b* a*", [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]]), + ( + 9, + "aabb", + "a* b* a*", + [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]], + ), (10, "aaab", "a+ a+ a b", [[0, 1, 2, 3]]), (11, "aaab", "a+ a+ a+ b", [[0, 1, 2, 3]]), (12, "aaab", "a+ a a b", [[0, 1, 2, 3]]), diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index 82c39b72c..8e042c9cf 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -11,6 +11,7 @@ def test_build_dependencies(): "mock", "flake8", "hypothesis", + "pre-commit", ] # ignore language-specific packages that shouldn't be installed by all libs_ignore_setup = [ diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index e955a12a8..f89e993e9 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -18,14 +18,9 @@ def vocab(): @pytest.fixture def parser(vocab): - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(vocab, model, **config) + parser = DependencyParser(vocab, model) return parser @@ -77,19 +72,14 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner1 = EntityRecognizer(Vocab(), model, **config) + ner1 = EntityRecognizer(Vocab(), model) ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") ner1.initialize(lambda: [_ner_example(ner1)]) - ner2 = EntityRecognizer(Vocab(), model, **config) + ner2 = EntityRecognizer(Vocab(), model) # the second model needs to be resized before we can call from_bytes ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves) @@ -113,12 +103,7 @@ def test_add_label_get_label(pipe_cls, n_moves, model_config): """ labels = ["A", "B", "C"] model = registry.resolve({"model": model_config}, 
validate=True)["model"] - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } - pipe = pipe_cls(Vocab(), model, **config) + pipe = pipe_cls(Vocab(), model) for label in labels: pipe.add_label(label) assert len(pipe.move_names) == len(labels) * n_moves diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 66c22c60b..cba6fa81e 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -130,14 +130,9 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(doc.vocab, model, **config) + parser = DependencyParser(doc.vocab, model) parser.moves.add_action(0, "") parser.moves.add_action(1, "") parser.moves.add_action(1, "") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 1b9d0b255..a30001b27 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -8,12 +8,13 @@ from spacy.language import Language from spacy.lookups import Lookups from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.training import Example -from spacy.tokens import Doc -from spacy.vocab import Vocab +from spacy.tokens import Doc, Span +from spacy.vocab import Vocab, registry import logging from ..util import make_tempdir - +from ...pipeline import EntityRecognizer +from ...pipeline.ner import DEFAULT_NER_MODEL TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), @@ -21,6 +22,11 @@ TRAIN_DATA = [ ] +@pytest.fixture +def neg_key(): + return "non_entities" + + @pytest.fixture def vocab(): return Vocab() @@ -59,39 +65,70 @@ def test_get_oracle_moves(tsys, doc, entity_annots): assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"] -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots): - entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots] +def test_negative_samples_two_word_input(tsys, vocab, neg_key): + """Test that we don't get stuck in a two word input when we have a negative + span. This could happen if we don't have the right check on the B action. 
+ """ + tsys.cfg["neg_key"] = neg_key + doc = Doc(vocab, words=["A", "B"]) + entity_annots = [None, None] example = Example.from_dict(doc, {"entities": entity_annots}) - ex_dict = example.to_dict() - - for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]): - if tag == "L-!GPE": - ex_dict["doc_annotation"]["entities"][i] = "-" - example = Example.from_dict(doc, ex_dict) - + # These mean that the oracle sequence shouldn't have O for the first + # word, and it shouldn't analyse it as B-PERSON, L-PERSON + example.y.spans[neg_key] = [ + Span(example.y, 0, 1, label="O"), + Span(example.y, 0, 2, label="PERSON"), + ] act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names + assert names[0] != "O" + assert names[0] != "B-PERSON" + assert names[1] != "L-PERSON" -def test_get_oracle_moves_negative_entities2(tsys, vocab): - doc = Doc(vocab, words=["A", "B", "C", "D"]) - entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"] +def test_negative_samples_three_word_input(tsys, vocab, neg_key): + """Test that we exclude a 2-word entity correctly using a negative example.""" + tsys.cfg["neg_key"] = neg_key + doc = Doc(vocab, words=["A", "B", "C"]) + entity_annots = [None, None, None] example = Example.from_dict(doc, {"entities": entity_annots}) + # These mean that the oracle sequence shouldn't have O for the first + # word, and it shouldn't analyse it as B-PERSON, L-PERSON + example.y.spans[neg_key] = [ + Span(example.y, 0, 1, label="O"), + Span(example.y, 0, 2, label="PERSON"), + ] act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names + assert names[0] != "O" + assert names[1] != "B-PERSON" -@pytest.mark.skip(reason="Maybe outdated? Unsure") -def test_get_oracle_moves_negative_O(tsys, vocab): - doc = Doc(vocab, words=["A", "B", "C", "D"]) - entity_annots = ["O", "!O", "O", "!O"] +def test_negative_samples_U_entity(tsys, vocab, neg_key): + """Test that we exclude a 2-word entity correctly using a negative example.""" + tsys.cfg["neg_key"] = neg_key + doc = Doc(vocab, words=["A"]) + entity_annots = [None] example = Example.from_dict(doc, {"entities": entity_annots}) + # These mean that the oracle sequence shouldn't have O for the first + # word, and it shouldn't analyse it as B-PERSON, L-PERSON + example.y.spans[neg_key] = [ + Span(example.y, 0, 1, label="O"), + Span(example.y, 0, 1, label="PERSON"), + ] act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names + assert names[0] != "O" + assert names[0] != "U-PERSON" + + +def test_negative_sample_key_is_in_config(vocab, entity_types): + actions = BiluoPushDown.get_actions(entity_types=entity_types) + tsys = BiluoPushDown(vocab.strings, actions, incorrect_spans_key="non_entities") + assert tsys.cfg["neg_key"] == "non_entities" # We can't easily represent this on a Doc object. 
Not sure what the best solution @@ -213,6 +250,27 @@ def test_train_empty(): nlp.update(batch, losses=losses) +def test_train_negative_deprecated(): + """Test that the deprecated negative entity format raises a custom error.""" + train_data = [ + ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}), + ] + + nlp = English() + train_examples = [] + for t in train_data: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + ner = nlp.add_pipe("ner", last=True) + ner.add_label("PERSON") + nlp.initialize() + for itn in range(2): + losses = {} + batches = util.minibatch(train_examples, size=8) + for batch in batches: + with pytest.raises(ValueError): + nlp.update(batch, losses=losses) + + def test_overwrite_token(): nlp = English() nlp.add_pipe("ner") @@ -246,7 +304,7 @@ def test_empty_ner(): def test_ruler_before_ner(): - """ Test that an NER works after an entity_ruler: the second can add annotations """ + """Test that an NER works after an entity_ruler: the second can add annotations""" nlp = English() # 1 : Entity Ruler - should set "this" to B and everything else to empty @@ -265,8 +323,18 @@ def test_ruler_before_ner(): assert [token.ent_type_ for token in doc] == expected_types +def test_ner_constructor(en_vocab): + config = { + "update_with_oracle_cut_size": 100, + } + cfg = {"model": DEFAULT_NER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + EntityRecognizer(en_vocab, model, **config) + EntityRecognizer(en_vocab, model) + + def test_ner_before_ruler(): - """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """ + """Test that an entity_ruler works after an NER: the second can overwrite O annotations""" nlp = English() # 1: untrained NER - should set everything to O @@ -287,7 +355,7 @@ def test_ner_before_ruler(): def test_block_ner(): - """ Test functionality for blocking tokens so they can't be in a named entity """ + """Test functionality for blocking tokens so they can't be in a named entity""" # block "Antti L Korhonen" from being a named entity nlp = English() nlp.add_pipe("blocker", config={"start": 2, "end": 5}) @@ -358,6 +426,26 @@ def test_overfitting_IO(use_upper): assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + # test that kb_id is preserved + test_text = "I like London and London." 
+ doc = nlp.make_doc(test_text) + doc.ents = [Span(doc, 2, 3, label="LOC", kb_id=1234)] + ents = doc.ents + assert len(ents) == 1 + assert ents[0].text == "London" + assert ents[0].label_ == "LOC" + assert ents[0].kb_id == 1234 + doc = nlp.get_pipe("ner")(doc) + ents = doc.ents + assert len(ents) == 2 + assert ents[0].text == "London" + assert ents[0].label_ == "LOC" + assert ents[0].kb_id == 1234 + # ent added by ner has kb_id == 0 + assert ents[1].text == "London" + assert ents[1].label_ == "LOC" + assert ents[1].kb_id == 0 + def test_beam_ner_scores(): # Test that we can get confidence values out of the beam_ner pipe @@ -394,7 +482,7 @@ def test_beam_ner_scores(): assert 0 - eps <= score <= 1 + eps -def test_beam_overfitting_IO(): +def test_beam_overfitting_IO(neg_key): # Simple test to try and quickly overfit the Beam NER component nlp = English() beam_width = 16 @@ -402,6 +490,7 @@ def test_beam_overfitting_IO(): config = { "beam_width": beam_width, "beam_density": beam_density, + "incorrect_spans_key": neg_key, } ner = nlp.add_pipe("beam_ner", config=config) train_examples = [] @@ -418,12 +507,13 @@ def test_beam_overfitting_IO(): assert losses["beam_ner"] < 0.0001 # test the scores from the beam - test_text = "I like London." + test_text = "I like London" docs = [nlp.make_doc(test_text)] beams = ner.predict(docs) entity_scores = ner.scored_ents(beams)[0] assert entity_scores[(2, 3, "LOC")] == 1.0 assert entity_scores[(2, 3, "PERSON")] == 0.0 + assert len(nlp(test_text).ents) == 1 # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -436,6 +526,108 @@ def test_beam_overfitting_IO(): assert entity_scores2[(2, 3, "LOC")] == 1.0 assert entity_scores2[(2, 3, "PERSON")] == 0.0 + # Try to unlearn the entity by using negative annotations + neg_doc = nlp.make_doc(test_text) + neg_ex = Example(neg_doc, neg_doc) + neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")] + neg_train_examples = [neg_ex] + + for i in range(20): + losses = {} + nlp.update(neg_train_examples, sgd=optimizer, losses=losses) + + # test the "untrained" model + assert len(nlp(test_text).ents) == 0 + + +def test_neg_annotation(neg_key): + """Check that the NER update works with a negative annotation that is a different label of the correct one, + or partly overlapping, etc""" + nlp = English() + beam_width = 16 + beam_density = 0.0001 + config = { + "beam_width": beam_width, + "beam_density": beam_density, + "incorrect_spans_key": neg_key, + } + ner = nlp.add_pipe("beam_ner", config=config) + train_text = "Who is Shaka Khan?" + neg_doc = nlp.make_doc(train_text) + ner.add_label("PERSON") + ner.add_label("ORG") + example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]}) + example.reference.spans[neg_key] = [ + Span(neg_doc, 2, 4, "ORG"), + Span(neg_doc, 2, 3, "PERSON"), + Span(neg_doc, 1, 4, "PERSON"), + ] + + optimizer = nlp.initialize() + for i in range(2): + losses = {} + nlp.update([example], sgd=optimizer, losses=losses) + + +def test_neg_annotation_conflict(neg_key): + # Check that NER raises for a negative annotation that is THE SAME as a correct one + nlp = English() + beam_width = 16 + beam_density = 0.0001 + config = { + "beam_width": beam_width, + "beam_density": beam_density, + "incorrect_spans_key": neg_key, + } + ner = nlp.add_pipe("beam_ner", config=config) + train_text = "Who is Shaka Khan?" 
+ neg_doc = nlp.make_doc(train_text) + ner.add_label("PERSON") + ner.add_label("LOC") + example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]}) + example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")] + assert len(example.reference.ents) == 1 + assert example.reference.ents[0].text == "Shaka Khan" + assert example.reference.ents[0].label_ == "PERSON" + assert len(example.reference.spans[neg_key]) == 1 + assert example.reference.spans[neg_key][0].text == "Shaka Khan" + assert example.reference.spans[neg_key][0].label_ == "PERSON" + + optimizer = nlp.initialize() + for i in range(2): + losses = {} + with pytest.raises(ValueError): + nlp.update([example], sgd=optimizer, losses=losses) + + +def test_beam_valid_parse(neg_key): + """Regression test for previously flakey behaviour""" + nlp = English() + beam_width = 16 + beam_density = 0.0001 + config = { + "beam_width": beam_width, + "beam_density": beam_density, + "incorrect_spans_key": neg_key, + } + nlp.add_pipe("beam_ner", config=config) + # fmt: off + tokens = ['FEDERAL', 'NATIONAL', 'MORTGAGE', 'ASSOCIATION', '(', 'Fannie', 'Mae', '):', 'Posted', 'yields', 'on', '30', 'year', 'mortgage', 'commitments', 'for', 'delivery', 'within', '30', 'days', '(', 'priced', 'at', 'par', ')', '9.75', '%', ',', 'standard', 'conventional', 'fixed', '-', 'rate', 'mortgages', ';', '8.70', '%', ',', '6/2', 'rate', 'capped', 'one', '-', 'year', 'adjustable', 'rate', 'mortgages', '.', 'Source', ':', 'Telerate', 'Systems', 'Inc.'] + iob = ['B-ORG', 'I-ORG', 'I-ORG', 'L-ORG', 'O', 'B-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'U-CARDINAL', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + # fmt: on + + doc = Doc(nlp.vocab, words=tokens) + example = Example.from_dict(doc, {"ner": iob}) + neg_span = Span(doc, 50, 53, "ORG") + example.reference.spans[neg_key] = [neg_span] + + optimizer = nlp.initialize() + + for i in range(5): + losses = {} + nlp.update([example], sgd=optimizer, losses=losses) + assert "beam_ner" in losses + def test_ner_warns_no_lookups(caplog): nlp = English() diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index dc878dd7a..b7575d063 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -5,10 +5,11 @@ from spacy.attrs import DEP from spacy.lang.en import English from spacy.training import Example from spacy.tokens import Doc -from spacy import util +from spacy import util, registry from ..util import apply_transition_sequence, make_tempdir - +from ...pipeline import DependencyParser +from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL TRAIN_DATA = [ ( @@ -215,6 +216,18 @@ def test_parser_set_sent_starts(en_vocab): assert token.head in sent +def test_parser_constructor(en_vocab): + config = { + "learn_tokens": False, + "min_action_freq": 30, + "update_with_oracle_cut_size": 100, + } + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + DependencyParser(en_vocab, model, **config) + DependencyParser(en_vocab, model) + + @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) def test_incomplete_data(pipe_name): # Test that the parser works with incomplete information diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index 8ca4039a2..50da60594 
100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -69,7 +69,7 @@ def heads(): def test_parser_parse_navigate_consistency(en_vocab, words, heads): - doc = Doc(en_vocab, words=words, heads=heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads)) for head in doc: for child in head.lefts: assert child.head == head @@ -109,7 +109,7 @@ def test_parser_parse_navigate_child_consistency(en_vocab, words, heads): def test_parser_parse_navigate_edges(en_vocab, words, heads): - doc = Doc(en_vocab, words=words, heads=heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads)) for token in doc: subtree = list(token.subtree) debug = "\t".join((token.text, token.left_edge.text, subtree[0].text)) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 595bfa537..d71388900 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -23,14 +23,9 @@ def _parser_example(parser): @pytest.fixture def parser(vocab): vocab.strings.add("ROOT") - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(vocab, model, **config) + parser = DependencyParser(vocab, model) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 # parser.add_label('right') diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py index b17855d85..869b8b874 100644 --- a/spacy/tests/pipeline/test_annotates_on_update.py +++ b/spacy/tests/pipeline/test_annotates_on_update.py @@ -1,6 +1,5 @@ from typing import Callable, Iterable, Iterator import pytest -import io from thinc.api import Config from spacy.language import Language @@ -75,7 +74,7 @@ def test_annotates_on_update(): nlp.add_pipe("assert_sents") # When the pipeline runs, annotations are set - doc = nlp("This is a sentence.") + nlp("This is a sentence.") examples = [] for text in ["a a", "b b", "c c"]: diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 4883cceb8..b97795344 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -2,7 +2,7 @@ from typing import Callable, Iterable import pytest from numpy.testing import assert_equal from spacy.attrs import ENT_KB_ID - +from spacy.compat import pickle from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy.vocab import Vocab @@ -254,7 +254,9 @@ def test_nel_nsents(nlp): """Test that n_sents can be set through the configuration""" entity_linker = nlp.add_pipe("entity_linker", config={}) assert entity_linker.n_sents == 0 - entity_linker = nlp.replace_pipe("entity_linker", "entity_linker", config={"n_sents": 2}) + entity_linker = nlp.replace_pipe( + "entity_linker", "entity_linker", config={"n_sents": 2} + ) assert entity_linker.n_sents == 2 @@ -290,6 +292,9 @@ def test_vocab_serialization(nlp): assert candidates[0].alias == adam_hash assert candidates[0].alias_ == "adam" + assert kb_new_vocab.get_vector("Q2") == [2] + assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) + def test_append_alias(nlp): """Test that we can append additional alias-entity pairs""" @@ -321,6 +326,7 @@ def test_append_alias(nlp): assert len(mykb.get_alias_candidates("douglas")) == 3 
+@pytest.mark.filterwarnings("ignore:\\[W036") def test_append_invalid_alias(nlp): """Test that append an alias will throw an error if prior probs are exceeding 1""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) @@ -339,6 +345,7 @@ def test_append_invalid_alias(nlp): mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) +@pytest.mark.filterwarnings("ignore:\\[W036") def test_preserving_links_asdoc(nlp): """Test that Span.as_doc preserves the existing entity links""" vector_length = 1 @@ -546,6 +553,106 @@ def test_kb_serialization(): assert "RandomWord" in nlp2.vocab.strings +@pytest.mark.xfail(reason="Needs fixing") +def test_kb_pickle(): + # Test that the KB can be pickled + nlp = English() + kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + assert not kb_1.contains_alias("Russ Cochran") + kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + assert kb_1.contains_alias("Russ Cochran") + data = pickle.dumps(kb_1) + kb_2 = pickle.loads(data) + assert kb_2.contains_alias("Russ Cochran") + + +@pytest.mark.xfail(reason="Needs fixing") +def test_nel_pickle(): + # Test that a pipeline with an EL component can be pickled + def create_kb(vocab): + kb = KnowledgeBase(vocab, entity_vector_length=3) + kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + return kb + + nlp_1 = English() + nlp_1.add_pipe("ner") + entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True) + entity_linker_1.set_kb(create_kb) + assert nlp_1.pipe_names == ["ner", "entity_linker"] + assert entity_linker_1.kb.contains_alias("Russ Cochran") + + data = pickle.dumps(nlp_1) + nlp_2 = pickle.loads(data) + assert nlp_2.pipe_names == ["ner", "entity_linker"] + entity_linker_2 = nlp_2.get_pipe("entity_linker") + assert entity_linker_2.kb.contains_alias("Russ Cochran") + + +def test_kb_to_bytes(): + # Test that the KB's to_bytes method works correctly + nlp = English() + kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3]) + kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5]) + kb_1.add_alias( + alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2] + ) + assert kb_1.contains_alias("Russ Cochran") + kb_bytes = kb_1.to_bytes() + kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + assert not kb_2.contains_alias("Russ Cochran") + kb_2 = kb_2.from_bytes(kb_bytes) + # check that both KBs are exactly the same + assert kb_1.get_size_entities() == kb_2.get_size_entities() + assert kb_1.entity_vector_length == kb_2.entity_vector_length + assert kb_1.get_entity_strings() == kb_2.get_entity_strings() + assert kb_1.get_vector("Q2146908") == kb_2.get_vector("Q2146908") + assert kb_1.get_vector("Q66") == kb_2.get_vector("Q66") + assert kb_2.contains_alias("Russ Cochran") + assert kb_1.get_size_aliases() == kb_2.get_size_aliases() + assert kb_1.get_alias_strings() == kb_2.get_alias_strings() + assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( + kb_2.get_alias_candidates("Russ Cochran") + ) + assert len(kb_1.get_alias_candidates("Randomness")) == len( + kb_2.get_alias_candidates("Randomness") + ) + + +def test_nel_to_bytes(): + # Test that a pipeline 
with an EL component can be converted to bytes + def create_kb(vocab): + kb = KnowledgeBase(vocab, entity_vector_length=3) + kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + return kb + + nlp_1 = English() + nlp_1.add_pipe("ner") + entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True) + entity_linker_1.set_kb(create_kb) + assert entity_linker_1.kb.contains_alias("Russ Cochran") + assert nlp_1.pipe_names == ["ner", "entity_linker"] + + nlp_bytes = nlp_1.to_bytes() + nlp_2 = English() + nlp_2.add_pipe("ner") + nlp_2.add_pipe("entity_linker", last=True) + assert nlp_2.pipe_names == ["ner", "entity_linker"] + assert not nlp_2.get_pipe("entity_linker").kb.contains_alias("Russ Cochran") + nlp_2 = nlp_2.from_bytes(nlp_bytes) + kb_2 = nlp_2.get_pipe("entity_linker").kb + assert kb_2.contains_alias("Russ Cochran") + assert kb_2.get_vector("Q2146908") == [6, -4, 3] + assert_almost_equal( + kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8 + ) + + def test_scorer_links(): train_examples = [] nlp = English() diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 4a01ce183..dc0ca0301 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -98,7 +98,8 @@ def test_entity_ruler_clear(nlp, patterns): assert len(doc.ents) == 1 ruler.clear() assert len(ruler.labels) == 0 - doc = nlp("hello world") + with pytest.warns(UserWarning): + doc = nlp("hello world") assert len(doc.ents) == 0 diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 1bec8696c..0d2d3d6e5 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -110,4 +110,4 @@ def test_lemmatizer_serialize(nlp): assert doc2[0].lemma_ == "cope" # Make sure that lemmatizer cache can be pickled - b = pickle.dumps(lemmatizer2) + pickle.dumps(lemmatizer2) diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py index 302c307e2..e3fd28d0f 100644 --- a/spacy/tests/pipeline/test_models.py +++ b/spacy/tests/pipeline/test_models.py @@ -82,7 +82,9 @@ def util_batch_unbatch_docs_list( Y_batched = model.predict(in_data) Y_not_batched = [model.predict([u])[0] for u in in_data] for i in range(len(Y_batched)): - assert_almost_equal(OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4) + assert_almost_equal( + OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4 + ) def util_batch_unbatch_docs_array( diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index b99e9a863..f1f0c8a6e 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -160,7 +160,7 @@ def test_pipe_class_component_model(): "@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": { - "@architectures": "spacy.TextCatBOW.v1", + "@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False, @@ -351,8 +351,21 @@ def test_language_factories_invalid(): ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}), ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}), ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}), - ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}), - 
([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}), + ( + [{"a": 0.0, "b": 0.0}, {"c": 0.0}], + {"c": 0.2}, + {"a": 0.0, "b": 0.0, "c": 1.0}, + ), + ( + [{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], + {"a": 0.0, "b": 0.0}, + {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}, + ), + ( + [{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], + {"a": 0.0, "b": 0.0, "f": 0.0}, + {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5, "f": 0.0}, + ), ], ) def test_language_factories_combine_score_weights(weights, override, expected): diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 0b84db4c0..87fd64307 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -52,7 +52,7 @@ def test_cant_add_pipe_first_and_last(nlp): nlp.add_pipe("new_pipe", first=True, last=True) -@pytest.mark.parametrize("name", ["my_component"]) +@pytest.mark.parametrize("name", ["test_get_pipe"]) def test_get_pipe(nlp, name): with pytest.raises(KeyError): nlp.get_pipe(name) @@ -62,7 +62,7 @@ def test_get_pipe(nlp, name): @pytest.mark.parametrize( "name,replacement,invalid_replacement", - [("my_component", "other_pipe", lambda doc: doc)], + [("test_replace_pipe", "other_pipe", lambda doc: doc)], ) def test_replace_pipe(nlp, name, replacement, invalid_replacement): with pytest.raises(ValueError): @@ -435,8 +435,8 @@ def test_update_with_annotates(): return component - c1 = Language.component(f"{name}1", func=make_component(f"{name}1")) - c2 = Language.component(f"{name}2", func=make_component(f"{name}2")) + Language.component(f"{name}1", func=make_component(f"{name}1")) + Language.component(f"{name}2", func=make_component(f"{name}2")) components = set([f"{name}1", f"{name}2"]) @@ -446,7 +446,12 @@ def test_update_with_annotates(): for text in texts: examples.append(Example(nlp.make_doc(text), nlp.make_doc(text))) - for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]: + for components_to_annotate in [ + [], + [f"{name}1"], + [f"{name}1", f"{name}2"], + [f"{name}2", f"{name}1"], + ]: for key in results: results[key] = "" nlp = English(vocab=nlp.vocab) diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py new file mode 100644 index 000000000..0364abf73 --- /dev/null +++ b/spacy/tests/pipeline/test_spancat.py @@ -0,0 +1,206 @@ +import pytest +from numpy.testing import assert_equal +from spacy.language import Language +from spacy.training import Example +from spacy.util import fix_random_seed, registry + + +SPAN_KEY = "labeled_spans" + +TRAIN_DATA = [ + ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}), + ( + "I like London and Berlin.", + {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC")]}}, + ), +] + + +def make_get_examples(nlp): + train_examples = [] + for t in TRAIN_DATA: + eg = Example.from_dict(nlp.make_doc(t[0]), t[1]) + train_examples.append(eg) + + def get_examples(): + return train_examples + + return get_examples + + +def test_no_label(): + nlp = Language() + nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + with pytest.raises(ValueError): + nlp.initialize() + + +def test_no_resize(): + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat.add_label("Thing") + spancat.add_label("Phrase") + assert spancat.labels == ("Thing", "Phrase") + nlp.initialize() + assert spancat.model.get_dim("nO") == 2 + # this throws an error because the spancat can't 
be resized after initialization + with pytest.raises(ValueError): + spancat.add_label("Stuff") + + +def test_implicit_labels(): + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + assert len(spancat.labels) == 0 + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.initialize(get_examples=lambda: train_examples) + assert spancat.labels == ("PERSON", "LOC") + + +def test_explicit_labels(): + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + assert len(spancat.labels) == 0 + spancat.add_label("PERSON") + spancat.add_label("LOC") + nlp.initialize() + assert spancat.labels == ("PERSON", "LOC") + + +def test_simple_train(): + fix_random_seed(0) + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + get_examples = make_get_examples(nlp) + nlp.initialize(get_examples) + sgd = nlp.create_optimizer() + assert len(spancat.labels) != 0 + for i in range(40): + losses = {} + nlp.update(list(get_examples()), losses=losses, drop=0.1, sgd=sgd) + doc = nlp("I like London and Berlin.") + assert doc.spans[spancat.key] == doc.spans[SPAN_KEY] + assert len(doc.spans[spancat.key]) == 2 + assert doc.spans[spancat.key][0].text == "London" + scores = nlp.evaluate(get_examples()) + assert f"spans_{SPAN_KEY}_f" in scores + assert scores[f"spans_{SPAN_KEY}_f"] == 1.0 + + +def test_ngram_suggester(en_tokenizer): + # test different n-gram lengths + for size in [1, 2, 3]: + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[size]) + docs = [ + en_tokenizer(text) + for text in [ + "a", + "a b", + "a b c", + "a b c d", + "a b c d e", + "a " * 100, + ] + ] + ngrams = ngram_suggester(docs) + # span sizes are correct + for s in ngrams.data: + assert s[1] - s[0] == size + # spans are within docs + offset = 0 + for i, doc in enumerate(docs): + spans = ngrams.dataXd[offset : offset + ngrams.lengths[i]] + spans_set = set() + for span in spans: + assert 0 <= span[0] < len(doc) + assert 0 < span[1] <= len(doc) + spans_set.add((span[0], span[1])) + # spans are unique + assert spans.shape[0] == len(spans_set) + offset += ngrams.lengths[i] + # the number of spans is correct + assert_equal(ngrams.lengths, [max(0, len(doc) - (size - 1)) for doc in docs]) + + # test 1-3-gram suggestions + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2, 3]) + docs = [ + en_tokenizer(text) for text in ["a", "a b", "a b c", "a b c d", "a b c d e"] + ] + ngrams = ngram_suggester(docs) + assert_equal(ngrams.lengths, [1, 3, 6, 9, 12]) + assert_equal( + ngrams.data, + [ + # doc 0 + [0, 1], + # doc 1 + [0, 1], + [1, 2], + [0, 2], + # doc 2 + [0, 1], + [1, 2], + [2, 3], + [0, 2], + [1, 3], + [0, 3], + # doc 3 + [0, 1], + [1, 2], + [2, 3], + [3, 4], + [0, 2], + [1, 3], + [2, 4], + [0, 3], + [1, 4], + # doc 4 + [0, 1], + [1, 2], + [2, 3], + [3, 4], + [4, 5], + [0, 2], + [1, 3], + [2, 4], + [3, 5], + [0, 3], + [1, 4], + [2, 5], + ], + ) + + # test some empty docs + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1]) + docs = [en_tokenizer(text) for text in ["", "a", ""]] + ngrams = ngram_suggester(docs) + assert_equal(ngrams.lengths, [len(doc) for doc in docs]) + + # test all empty docs + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1]) + docs = [en_tokenizer(text) for text in ["", "", ""]] + ngrams = ngram_suggester(docs) + assert_equal(ngrams.lengths, [len(doc) for doc in docs]) + + +def 
test_ngram_sizes(en_tokenizer): + # test that the range suggester works well + size_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2, 3]) + suggester_factory = registry.misc.get("spacy.ngram_range_suggester.v1") + range_suggester = suggester_factory(min_size=1, max_size=3) + docs = [ + en_tokenizer(text) for text in ["a", "a b", "a b c", "a b c d", "a b c d e"] + ] + ngrams_1 = size_suggester(docs) + ngrams_2 = range_suggester(docs) + assert_equal(ngrams_1.lengths, [1, 3, 6, 9, 12]) + assert_equal(ngrams_1.lengths, ngrams_2.lengths) + assert_equal(ngrams_1.data, ngrams_2.data) + + # one more variation + suggester_factory = registry.misc.get("spacy.ngram_range_suggester.v1") + range_suggester = suggester_factory(min_size=2, max_size=4) + ngrams_3 = range_suggester(docs) + assert_equal(ngrams_3.lengths, [0, 1, 3, 6, 9]) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 282961755..ec14b70da 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -132,8 +132,8 @@ def test_incomplete_data(): # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - assert doc[1].tag_ is "V" - assert doc[2].tag_ is "J" + assert doc[1].tag_ == "V" + assert doc[2].tag_ == "J" def test_overfitting_IO(): @@ -154,20 +154,20 @@ def test_overfitting_IO(): # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - assert doc[0].tag_ is "N" - assert doc[1].tag_ is "V" - assert doc[2].tag_ is "J" - assert doc[3].tag_ is "N" + assert doc[0].tag_ == "N" + assert doc[1].tag_ == "V" + assert doc[2].tag_ == "J" + assert doc[3].tag_ == "N" # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - assert doc2[0].tag_ is "N" - assert doc2[1].tag_ is "V" - assert doc2[2].tag_ is "J" - assert doc2[3].tag_ is "N" + assert doc2[0].tag_ == "N" + assert doc2[1].tag_ == "V" + assert doc2[2].tag_ == "J" + assert doc2[3].tag_ == "N" # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions texts = [ @@ -182,6 +182,17 @@ def test_overfitting_IO(): assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + # Try to unlearn the first 'N' tag with negative annotation + neg_ex = Example.from_dict(nlp.make_doc(test_text), {"tags": ["!N", "V", "J", "N"]}) + + for i in range(20): + losses = {} + nlp.update([neg_ex], sgd=optimizer, losses=losses) + + # test the "untrained" tag + doc3 = nlp(test_text) + assert doc3[0].tag_ != "N" + def test_tagger_requires_labels(): nlp = English() diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 43dfff147..b134b8508 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -108,6 +108,12 @@ def test_label_types(name): textcat.add_label("answer") with pytest.raises(ValueError): textcat.add_label(9) + # textcat requires at least two labels + if name == "textcat": + with pytest.raises(ValueError): + nlp.initialize() + else: + nlp.initialize() @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"]) @@ -131,19 +137,129 @@ def test_implicit_label(name, get_examples): nlp.initialize(get_examples=get_examples(nlp)) -@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"]) -def test_no_resize(name): +# fmt: off +@pytest.mark.parametrize( + "name,textcat_config", + [ + # BOW + ("textcat", 
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), + ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), + # ENSEMBLE + ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}}), + ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}}), + # CNN + ("textcat", {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + ], +) +# fmt: on +def test_no_resize(name, textcat_config): + """The old textcat architectures weren't resizable""" nlp = Language() - textcat = nlp.add_pipe(name) + pipe_config = {"model": textcat_config} + textcat = nlp.add_pipe(name, config=pipe_config) textcat.add_label("POSITIVE") textcat.add_label("NEGATIVE") nlp.initialize() - assert textcat.model.get_dim("nO") >= 2 + assert textcat.model.maybe_get_dim("nO") in [2, None] # this throws an error because the textcat can't be resized after initialization with pytest.raises(ValueError): textcat.add_label("NEUTRAL") +# fmt: off +@pytest.mark.parametrize( + "name,textcat_config", + [ + # BOW + ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), + ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), + # CNN + ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + ], +) +# fmt: on +def test_resize(name, textcat_config): + """The new textcat architectures are resizable""" + nlp = Language() + pipe_config = {"model": textcat_config} + textcat = nlp.add_pipe(name, config=pipe_config) + textcat.add_label("POSITIVE") + textcat.add_label("NEGATIVE") + assert textcat.model.maybe_get_dim("nO") in [2, None] + nlp.initialize() + assert 
textcat.model.maybe_get_dim("nO") in [2, None] + textcat.add_label("NEUTRAL") + assert textcat.model.maybe_get_dim("nO") in [3, None] + + +# fmt: off +@pytest.mark.parametrize( + "name,textcat_config", + [ + # BOW + ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), + ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), + # CNN + ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + ], +) +# fmt: on +def test_resize_same_results(name, textcat_config): + # Ensure that the resized textcat classifiers still produce the same results for old labels + fix_random_seed(0) + nlp = English() + pipe_config = {"model": textcat_config} + textcat = nlp.add_pipe(name, config=pipe_config) + + train_examples = [] + for text, annotations in TRAIN_DATA_SINGLE_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + assert textcat.model.maybe_get_dim("nO") in [2, None] + + for i in range(5): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # test the trained model before resizing + test_text = "I am happy." + doc = nlp(test_text) + assert len(doc.cats) == 2 + pos_pred = doc.cats["POSITIVE"] + neg_pred = doc.cats["NEGATIVE"] + + # test the trained model again after resizing + textcat.add_label("NEUTRAL") + doc = nlp(test_text) + assert len(doc.cats) == 3 + assert doc.cats["POSITIVE"] == pos_pred + assert doc.cats["NEGATIVE"] == neg_pred + assert doc.cats["NEUTRAL"] <= 1 + + for i in range(5): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # test the trained model again after training further with new label + doc = nlp(test_text) + assert len(doc.cats) == 3 + assert doc.cats["POSITIVE"] != pos_pred + assert doc.cats["NEGATIVE"] != neg_pred + for cat in doc.cats: + assert doc.cats[cat] <= 1 + + def test_error_with_multi_labels(): nlp = Language() nlp.add_pipe("textcat") @@ -286,14 +402,14 @@ def test_overfitting_IO_multi(): @pytest.mark.parametrize( "name,train_data,textcat_config", [ - ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), - ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), - ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), - ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), - ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": 
False}}), - ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), - ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), - ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ], ) # fmt: on @@ -401,7 +517,9 @@ def test_textcat_threshold(): macro_f = scores["cats_score"] assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 - scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"}) + scores = nlp.evaluate( + train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"} + ) pos_f = scores["cats_score"] assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 assert pos_f > macro_f diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index e3b71c502..eeea906bb 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -129,8 +129,14 @@ cfg_string = """ """ TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), + ( + "I like green eggs", + {"tags": ["N", "V", "J", "N"], "cats": {"preference": 1.0, "imperative": 0.0}}, + ), + ( + "Eat blue ham", + {"tags": ["V", "J", "N"], "cats": {"preference": 0.0, "imperative": 1.0}}, + ), ] @@ -218,6 +224,13 @@ def test_replace_listeners(): nlp.replace_listeners("tok2vec", "tagger", ["model.yolo"]) with pytest.raises(ValueError): nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec", "model.yolo"]) + # attempt training with the new pipeline + optimizer = nlp.initialize(lambda: examples) + for i in range(2): + losses = {} + nlp.update(examples, sgd=optimizer, losses=losses) + assert losses["tok2vec"] == 0.0 + assert losses["tagger"] > 0.0 cfg_string_multi = """ @@ -311,3 
+324,92 @@ def test_replace_listeners_from_config(): new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"] == "spacy.Tok2VecListener.v1" ) + + +cfg_string_multi_textcat = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","textcat_multilabel","tagger"] + + [components] + + [components.textcat_multilabel] + factory = "textcat_multilabel" + + [components.textcat_multilabel.model] + @architectures = "spacy.TextCatEnsemble.v2" + nO = null + + [components.textcat_multilabel.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.textcat_multilabel.model.linear_model] + @architectures = "spacy.TextCatBOW.v1" + exclusive_classes = false + ngram_size = 1 + no_output_layer = false + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v1" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v1" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +def test_tok2vec_listeners_textcat(): + orig_config = Config().from_str(cfg_string_multi_textcat) + nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.pipe_names == ["tok2vec", "textcat_multilabel", "tagger"] + tagger = nlp.get_pipe("tagger") + textcat = nlp.get_pipe("textcat_multilabel") + tok2vec = nlp.get_pipe("tok2vec") + tagger_tok2vec = tagger.model.get_ref("tok2vec") + textcat_tok2vec = textcat.model.get_ref("tok2vec") + assert isinstance(tok2vec, Tok2Vec) + assert isinstance(tagger_tok2vec, Tok2VecListener) + assert isinstance(textcat_tok2vec, Tok2VecListener) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + optimizer = nlp.initialize(lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + docs = list(nlp.pipe(["Eat blue ham", "I like green eggs"])) + cats0 = docs[0].cats + assert cats0["preference"] < 0.1 + assert cats0["imperative"] > 0.9 + cats1 = docs[1].cats + assert cats1["preference"] > 0.1 + assert cats1["imperative"] < 0.9 + assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] + assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 362ba67ae..e123d2df9 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -190,14 +190,9 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(doc.vocab, model, **config) + ner = EntityRecognizer(doc.vocab, model) # Add 
the OUT action. I wouldn't have thought this would be necessary... ner.moves.add_action(5, "") ner.add_label("GPE") diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 0505571c2..71c3768dd 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -197,7 +197,7 @@ def test_issue3555(en_vocab): def test_issue3611(): - """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ + """Test whether adding n-grams in the textcat works even when n > token length of some docs""" unique_classes = ["offensive", "inoffensive"] x_train = [ "This is an offensive text", @@ -259,8 +259,6 @@ def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" config = { "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, } model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] parser = DependencyParser(Vocab(), model, **config) @@ -274,8 +272,6 @@ def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" config = { "learn_tokens": True, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, } model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] parser = DependencyParser(Vocab(), model, **config) @@ -286,7 +282,7 @@ def test_issue3830_with_subtok(): def test_issue3839(en_vocab): - """Test that match IDs returned by the matcher are correct, are in the string """ + """Test that match IDs returned by the matcher are correct, are in the string""" doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) matcher = Matcher(en_vocab) match_id = "PATTERN" @@ -370,7 +366,7 @@ def test_issue3951(en_vocab): def test_issue3959(): - """ Ensure that a modified pos attribute is serialized correctly.""" + """Ensure that a modified pos attribute is serialized correctly.""" nlp = English() doc = nlp( "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index a4c15dac2..4410e6236 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -38,7 +38,7 @@ def test_issue4002(en_vocab): def test_issue4030(): - """ Test whether textcat works fine with empty doc """ + """Test whether textcat works fine with empty doc""" unique_classes = ["offensive", "inoffensive"] x_train = [ "This is an offensive text", @@ -237,7 +237,7 @@ def test_issue4190(): def test_issue4267(): - """ Test that running an entity_ruler after ner gives consistent results""" + """Test that running an entity_ruler after ner gives consistent results""" nlp = English() ner = nlp.add_pipe("ner") ner.add_label("PEOPLE") @@ -288,7 +288,7 @@ def test_multiple_predictions(): def test_issue4313(): - """ This should not crash or exit with some strange error code """ + """This should not crash or exit with some strange error code""" beam_width = 16 beam_density = 0.0001 nlp = English() diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index f5fcb53fd..effd67306 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -152,7 +152,7 @@ def test_issue4707(): def test_issue4725_1(): - """ Ensure the pickling of the NER goes well""" + 
"""Ensure the pickling of the NER goes well""" vocab = Vocab(vectors_name="test_vocab_add_vector") nlp = English(vocab=vocab) config = { diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py index 0575c8270..bc9bcb982 100644 --- a/spacy/tests/regression/test_issue5001-5500.py +++ b/spacy/tests/regression/test_issue5001-5500.py @@ -69,9 +69,12 @@ def test_issue5082(): def test_issue5137(): - @Language.factory("my_component") + factory_name = "test_issue5137" + pipe_name = "my_component" + + @Language.factory(factory_name) class MyComponent: - def __init__(self, nlp, name="my_component", categories="all_categories"): + def __init__(self, nlp, name=pipe_name, categories="all_categories"): self.nlp = nlp self.categories = categories self.name = name @@ -86,17 +89,17 @@ def test_issue5137(): pass nlp = English() - my_component = nlp.add_pipe("my_component") + my_component = nlp.add_pipe(factory_name, name=pipe_name) assert my_component.categories == "all_categories" with make_tempdir() as tmpdir: nlp.to_disk(tmpdir) - overrides = {"components": {"my_component": {"categories": "my_categories"}}} + overrides = {"components": {pipe_name: {"categories": "my_categories"}}} nlp2 = spacy.load(tmpdir, config=overrides) - assert nlp2.get_pipe("my_component").categories == "my_categories" + assert nlp2.get_pipe(pipe_name).categories == "my_categories" def test_issue5141(en_vocab): - """ Ensure an empty DocBin does not crash on serialization """ + """Ensure an empty DocBin does not crash on serialization""" doc_bin = DocBin(attrs=["DEP", "HEAD"]) assert list(doc_bin.get_docs(en_vocab)) == [] doc_bin_bytes = doc_bin.to_bytes() diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py index 3007f1dc6..f57e4085c 100644 --- a/spacy/tests/regression/test_issue6501-7000.py +++ b/spacy/tests/regression/test_issue6501-7000.py @@ -152,7 +152,8 @@ labels = ['label1', 'label2'] @pytest.mark.parametrize( - "component_name", ["textcat", "textcat_multilabel"], + "component_name", + ["textcat", "textcat_multilabel"], ) def test_issue6908(component_name): """Test intializing textcat with labels in a list""" diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py new file mode 100644 index 000000000..5bb7cc08e --- /dev/null +++ b/spacy/tests/regression/test_issue7001-8000.py @@ -0,0 +1,281 @@ +from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type +from spacy.lang.en import English +from spacy.training import Example +from spacy.tokens.doc import Doc +from spacy.vocab import Vocab +from spacy.kb import KnowledgeBase +from spacy.pipeline._parser_internals.arc_eager import ArcEager +from spacy.util import load_config_from_str, load_config +from spacy.cli.init_config import fill_config +from thinc.api import Config +from wasabi import msg + +from ..util import make_tempdir + + +def test_issue7019(): + scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} + print_textcats_auc_per_cat(msg, scores) + scores = { + "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, + "LABEL_B": {"p": None, "r": None, "f": None}, + } + print_prf_per_type(msg, scores, name="foo", type="bar") + + +CONFIG_7029 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] 
+@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +def test_issue7029(): + """Test that an empty document doesn't mess up an entire batch.""" + TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), + ] + nlp = English.from_config(load_config_from_str(CONFIG_7029)) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] + docs1 = list(nlp.pipe(texts, batch_size=1)) + docs2 = list(nlp.pipe(texts, batch_size=4)) + assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] + + +def test_issue7055(): + """Test that fill-config doesn't turn sourced components into factories.""" + source_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, + "components": { + "tok2vec": {"factory": "tok2vec"}, + "tagger": {"factory": "tagger"}, + }, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + base_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, + "components": { + "tok2vec": {"source": str(source_path)}, + "tagger": {"source": str(source_path)}, + "ner": {"factory": "ner"}, + }, + } + base_cfg = Config(base_cfg) + base_path = dir_path / "base.cfg" + base_cfg.to_disk(base_path) + output_path = dir_path / "config.cfg" + fill_config(output_path, base_path, silent=True) + filled_cfg = load_config(output_path) + assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) + assert filled_cfg["components"]["tagger"]["source"] == str(source_path) + assert filled_cfg["components"]["ner"]["factory"] == "ner" + assert "model" in filled_cfg["components"]["ner"] + + +def test_issue7056(): + """Test that the Unshift transition works properly, and doesn't cause + sentence segmentation errors.""" + vocab = Vocab() + ae = ArcEager( + vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) + ) + doc = Doc(vocab, words="Severe pain , after trauma".split()) + state = ae.init_batch([doc])[0] + ae.apply_transition(state, "S") + ae.apply_transition(state, "L-amod") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "R-pobj") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + assert not state.eol() + + +def test_partial_links(): + # Test that having some entities on the doc without gold links, doesn't crash + TRAIN_DATA = [ + ( + "Russ Cochran his reprints include EC Comics.", + { + "links": {(0, 12): {"Q2146908": 1.0}}, + 
"entities": [(0, 12, "PERSON")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) + return mykb + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, + {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) + assert "PERSON" in results["ents_per_type"] + assert "PERSON" in results["nel_f_per_type"] + assert "ORG" in results["ents_per_type"] + assert "ORG" not in results["nel_f_per_type"] + + +def test_issue7065(): + text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." + nlp = English() + nlp.add_pipe("sentencizer") + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + { + "label": "THING", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + } + ] + ruler.add_patterns(patterns) + + doc = nlp(text) + sentences = [s for s in doc.sents] + assert len(sentences) == 2 + sent0 = sentences[0] + ent = doc.ents[0] + assert ent.start < sent0.end < ent.end + assert sentences.index(ent.sent) == 0 + + +def test_issue7065_b(): + # Test that the NEL doesn't crash when an entity crosses a sentence boundary + nlp = English() + vector_length = 3 + nlp.add_pipe("sentencizer") + text = "Mahler 's Symphony No. 8 was beautiful." + entities = [(0, 6, "PERSON"), (10, 24, "WORK")] + links = { + (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, + (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, + } + sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] + doc = nlp(text) + example = Example.from_dict( + doc, {"entities": entities, "links": links, "sent_starts": sent_starts} + ) + train_examples = [example] + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="No. 
8", + entities=["Q270853"], + probabilities=[1.0], + ) + mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias( + alias="Mahler", + entities=["Q7304"], + probabilities=[1.0], + ) + return mykb + + # Create the Entity Linker component and add it to the pipeline + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + # train the NEL pipe + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # Add a custom rule-based component to mimick NER + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, + { + "label": "WORK", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + }, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + # test the trained model - this should not throw E148 + doc = nlp(text) + assert doc diff --git a/spacy/tests/regression/test_issue7019.py b/spacy/tests/regression/test_issue7019.py deleted file mode 100644 index 53958b594..000000000 --- a/spacy/tests/regression/test_issue7019.py +++ /dev/null @@ -1,12 +0,0 @@ -from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type -from wasabi import msg - - -def test_issue7019(): - scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} - print_textcats_auc_per_cat(msg, scores) - scores = { - "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, - "LABEL_B": {"p": None, "r": None, "f": None}, - } - print_prf_per_type(msg, scores, name="foo", type="bar") diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py deleted file mode 100644 index 8435b32e1..000000000 --- a/spacy/tests/regression/test_issue7029.py +++ /dev/null @@ -1,66 +0,0 @@ -from spacy.lang.en import English -from spacy.training import Example -from spacy.util import load_config_from_str - - -CONFIG = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), -] - - -def test_issue7029(): - """Test that an empty document doesn't mess up an entire batch.""" - nlp = English.from_config(load_config_from_str(CONFIG)) - train_examples = [] - for t in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(50): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] - docs1 = list(nlp.pipe(texts, batch_size=1)) - 
docs2 = list(nlp.pipe(texts, batch_size=4)) - assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] diff --git a/spacy/tests/regression/test_issue7055.py b/spacy/tests/regression/test_issue7055.py deleted file mode 100644 index c7ddb0a75..000000000 --- a/spacy/tests/regression/test_issue7055.py +++ /dev/null @@ -1,40 +0,0 @@ -from spacy.cli.init_config import fill_config -from spacy.util import load_config -from spacy.lang.en import English -from thinc.api import Config - -from ..util import make_tempdir - - -def test_issue7055(): - """Test that fill-config doesn't turn sourced components into factories.""" - source_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, - "components": { - "tok2vec": {"factory": "tok2vec"}, - "tagger": {"factory": "tagger"}, - }, - } - source_nlp = English.from_config(source_cfg) - with make_tempdir() as dir_path: - # We need to create a loadable source pipeline - source_path = dir_path / "test_model" - source_nlp.to_disk(source_path) - base_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, - "components": { - "tok2vec": {"source": str(source_path)}, - "tagger": {"source": str(source_path)}, - "ner": {"factory": "ner"}, - }, - } - base_cfg = Config(base_cfg) - base_path = dir_path / "base.cfg" - base_cfg.to_disk(base_path) - output_path = dir_path / "config.cfg" - fill_config(output_path, base_path, silent=True) - filled_cfg = load_config(output_path) - assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) - assert filled_cfg["components"]["tagger"]["source"] == str(source_path) - assert filled_cfg["components"]["ner"]["factory"] == "ner" - assert "model" in filled_cfg["components"]["ner"] diff --git a/spacy/tests/regression/test_issue7056.py b/spacy/tests/regression/test_issue7056.py deleted file mode 100644 index 541144877..000000000 --- a/spacy/tests/regression/test_issue7056.py +++ /dev/null @@ -1,25 +0,0 @@ -from spacy.tokens.doc import Doc -from spacy.vocab import Vocab -from spacy.pipeline._parser_internals.arc_eager import ArcEager - - -def test_issue7056(): - """Test that the Unshift transition works properly, and doesn't cause - sentence segmentation errors.""" - vocab = Vocab() - ae = ArcEager( - vocab.strings, - ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) - ) - doc = Doc(vocab, words="Severe pain , after trauma".split()) - state = ae.init_batch([doc])[0] - ae.apply_transition(state, "S") - ae.apply_transition(state, "L-amod") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "R-pobj") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - assert not state.eol() diff --git a/spacy/tests/regression/test_issue7062.py b/spacy/tests/regression/test_issue7062.py deleted file mode 100644 index 88e5d2520..000000000 --- a/spacy/tests/regression/test_issue7062.py +++ /dev/null @@ -1,54 +0,0 @@ -from spacy.kb import KnowledgeBase -from spacy.training import Example -from spacy.lang.en import English - - -# fmt: off -TRAIN_DATA = [ - ("Russ Cochran his reprints include EC Comics.", - {"links": {(0, 12): {"Q2146908": 1.0}}, - "entities": [(0, 12, "PERSON")], - "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}) -] -# fmt: on - - -def test_partial_links(): - # Test that having some entities on the doc without gold links, doesn't crash - nlp = English() - vector_length = 3 - train_examples = [] - for text, annotation in TRAIN_DATA: - 
doc = nlp(text) - train_examples.append(Example.from_dict(doc, annotation)) - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) - return mykb - - # Create and train the Entity Linker - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # adding additional components that are required for the entity_linker - nlp.add_pipe("sentencizer", first=True) - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, - {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]} - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - - # this will run the pipeline on the examples and shouldn't crash - results = nlp.evaluate(train_examples) - assert "PERSON" in results["ents_per_type"] - assert "PERSON" in results["nel_f_per_type"] - assert "ORG" in results["ents_per_type"] - assert "ORG" not in results["nel_f_per_type"] diff --git a/spacy/tests/regression/test_issue7065.py b/spacy/tests/regression/test_issue7065.py deleted file mode 100644 index 63d36552a..000000000 --- a/spacy/tests/regression/test_issue7065.py +++ /dev/null @@ -1,75 +0,0 @@ -from spacy.kb import KnowledgeBase -from spacy.lang.en import English -from spacy.training import Example - - -def test_issue7065(): - text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." - nlp = English() - nlp.add_pipe("sentencizer") - ruler = nlp.add_pipe("entity_ruler") - patterns = [{"label": "THING", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}] - ruler.add_patterns(patterns) - - doc = nlp(text) - sentences = [s for s in doc.sents] - assert len(sentences) == 2 - sent0 = sentences[0] - ent = doc.ents[0] - assert ent.start < sent0.end < ent.end - assert sentences.index(ent.sent) == 0 - - -def test_issue7065_b(): - # Test that the NEL doesn't crash when an entity crosses a sentence boundary - nlp = English() - vector_length = 3 - nlp.add_pipe("sentencizer") - - text = "Mahler 's Symphony No. 8 was beautiful." - entities = [(0, 6, "PERSON"), (10, 24, "WORK")] - links = {(0, 6): {"Q7304": 1.0, "Q270853": 0.0}, - (10, 24): {"Q7304": 0.0, "Q270853": 1.0}} - sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] - doc = nlp(text) - example = Example.from_dict(doc, {"entities": entities, "links": links, "sent_starts": sent_starts}) - train_examples = [example] - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias( - alias="No. 
8", - entities=["Q270853"], - probabilities=[1.0], - ) - mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias( - alias="Mahler", - entities=["Q7304"], - probabilities=[1.0], - ) - return mykb - - # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - - # train the NEL pipe - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # Add a custom rule-based component to mimick NER - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, - {"label": "WORK", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]} - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - - # test the trained model - this should not throw E148 - doc = nlp(text) - assert doc diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py new file mode 100644 index 000000000..fbddf643c --- /dev/null +++ b/spacy/tests/regression/test_issue8168.py @@ -0,0 +1,22 @@ +from spacy.lang.en import English + + +def test_issue8168(): + nlp = English() + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + {"label": "ORG", "pattern": "Apple"}, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], + "id": "san-francisco", + }, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], + "id": "san-francisco", + }, + ] + ruler.add_patterns(patterns) + + assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")} diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py new file mode 100644 index 000000000..6ddbe53e0 --- /dev/null +++ b/spacy/tests/regression/test_issue8190.py @@ -0,0 +1,21 @@ +import spacy +from spacy.lang.en import English +from ..util import make_tempdir + + +def test_issue8190(): + """Test that config overrides are not lost after load is complete.""" + source_cfg = { + "nlp": { + "lang": "en", + }, + "custom": {"key": "value"}, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}}) + + assert nlp.config["custom"]["key"] == "updated_value" diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py index 528d4b6f9..00cd6da3b 100644 --- a/spacy/tests/regression/test_issue8216.py +++ b/spacy/tests/regression/test_issue8216.py @@ -2,7 +2,6 @@ import pytest from spacy import registry from spacy.language import Language -from spacy.pipeline import EntityRuler @pytest.fixture diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 2cd0e4ab6..102989705 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -4,7 +4,12 @@ import spacy from spacy.lang.en import English from spacy.lang.de import German from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH -from spacy.util import registry, load_model_from_config, load_config, load_config_from_str +from spacy.util import ( + registry, + load_model_from_config, + load_config, + load_config_from_str, +) from 
spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder from spacy.schemas import ConfigSchema, ConfigSchemaPretrain @@ -233,7 +238,7 @@ def test_create_nlp_from_config_multiple_instances(): def test_serialize_nlp(): - """ Create a custom nlp pipeline from config and ensure it serializes it correctly """ + """Create a custom nlp pipeline from config and ensure it serializes it correctly""" nlp_config = Config().from_str(nlp_config_string) nlp = load_model_from_config(nlp_config, auto_fill=True) nlp.get_pipe("tagger").add_label("A") @@ -253,7 +258,7 @@ def test_serialize_nlp(): def test_serialize_custom_nlp(): - """ Create a custom nlp pipeline and ensure it serializes it correctly""" + """Create a custom nlp pipeline and ensure it serializes it correctly""" nlp = English() parser_cfg = dict() parser_cfg["model"] = {"@architectures": "my_test_parser"} @@ -274,7 +279,7 @@ def test_serialize_custom_nlp(): "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper] ) def test_serialize_parser(parser_config_string): - """ Create a non-default parser config to check nlp serializes it correctly """ + """Create a non-default parser config to check nlp serializes it correctly""" nlp = English() model_config = Config().from_str(parser_config_string) parser = nlp.add_pipe("parser", config=model_config) @@ -493,4 +498,4 @@ def test_hyphen_in_config(): self.punctuation = punctuation nlp = English.from_config(load_config_from_str(hyphen_config_str)) - assert nlp.get_pipe("my_punctual_component").punctuation == ['?', '-'] + assert nlp.get_pipe("my_punctual_component").punctuation == ["?", "-"] diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 837c128af..e51c7f45b 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -64,13 +64,17 @@ def test_serialize_doc_span_groups(en_vocab): def test_serialize_doc_bin(): - doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True) + doc_bin = DocBin( + attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True + ) texts = ["Some text", "Lots of texts...", "..."] cats = {"A": 0.5} nlp = English() for doc in nlp.pipe(texts): doc.cats = cats doc.spans["start"] = [doc[0:2]] + doc[0].norm_ = "UNUSUAL_TOKEN_NORM" + doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" doc_bin.add(doc) bytes_data = doc_bin.to_bytes() @@ -82,6 +86,8 @@ def test_serialize_doc_bin(): assert doc.text == texts[i] assert doc.cats == cats assert len(doc.spans) == 1 + assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" + assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" def test_serialize_doc_bin_unknown_spaces(en_vocab): diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index fb04d31a3..1e0ae3c76 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -3,6 +3,7 @@ from typing import Callable from spacy import util from spacy.util import ensure_path, registry, load_model_from_config from spacy.kb import KnowledgeBase +from spacy.vocab import Vocab from thinc.api import Config from ..util import make_tempdir @@ -111,7 +112,7 @@ def test_serialize_subclassed_kb(): @registry.misc("spacy.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int - ) -> Callable[["Vocab"], KnowledgeBase]: + ) -> Callable[[Vocab], KnowledgeBase]: def custom_kb_factory(vocab): kb = 
SubKnowledgeBase( vocab=vocab, diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 48c7082bb..c8162a690 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -60,18 +60,10 @@ def taggers(en_vocab): @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - "beam_width": 1, - "beam_update_prob": 1.0, - "beam_density": 0.0, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = Parser(en_vocab, model, **config) - new_parser = Parser(en_vocab, model, **config) + parser = Parser(en_vocab, model) + new_parser = Parser(en_vocab, model) new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"])) bytes_2 = new_parser.to_bytes(exclude=["vocab"]) bytes_3 = parser.to_bytes(exclude=["vocab"]) @@ -84,43 +76,27 @@ def test_serialize_parser_strings(Parser): vocab1 = Vocab() label = "FunnyLabel" assert label not in vocab1.strings - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - "beam_width": 1, - "beam_update_prob": 1.0, - "beam_density": 0.0, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser1 = Parser(vocab1, model, **config) + parser1 = Parser(vocab1, model) parser1.add_label(label) assert label in parser1.vocab.strings vocab2 = Vocab() assert label not in vocab2.strings - parser2 = Parser(vocab2, model, **config) + parser2 = Parser(vocab2, model) parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"])) assert label in parser2.vocab.strings @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_disk(en_vocab, Parser): - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - "beam_width": 1, - "beam_update_prob": 1.0, - "beam_density": 0.0, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = Parser(en_vocab, model, **config) + parser = Parser(en_vocab, model) with make_tempdir() as d: file_path = d / "parser" parser.to_disk(file_path) - parser_d = Parser(en_vocab, model, **config) + parser_d = Parser(en_vocab, model) parser_d = parser_d.from_disk(file_path) parser_bytes = parser.to_bytes(exclude=["model", "vocab"]) parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"]) @@ -198,17 +174,12 @@ def test_serialize_textcat_empty(en_vocab): def test_serialize_pipe_exclude(en_vocab, Parser): cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - } def get_new_parser(): - new_parser = Parser(en_vocab, model, **config) + new_parser = Parser(en_vocab, model) return new_parser - parser = Parser(en_vocab, model, **config) + parser = Parser(en_vocab, model) parser.cfg["foo"] = "bar" new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"])) assert "foo" in new_parser.cfg diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py index c9e451471..26eabd4e5 100644 --- a/spacy/tests/test_architectures.py +++ b/spacy/tests/test_architectures.py @@ -5,7 +5,6 @@ from catalogue import RegistryError def test_get_architecture(): - 
@registry.architectures("my_test_function") def create_model(nr_in, nr_out): return Linear(nr_in, nr_out) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 2013ceac4..6f0fdcfa5 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -10,11 +10,16 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import string_to_list +from spacy import about +from spacy.util import get_minor_version +from spacy.cli.validate import get_model_pkgs +from spacy.cli.download import get_compatibility, get_version from thinc.api import ConfigValidationError, Config import srsly import os from .util import make_tempdir +from ..cli.init_pipeline import _init_labels def test_cli_info(): @@ -308,7 +313,8 @@ def test_project_config_validation2(config, n_errors): @pytest.mark.parametrize( - "int_value", [10, pytest.param("10", marks=pytest.mark.xfail)], + "int_value", + [10, pytest.param("10", marks=pytest.mark.xfail)], ) def test_project_config_interpolation(int_value): variables = {"a": int_value, "b": {"c": "foo", "d": True}} @@ -331,7 +337,8 @@ def test_project_config_interpolation(int_value): @pytest.mark.parametrize( - "greeting", [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)], + "greeting", + [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)], ) def test_project_config_interpolation_override(greeting): variables = {"a": "world"} @@ -423,11 +430,17 @@ def test_parse_cli_overrides(): @pytest.mark.parametrize("pretraining", [True, False]) def test_init_config(lang, pipeline, optimize, pretraining): # TODO: add more tests and also check for GPU with transformers - config = init_config(lang=lang, pipeline=pipeline, optimize=optimize, pretraining=pretraining, gpu=False) + config = init_config( + lang=lang, + pipeline=pipeline, + optimize=optimize, + pretraining=pretraining, + gpu=False, + ) assert isinstance(config, Config) if pretraining: config["paths"]["raw_text"] = "my_data.jsonl" - nlp = load_model_from_config(config, auto_fill=True) + load_model_from_config(config, auto_fill=True) def test_model_recommendations(): @@ -474,3 +487,48 @@ def test_string_to_list(value): def test_string_to_list_intify(value): assert string_to_list(value, intify=False) == ["1", "2", "3"] assert string_to_list(value, intify=True) == [1, 2, 3] + + +def test_download_compatibility(): + model_name = "en_core_web_sm" + compatibility = get_compatibility() + version = get_version(model_name, compatibility) + assert get_minor_version(about.__version__) == get_minor_version(version) + + +def test_validate_compatibility_table(): + model_pkgs, compat = get_model_pkgs() + spacy_version = get_minor_version(about.__version__) + current_compat = compat.get(spacy_version, {}) + assert len(current_compat) > 0 + assert "en_core_web_sm" in current_compat + + +@pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"]) +def test_init_labels(component_name): + nlp = Dutch() + component = nlp.add_pipe(component_name) + for label in ["T1", "T2", "T3", "T4"]: + component.add_label(label) + assert len(nlp.get_pipe(component_name).labels) == 4 + + with make_tempdir() as tmp_dir: + _init_labels(nlp, tmp_dir) + + config = init_config( + lang="nl", + pipeline=[component_name], + optimize="efficiency", + gpu=False, + ) + 
config["initialize"]["components"][component_name] = { + "labels": { + "@readers": "spacy.read_labels.v1", + "path": f"{tmp_dir}/{component_name}.json", + } + } + + nlp2 = load_model_from_config(config, auto_fill=True) + assert len(nlp2.get_pipe(component_name).labels) == 0 + nlp2.initialize() + assert len(nlp2.get_pipe(component_name).labels) == 4 diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 7fb03da0c..c911b8d81 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -15,6 +15,29 @@ from thinc.api import NumpyOps, get_current_ops from .util import add_vecs_to_vocab, assert_docs_equal +def evil_component(doc): + if "2" in doc.text: + raise ValueError("no dice") + return doc + + +def perhaps_set_sentences(doc): + if not doc.text.startswith("4"): + doc[-1].is_sent_start = True + return doc + + +def assert_sents_error(doc): + if not doc.has_annotation("SENT_START"): + raise ValueError("no sents") + return doc + + +def warn_error(proc_name, proc, docs, e): + logger = logging.getLogger("spacy") + logger.warning(f"Trouble with component {proc_name}.") + + @pytest.fixture def nlp(): nlp = Language(Vocab()) @@ -93,19 +116,16 @@ def test_evaluate_no_pipe(nlp): nlp.evaluate([Example.from_dict(doc, annots)]) -@Language.component("test_language_vector_modification_pipe") def vector_modification_pipe(doc): doc.vector += 1 return doc -@Language.component("test_language_userdata_pipe") def userdata_pipe(doc): doc.user_data["foo"] = "bar" return doc -@Language.component("test_language_ner_pipe") def ner_pipe(doc): span = Span(doc, 0, 1, label="FIRST") doc.ents += (span,) @@ -123,6 +143,11 @@ def sample_vectors(): @pytest.fixture def nlp2(nlp, sample_vectors): + Language.component( + "test_language_vector_modification_pipe", func=vector_modification_pipe + ) + Language.component("test_language_userdata_pipe", func=userdata_pipe) + Language.component("test_language_ner_pipe", func=ner_pipe) add_vecs_to_vocab(nlp.vocab, sample_vectors) nlp.add_pipe("test_language_vector_modification_pipe") nlp.add_pipe("test_language_ner_pipe") @@ -168,82 +193,115 @@ def test_language_pipe_stream(nlp2, n_process, texts): assert_docs_equal(doc, expected_doc) -def test_language_pipe_error_handler(): +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler(n_process): """Test that the error handling of nlp.pipe works well""" - nlp = English() - nlp.add_pipe("merge_subtokens") - nlp.initialize() - texts = ["Curious to see what will happen to this text.", "And this one."] - # the pipeline fails because there's no parser - with pytest.raises(ValueError): + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.add_pipe("merge_subtokens") + nlp.initialize() + texts = ["Curious to see what will happen to this text.", "And this one."] + # the pipeline fails because there's no parser + with pytest.raises(ValueError): + nlp(texts[0]) + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + nlp.set_error_handler(raise_error) + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + # set explicitely to ignoring + nlp.set_error_handler(ignore_error) + docs = list(nlp.pipe(texts, n_process=n_process)) + assert len(docs) == 0 nlp(texts[0]) - with pytest.raises(ValueError): - list(nlp.pipe(texts)) - nlp.set_error_handler(raise_error) - with pytest.raises(ValueError): - list(nlp.pipe(texts)) - # set explicitely to ignoring - nlp.set_error_handler(ignore_error) - docs = 
list(nlp.pipe(texts)) - assert len(docs) == 0 - nlp(texts[0]) -def test_language_pipe_error_handler_custom(en_vocab): +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_custom(en_vocab, n_process): """Test the error handling of a custom component that has no pipe method""" + Language.component("my_evil_component", func=evil_component) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.add_pipe("my_evil_component") + texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"] + with pytest.raises(ValueError): + # the evil custom component throws an error + list(nlp.pipe(texts)) - @Language.component("my_evil_component") - def evil_component(doc): - if "2" in doc.text: - raise ValueError("no dice") - return doc - - def warn_error(proc_name, proc, docs, e): - from spacy.util import logger - - logger.warning(f"Trouble with component {proc_name}.") - - nlp = English() - nlp.add_pipe("my_evil_component") - nlp.initialize() - texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"] - with pytest.raises(ValueError): - # the evil custom component throws an error - list(nlp.pipe(texts)) - - nlp.set_error_handler(warn_error) - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: - # the errors by the evil custom component raise a warning for each bad batch - docs = list(nlp.pipe(texts)) - mock_warning.assert_called() - assert mock_warning.call_count == 2 - assert len(docs) + mock_warning.call_count == len(texts) - assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] + nlp.set_error_handler(warn_error) + logger = logging.getLogger("spacy") + with mock.patch.object(logger, "warning") as mock_warning: + # the errors by the evil custom component raise a warning for each + # bad doc + docs = list(nlp.pipe(texts, n_process=n_process)) + # HACK/TODO? the warnings in child processes don't seem to be + # detected by the mock logger + if n_process == 1: + mock_warning.assert_called() + assert mock_warning.call_count == 2 + assert len(docs) + mock_warning.call_count == len(texts) + assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] -def test_language_pipe_error_handler_pipe(en_vocab): +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_pipe(en_vocab, n_process): """Test the error handling of a component's pipe method""" + Language.component("my_perhaps_sentences", func=perhaps_set_sentences) + Language.component("assert_sents_error", func=assert_sents_error) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + texts = [f"{str(i)} is enough. Done" for i in range(100)] + nlp = English() + nlp.add_pipe("my_perhaps_sentences") + nlp.add_pipe("assert_sents_error") + nlp.initialize() + with pytest.raises(ValueError): + # assert_sents_error requires sentence boundaries, will throw an error otherwise + docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10)) + nlp.set_error_handler(ignore_error) + docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10)) + # we lose/ignore the failing 4,40-49 docs + assert len(docs) == 89 - @Language.component("my_sentences") - def perhaps_set_sentences(doc): - if not doc.text.startswith("4"): - doc[-1].is_sent_start = True - return doc - texts = [f"{str(i)} is enough. 
Done" for i in range(100)] - nlp = English() - nlp.add_pipe("my_sentences") - entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 3}) - entity_linker.kb.add_entity(entity="Q1", freq=12, entity_vector=[1, 2, 3]) - nlp.initialize() - with pytest.raises(ValueError): - # the entity linker requires sentence boundaries, will throw an error otherwise - docs = list(nlp.pipe(texts, batch_size=10)) - nlp.set_error_handler(ignore_error) - docs = list(nlp.pipe(texts, batch_size=10)) - # we lose/ignore the failing 0-9 and 40-49 batches - assert len(docs) == 80 +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_make_doc_actual(n_process): + """Test the error handling for make_doc""" + # TODO: fix so that the following test is the actual behavior + + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.max_length = 10 + texts = ["12345678901234567890", "12345"] * 10 + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + nlp.default_error_handler = ignore_error + if n_process == 1: + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + else: + docs = list(nlp.pipe(texts, n_process=n_process)) + assert len(docs) == 0 + + +@pytest.mark.xfail +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_make_doc_preferred(n_process): + """Test the error handling for make_doc""" + + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.max_length = 10 + texts = ["12345678901234567890", "12345"] * 10 + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + nlp.default_error_handler = ignore_error + docs = list(nlp.pipe(texts, n_process=n_process)) + assert len(docs) == 0 def test_language_from_config_before_after_init(): @@ -363,6 +421,37 @@ def test_language_from_config_before_after_init_invalid(): English.from_config(config) +def test_language_whitespace_tokenizer(): + """Test the custom whitespace tokenizer from the docs.""" + + class WhitespaceTokenizer: + def __init__(self, vocab): + self.vocab = vocab + + def __call__(self, text): + words = text.split(" ") + spaces = [True] * len(words) + # Avoid zero-length tokens + for i, word in enumerate(words): + if word == "": + words[i] = " " + spaces[i] = False + # Remove the final trailing space + if words[-1] == " ": + words = words[0:-1] + spaces = spaces[0:-1] + else: + spaces[-1] = False + + return Doc(self.vocab, words=words, spaces=spaces) + + nlp = spacy.blank("en") + nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) + text = " What's happened to me? he thought. It wasn't a dream. 
" + doc = nlp(text) + assert doc.text == text + + def test_language_custom_tokenizer(): """Test that a fully custom tokenizer can be plugged in via the registry.""" name = "test_language_custom_tokenizer" @@ -419,3 +508,23 @@ def test_language_init_invalid_vocab(value): with pytest.raises(ValueError) as e: Language(value) assert err_fragment in str(e.value) + + +def test_language_source_and_vectors(nlp2): + nlp = Language(Vocab()) + textcat = nlp.add_pipe("textcat") + for label in ("POSITIVE", "NEGATIVE"): + textcat.add_label(label) + nlp.initialize() + long_string = "thisisalongstring" + assert long_string not in nlp.vocab.strings + assert long_string not in nlp2.vocab.strings + nlp.vocab.strings.add(long_string) + assert nlp.vocab.vectors.to_bytes() != nlp2.vocab.vectors.to_bytes() + vectors_bytes = nlp.vocab.vectors.to_bytes() + with pytest.warns(UserWarning): + nlp2.add_pipe("textcat", name="textcat2", source=nlp) + # strings should be added + assert long_string in nlp2.vocab.strings + # vectors should remain unmodified + assert nlp.vocab.vectors.to_bytes() == vectors_bytes diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 0d09999a9..45cbdf45b 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -8,7 +8,8 @@ from spacy import prefer_gpu, require_gpu, require_cpu from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding from spacy.util import dot_to_object, SimpleFrozenList, import_file -from thinc.api import Config, Optimizer, ConfigValidationError, get_current_ops +from spacy.util import to_ternary_int +from thinc.api import Config, Optimizer, ConfigValidationError from thinc.api import set_current_ops from spacy.training.batchers import minibatch_by_words from spacy.lang.en import English @@ -274,7 +275,7 @@ def test_util_minibatch(doc_sizes, expected_batches): ], ) def test_util_minibatch_oversize(doc_sizes, expected_batches): - """ Test that oversized documents are returned in their own batch""" + """Test that oversized documents are returned in their own batch""" docs = [get_random_doc(doc_size) for doc_size in doc_sizes] tol = 0.2 batch_size = 1000 @@ -296,7 +297,7 @@ def test_util_dot_section(): factory = "textcat" [components.textcat.model] - @architectures = "spacy.TextCatBOW.v1" + @architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -386,3 +387,18 @@ def make_dummy_component( nlp = English.from_config(config) nlp.add_pipe("dummy_component") nlp.initialize() + + +def test_to_ternary_int(): + assert to_ternary_int(True) == 1 + assert to_ternary_int(None) == 0 + assert to_ternary_int(False) == -1 + assert to_ternary_int(1) == 1 + assert to_ternary_int(1.0) == 1 + assert to_ternary_int(0) == 0 + assert to_ternary_int(0.0) == 0 + assert to_ternary_int(-1) == -1 + assert to_ternary_int(5) == -1 + assert to_ternary_int(-10) == -1 + assert to_ternary_int("string") == -1 + assert to_ternary_int([0, "string"]) == -1 diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 45cee13ea..47540198a 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -1,11 +1,14 @@ from typing import List import pytest from thinc.api import fix_random_seed, Adam, set_dropout_rate +from thinc.api import Ragged, reduce_mean, Logistic, chain, Relu from numpy.testing import assert_array_equal, assert_array_almost_equal import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, 
MaxoutWindowEncoder from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier +from spacy.ml.models import build_spancat_model from spacy.ml.staticvectors import StaticVectors +from spacy.ml.extract_spans import extract_spans, _get_span_indices from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES @@ -205,3 +208,63 @@ def test_empty_docs(model_func, kwargs): # Test backprop output, backprop = model.begin_update(docs) backprop(output) + + +def test_init_extract_spans(): + extract_spans().initialize() + + +def test_extract_spans_span_indices(): + model = extract_spans().initialize() + spans = Ragged( + model.ops.asarray([[0, 3], [2, 3], [5, 7]], dtype="i"), + model.ops.asarray([2, 1], dtype="i"), + ) + x_lengths = model.ops.asarray([5, 10], dtype="i") + indices = _get_span_indices(model.ops, spans, x_lengths) + assert list(indices) == [0, 1, 2, 2, 10, 11] + + +def test_extract_spans_forward_backward(): + model = extract_spans().initialize() + X = Ragged(model.ops.alloc2f(15, 4), model.ops.asarray([5, 10], dtype="i")) + spans = Ragged( + model.ops.asarray([[0, 3], [2, 3], [5, 7]], dtype="i"), + model.ops.asarray([2, 1], dtype="i"), + ) + Y, backprop = model.begin_update((X, spans)) + assert list(Y.lengths) == [3, 1, 2] + assert Y.dataXd.shape == (6, 4) + dX, spans2 = backprop(Y) + assert spans2 is spans + assert dX.dataXd.shape == X.dataXd.shape + assert list(dX.lengths) == list(X.lengths) + + +def test_spancat_model_init(): + model = build_spancat_model( + build_Tok2Vec_model(**get_tok2vec_kwargs()), reduce_mean(), Logistic() + ) + model.initialize() + + +def test_spancat_model_forward_backward(nO=5): + tok2vec = build_Tok2Vec_model(**get_tok2vec_kwargs()) + docs = get_docs() + spans_list = [] + lengths = [] + for doc in docs: + spans_list.append(doc[:2]) + spans_list.append(doc[1:4]) + lengths.append(2) + spans = Ragged( + tok2vec.ops.asarray([[s.start, s.end] for s in spans_list], dtype="i"), + tok2vec.ops.asarray(lengths, dtype="i"), + ) + model = build_spancat_model( + tok2vec, reduce_mean(), chain(Relu(nO=nO), Logistic()) + ).initialize(X=(docs, spans)) + + Y, backprop = model((docs, spans), is_train=True) + assert Y.shape == (spans.dataXd.shape[0], nO) + backprop(Y) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index c044d8afe..16cc97f6d 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -444,7 +444,9 @@ def test_score_spans(): assert f"{key}_per_type" in scores # Discard labels from the evaluation - scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False) + scores = Scorer.score_spans( + [eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False + ) assert scores[f"{key}_p"] == 1.0 assert scores[f"{key}_r"] == 1.0 assert f"{key}_per_type" not in scores @@ -467,4 +469,6 @@ def test_prf_score(): assert (c.precision, c.recall, c.fscore) == approx((0.25, 0.5, 0.33333333)) a += b - assert (a.precision, a.recall, a.fscore) == approx((c.precision, c.recall, c.fscore)) \ No newline at end of file + assert (a.precision, a.recall, a.fscore) == approx( + (c.precision, c.recall, c.fscore) + ) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index c1ba1df36..7d0c16745 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -209,10 +209,6 @@ def test_tokenizer_flush_specials(en_vocab): suffix_search=suffix_re.search, rules=rules, ) - 
tokenizer2 = Tokenizer( - en_vocab, - suffix_search=suffix_re.search, - ) assert [t.text for t in tokenizer1("a a.")] == ["a a", "."] tokenizer1.rules = {} assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."] diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index ba58ea96d..4dd90f416 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -182,6 +182,27 @@ def test_Example_from_dict_with_entities(annots): assert example.reference[5].ent_type_ == "LOC" +def test_Example_from_dict_with_empty_entities(): + annots = { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [], + } + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + # entities as empty list sets everything to O + assert example.reference.has_annotation("ENT_IOB") + assert len(list(example.reference.ents)) == 0 + assert all(token.ent_iob_ == "O" for token in example.reference) + # various unset/missing entities leaves entities unset + annots["entities"] = None + example = Example.from_dict(predicted, annots) + assert not example.reference.has_annotation("ENT_IOB") + annots.pop("entities", None) + example = Example.from_dict(predicted, annots) + assert not example.reference.has_annotation("ENT_IOB") + + @pytest.mark.parametrize( "annots", [ diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index bd8810a5c..8ee54b544 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -278,7 +278,9 @@ def test_pretraining_training(): filled = filled.interpolate() P = filled["pretraining"] nlp_base = init_nlp(filled) - model_base = nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") + model_base = ( + nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") + ) embed_base = None for node in model_base.walk(): if node.name == "hashembed": @@ -331,11 +333,12 @@ def write_sample_training(tmp_dir): def write_vectors_model(tmp_dir): import numpy + vocab = Vocab() vector_data = { "dog": numpy.random.uniform(-1, 1, (300,)), "cat": numpy.random.uniform(-1, 1, (300,)), - "orange": numpy.random.uniform(-1, 1, (300,)) + "orange": numpy.random.uniform(-1, 1, (300,)), } for word, vector in vector_data.items(): vocab.set_vector(word, vector) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 321c08c1e..cd428be15 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -336,8 +336,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer): def test_gold_biluo_4791(en_vocab, en_tokenizer): - doc = en_tokenizer("I'll return the ₹54 amount") - gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"] + doc = en_tokenizer("I'll return the A54 amount") + gold_words = ["I", "'ll", "return", "the", "A", "54", "amount"] gold_spaces = [False, True, True, True, False, True, False] entities = [(16, 19, "MONEY")] example = Example.from_dict( @@ -434,8 +434,14 @@ def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer): gold_doc = nlp.make_doc(text) spans = [] prefix = "I flew to " - spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY")) - spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY")) + spans.append( + gold_doc.char_span(len(prefix), len(prefix + "San 
Francisco"), label="CITY") + ) + spans.append( + gold_doc.char_span( + len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY" + ) + ) spans_key = "overlap_ents" gold_doc.spans[spans_key] = spans example = Example(doc, gold_doc) @@ -443,7 +449,9 @@ def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer): assert [(ent.start, ent.end) for ent in spans_gold] == [(3, 5), (3, 6)] # Ensure that 'get_aligned_spans_y2x' has the aligned entities correct - spans_y2x_no_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=False) + spans_y2x_no_overlap = example.get_aligned_spans_y2x( + spans_gold, allow_overlap=False + ) assert [(ent.start, ent.end) for ent in spans_y2x_no_overlap] == [(3, 5)] spans_y2x_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=True) assert [(ent.start, ent.end) for ent in spans_y2x_overlap] == [(3, 5), (3, 6)] diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 37d48ad0f..8a7dd22c3 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -12,6 +12,7 @@ from ..util import add_vecs_to_vocab, get_cosine, make_tempdir OPS = get_current_ops() + @pytest.fixture def strings(): return ["apple", "orange"] diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index a687059be..56ef1d108 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -1,5 +1,5 @@ import pytest -from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA +from spacy.attrs import LEMMA, ORTH, IS_ALPHA from spacy.parts_of_speech import NOUN, VERB @@ -30,7 +30,6 @@ def test_vocab_api_shape_attr(en_vocab, text): ("VERB", VERB), ("LEMMA", LEMMA), ("ORTH", ORTH), - ("PROB", PROB), ], ) def test_vocab_api_symbols(en_vocab, string, symbol): diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index d5b4e4ff7..868eb3eab 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -103,10 +103,12 @@ class DocBin: self.strings.add(token.text) self.strings.add(token.tag_) self.strings.add(token.lemma_) + self.strings.add(token.norm_) self.strings.add(str(token.morph)) self.strings.add(token.dep_) self.strings.add(token.ent_type_) self.strings.add(token.ent_kb_id_) + self.strings.add(token.ent_id_) self.cats.append(doc.cats) self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.span_groups.append(doc.spans.to_bytes()) @@ -244,7 +246,10 @@ class DocBin: """ path = ensure_path(path) with path.open("wb") as file_: - file_.write(self.to_bytes()) + try: + file_.write(self.to_bytes()) + except ValueError: + raise ValueError(Errors.E870) def from_disk(self, path: Union[str, Path]) -> "DocBin": """Load the DocBin from a file (typically called .spacy). 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 28f8debf3..cd2bd6f6c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -275,6 +275,8 @@ cdef class Doc: deps = [dep if dep is not None else MISSING_DEP_ for dep in deps] if deps and not heads: heads = [0] * len(deps) + if heads and not deps: + raise ValueError(Errors.E1017) if sent_starts is not None: for i in range(len(sent_starts)): if sent_starts[i] is True: @@ -1139,6 +1141,10 @@ cdef class Doc: else: warnings.warn(Warnings.W102.format(key=key, value=value)) for key in doc.spans: + # if a spans key is in any doc, include it in the merged doc + # even if it is empty + if key not in concat_spans: + concat_spans[key] = [] for span in doc.spans[key]: concat_spans[key].append(( span.start_char + char_offset, @@ -1148,7 +1154,7 @@ cdef class Doc: span.text, # included as a check )) char_offset += len(doc.text) - if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space: + if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_): char_offset += 1 arrays = [doc.to_array(attrs) for doc in docs] @@ -1158,11 +1164,12 @@ cdef class Doc: for i, array in enumerate(arrays[:-1]): if len(array) > 0 and not docs[i][-1].is_space: array[-1][spacy_index] = 1 - token_offset = -1 - for doc in docs[:-1]: - token_offset += len(doc) - if not (len(doc) > 0 and doc[-1].is_space): - concat_spaces[token_offset] = True + if len(concat_spaces) > 0: + token_offset = -1 + for doc in docs[:-1]: + token_offset += len(doc) + if not (len(doc) > 0 and doc[-1].is_space): + concat_spaces[token_offset] = True concat_array = numpy.concatenate(arrays) @@ -1672,7 +1679,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): j_idx_in_sent = start + j - sent_start n_missing_tokens_in_sent = len(sent) - j_idx_in_sent # make sure we do not go past `end`, in cases where `end` < sent.end - max_range = min(j + n_missing_tokens_in_sent, end) + max_range = min(j + n_missing_tokens_in_sent, end - start) for k in range(j + 1, max_range): lca = _get_tokens_lca(token_j, doc[start + k]) # if lca is outside of span, we set it to -1 diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 614d8fda5..093b2a4da 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -228,7 +228,25 @@ cdef class Span: array = self.doc.to_array(array_head) array = array[self.start : self.end] self._fix_dep_copy(array_head, array) + # Fix initial IOB so the entities are valid for doc.ents below. + if len(array) > 0 and ENT_IOB in array_head: + ent_iob_col = array_head.index(ENT_IOB) + if array[0][ent_iob_col] == 1: + array[0][ent_iob_col] = 3 doc.from_array(array_head, array) + # Set partial entities at the beginning or end of the span to have + # missing entity annotation. Note: the initial partial entity could be + # detected from the IOB annotation but the final partial entity can't, + # so detect and remove both in the same way by checking self.ents. 
+ span_ents = {(ent.start, ent.end) for ent in self.ents} + doc_ents = doc.ents + if len(doc_ents) > 0: + # Remove initial partial ent + if (doc_ents[0].start + self.start, doc_ents[0].end + self.start) not in span_ents: + doc.set_ents([], missing=[doc_ents[0]], default="unmodified") + # Remove final partial ent + if (doc_ents[-1].start + self.start, doc_ents[-1].end + self.start) not in span_ents: + doc.set_ents([], missing=[doc_ents[-1]], default="unmodified") doc.noun_chunks_iterator = self.doc.noun_chunks_iterator doc.user_hooks = self.doc.user_hooks doc.user_span_hooks = self.doc.user_span_hooks @@ -722,7 +740,7 @@ cdef class Span: def __get__(self): return self.root.ent_id_ - def __set__(self, hash_t key): + def __set__(self, unicode key): raise NotImplementedError(Errors.E200.format(attr="ent_id_")) @property @@ -736,7 +754,7 @@ cdef class Span: @property def lemma_(self): """RETURNS (str): The span's lemma.""" - return " ".join([t.lemma_ for t in self]).strip() + return "".join([t.lemma_ + t.whitespace_ for t in self]).strip() property label_: """RETURNS (str): The span's label.""" @@ -744,9 +762,7 @@ cdef class Span: return self.doc.vocab.strings[self.label] def __set__(self, unicode label_): - if not label_: - label_ = '' - raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_)) + self.label = self.doc.vocab.strings.add(label_) property kb_id_: """RETURNS (str): The named entity's KB ID.""" @@ -754,13 +770,7 @@ cdef class Span: return self.doc.vocab.strings[self.kb_id] def __set__(self, unicode kb_id_): - if not kb_id_: - kb_id_ = '' - current_label = self.label_ - if not current_label: - current_label = '' - raise NotImplementedError(Errors.E131.format(start=self.start, end=self.end, - label=current_label, kb_id=kb_id_)) + self.kb_id = self.doc.vocab.strings.add(kb_id_) cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 605108a7b..3fcfda691 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -329,7 +329,7 @@ cdef class Token: @property def shape(self): """RETURNS (uint64): ID of the token's shape, a transform of the - tokens's string, to show orthographic features (e.g. "Xxxx", "dd"). + token's string, to show orthographic features (e.g. "Xxxx", "dd"). """ return self.c.lex.shape @@ -825,7 +825,7 @@ cdef class Token: @property def shape_(self): - """RETURNS (str): Transform of the tokens's string, to show + """RETURNS (str): Transform of the token's string, to show orthographic features. For example, "Xxxx" or "dd". 
""" return self.vocab.strings[self.c.lex.shape] diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py index c54242eae..e79ba79b0 100644 --- a/spacy/training/batchers.py +++ b/spacy/training/batchers.py @@ -66,7 +66,11 @@ def configure_minibatch_by_words( """ optionals = {"get_length": get_length} if get_length is not None else {} return partial( - minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals + minibatch_by_words, + size=size, + tolerance=tolerance, + discard_oversize=discard_oversize, + **optionals ) diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index 356021a1d..66156b6e5 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -69,7 +69,7 @@ def read_conllx( ner_tag_pattern="", ner_map=None, ): - """ Yield docs, one for each sentence """ + """Yield docs, one for each sentence""" vocab = Vocab() # need vocab to make a minimal Doc for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 063d80a95..606dbfb4a 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -41,7 +41,7 @@ def create_docbin_reader( @util.registry.readers("spacy.JsonlCorpus.v1") def create_jsonl_reader( - path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0 + path: Optional[Path], min_length: int = 0, max_length: int = 0, limit: int = 0 ) -> Callable[["Language"], Iterable[Doc]]: return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit) @@ -186,7 +186,7 @@ class Corpus: def read_docbin( self, vocab: Vocab, locs: Iterable[Union[str, Path]] ) -> Iterator[Doc]: - """ Yield training examples as example dicts """ + """Yield training examples as example dicts""" i = 0 for loc in locs: loc = util.ensure_path(loc) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 07a83bfec..732203e7b 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -235,9 +235,9 @@ cdef class Example: seen.update(indices) return output - def get_aligned_ner(self): + def get_aligned_ents_and_ner(self): if not self.y.has_annotation("ENT_IOB"): - return [None] * len(self.x) # should this be 'missing' instead of 'None' ? 
+ return [], [None] * len(self.x) x_ents = self.get_aligned_spans_y2x(self.y.ents, allow_overlap=False) # Default to 'None' for missing values x_tags = offsets_to_biluo_tags( @@ -253,6 +253,10 @@ cdef class Example: x_tags[i] = "O" elif self.x[i].is_space: x_tags[i] = "O" + return x_ents, x_tags + + def get_aligned_ner(self): + x_ents, x_tags = self.get_aligned_ents_and_ner() return x_tags def to_dict(self): @@ -416,7 +420,7 @@ def _fix_legacy_dict_data(example_dict): token_dict = example_dict.get("token_annotation", {}) doc_dict = example_dict.get("doc_annotation", {}) for key, value in example_dict.items(): - if value: + if value is not None: if key in ("token_annotation", "doc_annotation"): pass elif key == "ids": diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 36384d67b..04d030964 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -9,6 +9,7 @@ import gzip import zipfile import tqdm from itertools import islice +import warnings from .pretrain import get_tok2vec_ref from ..lookups import Lookups @@ -70,18 +71,26 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": nlp._link_components() with nlp.select_pipes(disable=[*frozen_components, *resume_components]): if T["max_epochs"] == -1: - logger.debug("Due to streamed train corpus, using only first 100 examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labels") - nlp.initialize(lambda: islice(train_corpus(nlp), 100), sgd=optimizer) + sample_size = 100 + logger.debug( + f"Due to streamed train corpus, using only first {sample_size} " + f"examples for initialization. If necessary, provide all labels " + f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" + ) + nlp.initialize( + lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer + ) else: nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) logger.info(f"Initialized pipeline components: {nlp.pipe_names}") # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: - for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer + for listener in getattr( + proc, "listening_components", [] + ): # e.g. tok2vec/transformer # Don't warn about components not in the pipeline if listener not in nlp.pipe_names: continue - if listener in frozen_components and name not in frozen_components: logger.warning(Warnings.W087.format(name=name, listener=listener)) # We always check this regardless, in case user freezes tok2vec @@ -120,6 +129,12 @@ def init_vocab( if vectors is not None: load_vectors_into_model(nlp, vectors) logger.info(f"Added vectors: {vectors}") + # warn if source model vectors are not identical + sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) + vectors_hash = hash(nlp.vocab.vectors.to_bytes()) + for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): + if vectors_hash != sourced_vectors_hash: + warnings.warn(Warnings.W113.format(name=sourced_component)) logger.info("Finished initializing nlp object") @@ -143,6 +158,8 @@ def load_vectors_into_model( logger.warning(Warnings.W112.format(name=name)) nlp.vocab.vectors = vectors_nlp.vocab.vectors + for lex in nlp.vocab: + lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK) if add_strings: # I guess we should add the strings from the vectors_nlp model? # E.g. if someone does a similarity query, they might expect the strings. 
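To show what the refactored alignment helper in the `example.pyx` hunk above returns, here is a minimal sketch (assuming a blank English pipeline on a build that includes this change; the sentence and entity offsets are illustrative):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("I like New York")
example = Example.from_dict(doc, {"entities": [(7, 15, "GPE")]})

# get_aligned_ents_and_ner() returns the aligned gold spans together with the
# BILUO tags; get_aligned_ner() is now a thin wrapper that keeps only the tags.
ents, tags = example.get_aligned_ents_and_ner()
print([(e.text, e.label_) for e in ents])  # [('New York', 'GPE')]
print(tags)                                # ['O', 'O', 'B-GPE', 'L-GPE']
```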
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 0e8e7eed0..42dae8fc4 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -71,6 +71,8 @@ def offsets_to_biluo_tags( entities (iterable): A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. + missing (str): The label used for missing values, e.g. if tokenization + doesn’t align with the entity offsets. Defaults to "O". RETURNS (list): A list of unicode strings, describing the tags. Each tag string will be of the form either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". The missing label is used where the @@ -150,7 +152,7 @@ def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]: to overwrite the doc.ents. doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one + tags (iterable): A sequence of BILUO tags with each tag describing one token. Each tag string will be of the form of either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". RETURNS (list): A sequence of Span objects. Each token with a missing IOB @@ -170,7 +172,7 @@ def biluo_tags_to_offsets( """Encode per-token tags following the BILUO scheme into entity offsets. doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one + tags (iterable): A sequence of BILUO tags with each tag describing one token. Each tags string will be of the form of either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". RETURNS (list): A sequence of `(start, end, label)` triples. `start` and diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index ef6c86044..f7f70226d 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -110,7 +110,9 @@ def wandb_logger( ): try: import wandb - from wandb import init, log, join # test that these are available + + # test that these are available + from wandb import init, log, join # noqa: F401 except ImportError: raise ImportError(Errors.E880) diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 85aa458f0..09c54fc9f 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -1,4 +1,4 @@ -from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO +from typing import List, Callable, Tuple, Dict, Iterable, Union, Any, IO from typing import Optional, TYPE_CHECKING from pathlib import Path from timeit import default_timer as timer @@ -96,8 +96,7 @@ def train( stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n") if annotating_components: stdout.write( - msg.info(f"Set annotations on update for: {annotating_components}") - + "\n" + msg.info(f"Set annotations on update for: {annotating_components}") + "\n" ) stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n") with nlp.select_pipes(disable=frozen_components): diff --git a/spacy/util.py b/spacy/util.py index 0166bd190..421287ce2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -57,13 +57,13 @@ if TYPE_CHECKING: from .vocab import Vocab # noqa: F401 +# fmt: off OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config.cfg. 
Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. -# fmt: off CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] # fmt: on @@ -387,9 +387,10 @@ def load_model_from_path( if not meta: meta = get_model_meta(model_path) config_path = model_path / "config.cfg" - config = load_config(config_path, overrides=dict_to_dot(config)) + overrides = dict_to_dot(config) + config = load_config(config_path, overrides=overrides) nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude) - return nlp.from_disk(model_path, exclude=exclude) + return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) def load_model_from_config( @@ -647,6 +648,18 @@ def get_model_version_range(spacy_version: str) -> str: return f">={spacy_version},<{release[0]}.{release[1] + 1}.0" +def get_model_lower_version(constraint: str) -> Optional[str]: + """From a version range like >=1.2.3,<1.3.0 return the lower pin.""" + try: + specset = SpecifierSet(constraint) + for spec in specset: + if spec.operator in (">=", "==", "~="): + return spec.version + except Exception: + pass + return None + + def get_base_version(version: str) -> str: """Generate the base version without any prerelease identifiers. @@ -700,10 +713,18 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]: raise ValueError(Errors.E054.format(setting=setting)) if "spacy_version" in meta: if not is_compatible_version(about.__version__, meta["spacy_version"]): + lower_version = get_model_lower_version(meta["spacy_version"]) + lower_version = get_minor_version(lower_version) + if lower_version is not None: + lower_version = "v" + lower_version + elif "spacy_git_version" in meta: + lower_version = "git commit " + meta["spacy_git_version"] + else: + lower_version = "version unknown" warn_msg = Warnings.W095.format( model=f"{meta['lang']}_{meta['name']}", model_version=meta["version"], - version=meta["spacy_version"], + version=lower_version, current=about.__version__, ) warnings.warn(warn_msg) @@ -1372,7 +1393,8 @@ def combine_score_weights( # We divide each weight by the total weight sum. # We first need to extract all None/null values for score weights that # shouldn't be shown in the table *or* be weighted - result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()} + result = {key: value for w_dict in weights for (key, value) in w_dict.items()} + result.update(overrides) weight_sum = sum([v if v else 0.0 for v in result.values()]) for key, value in result.items(): if value and weight_sum > 0: @@ -1515,11 +1537,15 @@ def to_ternary_int(val) -> int: attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0 (None), any other values are -1 (False). """ - if isinstance(val, float): - val = int(val) - if val is True or val is 1: + if val is True: return 1 - elif val is None or val is 0: + elif val is None: + return 0 + elif val is False: + return -1 + elif val == 1: + return 1 + elif val == 0: return 0 else: return -1 diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ee440898a..13dd675af 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -364,15 +364,15 @@ cdef class Vocab: word = self[orth].orth_ if orth in self.vectors.key2row: return self.vectors[orth] - # Assign default ngram limits to minn and maxn which is the length of the word. 
- if minn is None: - minn = len(word) - if maxn is None: - maxn = len(word) xp = get_array_module(self.vectors.data) vectors = xp.zeros((self.vectors_length,), dtype="f") + if minn is None: + return vectors # Fasttext's ngram computation taken from # https://github.com/facebookresearch/fastText + # Assign default ngram limit to maxn which is the length of the word. + if maxn is None: + maxn = len(word) ngrams_size = 0; for i in range(len(word)): ngram = "" diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index e09352ec9..e90dc1183 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -9,6 +9,7 @@ menu: - ['Parser & NER', 'parser'] - ['Tagging', 'tagger'] - ['Text Classification', 'textcat'] + - ['Span Classification', 'spancat'] - ['Entity Linking', 'entitylinker'] --- @@ -284,8 +285,8 @@ Encode context using bidirectional LSTM layers. Requires Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a learned linear projection to control the dimensionality. Unknown tokens are -mapped to a zero vector. See the documentation on [static -vectors](/usage/embeddings-transformers#static-vectors) for details. +mapped to a zero vector. See the documentation on +[static vectors](/usage/embeddings-transformers#static-vectors) for details. | Name |  Description | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -448,7 +449,7 @@ For more information, see the section on > ```ini > [pretraining] > component = "tok2vec" -> +> > [initialize] > vectors = "en_core_web_lg" > ... @@ -461,8 +462,8 @@ For more information, see the section on > ``` Predict the word's vector from a static embeddings table as pretraining -objective for a Tok2Vec layer. To use this objective, make sure that the -`initialize.vectors` section in the config refers to a model with static +objective for a Tok2Vec layer. To use this objective, make sure that the +`initialize.vectors` section in the config refers to a model with static vectors. | Name | Description | @@ -552,6 +553,13 @@ consists of either two or three subnetworks: | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | | **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | + + +[TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact same signature, +but the `use_upper` argument was `True` by default. + + + ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} ### spacy.Tagger.v1 {#Tagger} @@ -611,7 +619,7 @@ single-label use-cases where `exclusive_classes = true`, while the > nO = null > > [model.linear_model] -> @architectures = "spacy.TextCatBOW.v1" +> @architectures = "spacy.TextCatBOW.v2" > exclusive_classes = true > ngram_size = 1 > no_output_layer = false @@ -648,8 +656,8 @@ from the linear model, where it is stored in `model.attrs["multi_label"]`. 
-[TextCatEnsemble.v1](/api/legacy#TextCatEnsemble_v1) was functionally similar, but used an internal `tok2vec` instead of -taking it as argument: +[TextCatEnsemble.v1](/api/legacy#TextCatEnsemble_v1) was functionally similar, +but used an internal `tok2vec` instead of taking it as argument: | Name | Description | | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -666,13 +674,13 @@ taking it as argument: -### spacy.TextCatCNN.v1 {#TextCatCNN} +### spacy.TextCatCNN.v2 {#TextCatCNN} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.TextCatCNN.v1" +> @architectures = "spacy.TextCatCNN.v2" > exclusive_classes = false > nO = null > @@ -698,13 +706,21 @@ architecture is usually less accurate than the ensemble, but runs faster. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.TextCatBOW.v1 {#TextCatBOW} + + +[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was +not yet resizable. Since v2, new labels can be added to this component, even +after training. + + + +### spacy.TextCatBOW.v2 {#TextCatBOW} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.TextCatBOW.v1" +> @architectures = "spacy.TextCatBOW.v2" > exclusive_classes = false > ngram_size = 1 > no_output_layer = false @@ -722,6 +738,62 @@ the others, but may not be as accurate, especially if texts are short. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + + +[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was +not yet resizable. Since v2, new labels can be added to this component, even +after training. + + + +## Span classification architectures {#spancat source="spacy/ml/models/spancat.py"} + +### spacy.SpanCategorizer.v1 {#SpanCategorizer} + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.SpanCategorizer.v1" +> scorer = {"@layers": "spacy.LinearLogistic.v1"} +> +> [model.reducer] +> @layers = spacy.mean_max_reducer.v1" +> hidden_size = 128 +> +> [model.tok2vec] +> @architectures = "spacy.Tok2Vec.v1" +> +> [model.tok2vec.embed] +> @architectures = "spacy.MultiHashEmbed.v1" +> # ... +> +> [model.tok2vec.encode] +> @architectures = "spacy.MaxoutWindowEncoder.v1" +> # ... +> ``` + +Build a span categorizer model to power a +[`SpanCategorizer`](/api/spancategorizer) component, given a token-to-vector +model, a reducer model to map the sequence of vectors for each span down to a +single vector, and a scorer model to map the vectors to probabilities. + +| Name | Description | +| ----------- | ------------------------------------------------------------------------------- | +| `tok2vec` | The token-to-vector model. ~~Model[List[Doc], List[Floats2d]]~~ | +| `reducer` | The reducer model. ~~Model[Ragged, Floats2d]~~ | +| `scorer` | The scorer model. ~~Model[Floats2d, Floats2d]~~ | +| **CREATES** | The model using the architecture. 
~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | + +### spacy.mean_max_reducer.v1 {#mean_max_reducer} + +Reduce sequences by concatenating their mean and max pooled vectors, and then +combine the concatenated vectors with a hidden layer. + +| Name | Description | +| ------------- | ------------------------------------- | +| `hidden_size` | The size of the hidden layer. ~~int~~ | + ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} An [`EntityLinker`](/api/entitylinker) component disambiguates textual mentions diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 196e47543..10ab2083e 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -16,6 +16,7 @@ menu: - ['package', 'package'] - ['project', 'project'] - ['ray', 'ray'] + - ['huggingface-hub', 'huggingface-hub'] --- spaCy's CLI provides a range of helpful commands for downloading and training @@ -768,6 +769,7 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P | `--print-step3`, `-P3` | Print final predictions. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **PRINTS** | Debugging information. | ## train {#train tag="command"} @@ -932,7 +934,10 @@ copied into the package and imported in the `__init__.py`. If the path to a [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in the input directory, this file is used. Otherwise, the data can be entered directly from the command line. spaCy will then create a build artifact that you -can distribute and install with `pip install`. +can distribute and install with `pip install`. As of v3.1, the `package` command +will also create a formatted `README.md` based on the pipeline information +defined in the `meta.json`. If a `README.md` is already present in the source +directory, it will be used instead. @@ -1272,3 +1277,49 @@ $ python -m spacy ray train [config_path] [--code] [--output] [--n-workers] [--a | `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | + +## huggingface-hub {#huggingface-hub new="3.1"} + +The `spacy huggingface-cli` CLI includes commands for uploading your trained +spaCy pipelines to the [Hugging Face Hub](https://huggingface.co/). + +> #### Installation +> +> ```cli +> $ pip install spacy-huggingface-hub +> $ huggingface-cli login +> ``` + + + +To use this command, you need the +[`spacy-huggingface-hub`](https://github.com/explosion/spacy-huggingface-hub) +package installed. Installing the package will automatically add the +`huggingface-hub` command to the spaCy CLI. + + + +### huggingface-hub push {#huggingface-hub-push tag="command"} + +Push a spaCy pipeline to the Hugging Face Hub. Expects a `.whl` file packaged +with [`spacy package`](/api/cli#package) and `--build wheel`. For more details, +see the spaCy project [integration](/usage/projects#huggingface_hub). 
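For context, the end-to-end workflow looks roughly like this (the pipeline name `en_ner_fashion` is only illustrative): build a wheel with `spacy package --build wheel`, then push the file from the package's `dist` directory.

```cli
$ python -m spacy package ./en_ner_fashion ./output --build wheel
$ cd ./output/en_ner_fashion-0.0.0/dist
$ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl
```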
+ +```cli +$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose] +``` + +> #### Example +> +> ```cli +> $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl +> ``` + +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | +| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | +| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | +| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ | +| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~  | +| **UPLOADS** | The pipeline to the hub. | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 576ab8394..7dbf50595 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -93,7 +93,7 @@ Defines the `nlp` object, its tokenizer and > labels = ["POSITIVE", "NEGATIVE"] > > [components.textcat.model] -> @architectures = "spacy.TextCatBOW.v1" +> @architectures = "spacy.TextCatBOW.v2" > exclusive_classes = true > ngram_size = 1 > no_output_layer = false @@ -297,7 +297,7 @@ objects to JSON, you can now serialize them directly using the format: ```cli -$ python -m spacy convert ./data.json ./output.spacy +$ python -m spacy convert ./data.json . ``` @@ -589,7 +589,7 @@ source of truth** used for loading a pipeline. | `vectors` | Information about the word vectors included with the pipeline. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ | | `pipeline` | Names of pipeline component names, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ | | `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ | -| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| `performance` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | | `speed` | Inference speed, added automatically by [`spacy train`](/api/cli#train). Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ | | `spacy_git_version` 3 | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create pipeline. ~~str~~ | | other | Any other custom meta information you want to add. 
The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ | diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index c4e2e1697..fa02a6f99 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -50,7 +50,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[TransitionSystem]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ | @@ -88,8 +88,8 @@ shortcut for this and instantiate the component using its string name and | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ | | _keyword-only_ | | -| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ | -| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | +| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ | ## DependencyParser.\_\_call\_\_ {#call tag="method"} @@ -231,14 +231,14 @@ model. 
Delegates to [`predict`](/api/dependencyparser#predict) and > losses = parser.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## DependencyParser.get_loss {#get_loss tag="method"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index b3a1054fc..2994d934b 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -213,10 +213,10 @@ if there is no prediction. > kb_ids = entity_linker.predict([doc1, doc2]) > ``` -| Name | Description | -| ----------- | ------------------------------------------- | -| `docs` | The documents to predict. ~~Iterable[Doc]~~ | -| **RETURNS** | `List[str]` | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ | +| Name | Description | +| ----------- | -------------------------------------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ | ## EntityLinker.set_annotations {#set_annotations tag="method"} @@ -341,6 +341,42 @@ Load the pipe from disk. Modifies the object in place and returns it. | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | | **RETURNS** | The modified `EntityLinker` object. ~~EntityLinker~~ | +## EntityLinker.to_bytes {#to_bytes tag="method"} + +> #### Example +> +> ```python +> entity_linker = nlp.add_pipe("entity_linker") +> entity_linker_bytes = entity_linker.to_bytes() +> ``` + +Serialize the pipe to a bytestring, including the `KnowledgeBase`. + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `EntityLinker` object. ~~bytes~~ | + +## EntityLinker.from_bytes {#from_bytes tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. 
+ +> #### Example +> +> ```python +> entity_linker_bytes = entity_linker.to_bytes() +> entity_linker = nlp.add_pipe("entity_linker") +> entity_linker.from_bytes(entity_linker_bytes) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `EntityLinker` object. ~~EntityLinker~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 348736209..601b644c1 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -37,6 +37,7 @@ architectures and their arguments and hyperparameters. > "moves": None, > "update_with_oracle_cut_size": 100, > "model": DEFAULT_NER_MODEL, +> "incorrect_spans_key": "incorrect_spans", > } > nlp.add_pipe("ner", config=config) > ``` @@ -46,6 +47,7 @@ architectures and their arguments and hyperparameters. | `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | +| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/ner.pyx @@ -72,14 +74,15 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ | -| _keyword-only_ | | -| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. 
~~int~~ | +| Name | Description | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| `moves` | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[List[str]]~~ | +| _keyword-only_ | | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | +| `incorrect_spans_key` | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group in [`Doc.spans`](/api/doc#spans), under this key. Defaults to `None`. ~~Optional[str]~~ | ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} @@ -220,14 +223,14 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and > losses = ner.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## EntityRecognizer.get_loss {#get_loss tag="method"} diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 76a4b3604..66cb6d4e4 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -35,11 +35,11 @@ how the component should be configured. 
You can override its settings via the > ``` | Setting | Description | -| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ----------- | | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | -| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ | +| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `" | | "`. ~~str~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entityruler.py @@ -64,14 +64,14 @@ be a token pattern (list) or a phrase pattern (string). For example: > ``` | Name | Description | -| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ----------- | | `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | | `name` 3 | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | | _keyword-only_ | | | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | | `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | -| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ | +| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `" | | "`. ~~str~~ | | `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ | ## EntityRuler.initialize {#initialize tag="method" new="3"} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index 3cbc5dbd8..e7a8fcd6f 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -245,8 +245,8 @@ certain prior probability. ### Candidate.\_\_init\_\_ {#candidate-init tag="method"} Construct a `Candidate` object. 
Usually this constructor is not called directly, -but instead these objects are returned by the -`get_candidates` method of the [`entity_linker`](/api/entitylinker) pipe. +but instead these objects are returned by the `get_candidates` method of the +[`entity_linker`](/api/entitylinker) pipe. > #### Example > diff --git a/website/docs/api/language.md b/website/docs/api/language.md index ca87cbb16..b09ae1aa2 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -426,7 +426,8 @@ component, adds it to the pipeline and returns it. > ```python > @Language.component("component") > def component_func(doc): -> # modify Doc and return it return doc +> # modify Doc and return it +> return doc > > nlp.add_pipe("component", before="ner") > component = nlp.add_pipe("component", name="custom_name", last=True) diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md index 96bc199bf..02b376780 100644 --- a/website/docs/api/legacy.md +++ b/website/docs/api/legacy.md @@ -103,6 +103,11 @@ and residual connections. | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | | **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ | +### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1} + +Identical to [`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser) +except the `use_upper` was set to `True` by default. + ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1} The `spacy.TextCatEnsemble.v1` architecture built an internal `tok2vec` and @@ -176,6 +181,70 @@ added to an existing vectors table. See more details in +### spacy.TextCatCNN.v1 {#TextCatCNN_v1} + +Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means +that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not +yet support that. + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.TextCatCNN.v1" +> exclusive_classes = false +> nO = null +> +> [model.tok2vec] +> @architectures = "spacy.HashEmbedCNN.v1" +> pretrained_vectors = null +> width = 96 +> depth = 4 +> embed_size = 2000 +> window_size = 1 +> maxout_pieces = 3 +> subword_features = true +> ``` + +A neural network model where token vectors are calculated using a CNN. The +vectors are mean pooled and used as features in a feed-forward network. This +architecture is usually less accurate than the ensemble, but runs faster. + +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + +### spacy.TextCatBOW.v1 {#TextCatBOW_v1} + +Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means +that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not +yet support that. 
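To make the practical difference concrete, here is a minimal sketch of what the resizable v2 architecture allows (assuming a blank English pipeline; the label names are made up). The original v1 example config follows below.

```python
import spacy

nlp = spacy.blank("en")
model_config = {
    "@architectures": "spacy.TextCatBOW.v2",
    "exclusive_classes": True,
    "ngram_size": 1,
    "no_output_layer": False,
}
textcat = nlp.add_pipe("textcat", config={"model": model_config})
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.initialize()
# With the resizable v2 model, a label can still be added after initialization;
# the non-resizable v1 architecture would raise an error at this point.
textcat.add_label("NEUTRAL")
```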
+ +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.TextCatBOW.v1" +> exclusive_classes = false +> ngram_size = 1 +> no_output_layer = false +> nO = null +> ``` + +An n-gram "bag-of-words" model. This architecture should run much faster than +the others, but may not be as accurate, especially if texts are short. + +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ | +| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | + ## Loggers {#loggers} These functions are available from `@spacy.registry.loggers`. diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index f186535f7..279821e71 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -4,7 +4,6 @@ tag: class source: spacy/pipeline/lemmatizer.py new: 3 teaser: 'Pipeline component for lemmatization' -api_base_class: /api/pipe api_string_name: lemmatizer api_trainable: false --- @@ -48,11 +47,36 @@ data format used by the lookup and rule-based lemmatizers, see > nlp.add_pipe("lemmatizer", config=config) > ``` -| Setting | Description | -| ----------- | --------------------------------------------------------------------------------- | -| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ | -| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | -| `model` | **Not yet implemented:** the model to use. ~~Model~~ | +| Setting | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ | +| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | +| `model` | **Not yet implemented:** the model to use. ~~Model~~ | + +Many languages specify a default lemmatizer mode other than `lookup` if a better +lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require +[`token.pos`](/api/token) from a previous pipeline component (see example +pipeline configurations in the +[pretrained pipeline design details](/models#design-cnn)) or rely on third-party +libraries (`pymorphy2`). 
+ +| Language | Default Mode | +| -------- | ------------ | +| `bn` | `rule` | +| `ca` | `pos_lookup` | +| `el` | `rule` | +| `en` | `rule` | +| `es` | `rule` | +| `fa` | `rule` | +| `fr` | `rule` | +| `it` | `pos_lookup` | +| `mk` | `rule` | +| `nb` | `rule` | +| `nl` | `rule` | +| `pl` | `pos_lookup` | +| `ru` | `pymorphy2` | +| `sv` | `rule` | +| `uk` | `pymorphy2` | ```python %%GITHUB_SPACY/spacy/pipeline/lemmatizer.py diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index c99f19482..c5d4b7544 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -127,14 +127,14 @@ The L2 norm of the lexeme's vector representation. | `text` | Verbatim text content. ~~str~~ | | `orth` | ID of the verbatim text content. ~~int~~ | | `orth_` | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. ~~str~~ | -| `rank` | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `rank` | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | | `flags` | Container of the lexeme's binary flags. ~~int~~ | -| `norm` | The lexemes's norm, i.e. a normalized form of the lexeme text. ~~int~~ | -| `norm_` | The lexemes's norm, i.e. a normalized form of the lexeme text. ~~str~~ | +| `norm` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~int~~ | +| `norm_` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~str~~ | | `lower` | Lowercase form of the word. ~~int~~ | | `lower_` | Lowercase form of the word. ~~str~~ | -| `shape` | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | -| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | +| `shape` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | +| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | | `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ | | `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ | | `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index c15ee7a47..9c15f8797 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -120,14 +120,14 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`. 
> matches = matcher(doc) > ``` -| Name | Description | -| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | -| _keyword-only_ | | -| `as_spans` 3 | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ | -| `allow_missing` 3 | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~ | -| `with_alignments` 3.1 | Return match alignment information as part of the match tuple as `List[int]` with the same length as the matched span. Each entry denotes the corresponding index of the token pattern. If `as_spans` is set to `True`, this setting is ignored. Defaults to `False`. ~~bool~~ | -| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ | +| Name | Description | +| ------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | +| _keyword-only_ | | +| `as_spans` 3 | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ | +| `allow_missing` 3 | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~ | +| `with_alignments` 3.0.6 | Return match alignment information as part of the match tuple as `List[int]` with the same length as the matched span. Each entry denotes the corresponding index of the token pattern. If `as_spans` is set to `True`, this setting is ignored. Defaults to `False`. ~~bool~~ | +| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ | ## Matcher.\_\_len\_\_ {#len tag="method" new="2"} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 059040a19..d2dd28ac2 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -61,11 +61,11 @@ shortcut for this and instantiate the component using its string name and > morphologizer = Morphologizer(nlp.vocab, model) > ``` -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. 
~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| ------- | -------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | ## Morphologizer.\_\_call\_\_ {#call tag="method"} @@ -200,14 +200,14 @@ Delegates to [`predict`](/api/morphologizer#predict) and > losses = morphologizer.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Morphologizer.get_loss {#get_loss tag="method"} diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index e64f26bdd..565e520b5 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -98,18 +98,18 @@ representation. > assert f == "Feat1=Val1|Feat2=Val2" > ``` -| Name | Description | -| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ | +| Name | Description | +| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ | | **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | ## Attributes {#attributes} -| Name | Description | -| ------------- | ------------------------------------------------------------------------------------------------------------------------------ | -| `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is `|`. 
~~str~~ | -| `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ | -| `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ | +| Name | Description | +| ------------- | ---------------------------------------------------------------------------------------------------------------------------- | ---------- | +| `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is ` | `. ~~str~~ | +| `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ | +| `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ | ## MorphAnalysis {#morphanalysis tag="class" source="spacy/tokens/morphanalysis.pyx"} diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 540476949..4a5fb6042 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -59,7 +59,7 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`. | Name | Description | | ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | +| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | | _keyword-only_ | | | `as_spans` 3 | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ | | **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ | @@ -149,8 +149,8 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")] | Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `match_id` | An ID for the thing you're matching. ~~str~~ | | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | +| `match_id` | An ID for the thing you're matching. ~~str~~ | | | `docs` | `Doc` objects of the phrases to match. ~~List[Doc]~~ | | _keyword-only_ | | | `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. 
~~Optional[Callable[[Matcher, Doc, int, List[tuple], Any]]~~ | diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 7398bae81..ad908f204 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -46,7 +46,10 @@ attribute being scored: - `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc` - `dep_uas`, `dep_las`, `dep_las_per_type` - `ents_p`, `ents_r` `ents_f`, `ents_per_type` -- `textcat_macro_auc`, `textcat_macro_f` +- `cats_score` (depends on config, description provided in `cats_score_desc`), + `cats_micro_p`, `cats_micro_r`, `cats_micro_f`, `cats_macro_p`, + `cats_macro_r`, `cats_macro_f`, `cats_macro_auc`, `cats_f_per_type`, + `cats_auc_per_type` > #### Example > @@ -77,7 +80,7 @@ Docs with `has_unknown_spaces` are skipped during scoring. > ``` | Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------- | +| ----------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]]~~ | diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index ce66ecaa4..e82a4bef6 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -187,14 +187,14 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and > losses = senter.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"} diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index a377fcf65..75a253fc0 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -28,7 +28,7 @@ how the component should be configured. 
You can override its settings via the > ``` | Setting | Description | -| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ | | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` | ```python diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 333344b31..9212f957d 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -491,8 +491,8 @@ document by the `parser`, `senter`, `sentencizer` or some custom function. It will raise an error otherwise. If the span happens to cross sentence boundaries, only the first sentence will -be returned. If it is required that the sentence always includes the -full span, the result can be adjusted as such: +be returned. If it is required that the sentence always includes the full span, +the result can be adjusted as such: ```python sent = span.sent diff --git a/website/docs/api/spancategorizer.md b/website/docs/api/spancategorizer.md new file mode 100644 index 000000000..57395846d --- /dev/null +++ b/website/docs/api/spancategorizer.md @@ -0,0 +1,474 @@ +--- +title: SpanCategorizer +tag: class,experimental +source: spacy/pipeline/spancat.py +new: 3.1 +teaser: 'Pipeline component for labeling potentially overlapping spans of text' +api_base_class: /api/pipe +api_string_name: spancat +api_trainable: true +--- + +A span categorizer consists of two parts: a [suggester function](#suggesters) +that proposes candidate spans, which may or may not overlap, and a labeler model +that predicts zero or more labels for each candidate. + +## Config and implementation {#config} + +The default config is defined by the pipeline component factory and describes +how the component should be configured. You can override its settings via the +`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your +[`config.cfg` for training](/usage/training#config). See the +[model architectures](/api/architectures) documentation for details on the +architectures and their arguments and hyperparameters. + +> #### Example +> +> ```python +> from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL +> config = { +> "threshold": 0.5, +> "spans_key": "labeled_spans", +> "max_positive": None, +> "model": DEFAULT_SPANCAT_MODEL, +> "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, +> } +> nlp.add_pipe("spancat", config=config) +> ``` + +| Setting | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[List[Doc], Ragged]~~ | +| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. 
Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ | +| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | +| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | + +```python +%%GITHUB_SPACY/spacy/pipeline/spancat.py +``` + +## SpanCategorizer.\_\_init\_\_ {#init tag="method"} + +> #### Example +> +> ```python +> # Construction via add_pipe with default model +> spancat = nlp.add_pipe("spancat") +> +> # Construction via add_pipe with custom model +> config = {"model": {"@architectures": "my_spancat"}} +> parser = nlp.add_pipe("spancat", config=config) +> +> # Construction from class +> from spacy.pipeline import SpanCategorizer +> spancat = SpanCategorizer(nlp.vocab, model, suggester) +> ``` + +Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.add_pipe`](/api/language#create_pipe). + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | +| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[List[Doc], Ragged]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#sans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ | +| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | +| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | + +## SpanCategorizer.\_\_call\_\_ {#call tag="method"} + +Apply the pipe to one document. The document is modified in place, and returned. +This usually happens under the hood when the `nlp` object is called on a text +and all pipeline components are applied to the `Doc` in order. Both +[`__call__`](/api/spancategorizer#call) and [`pipe`](/api/spancategorizer#pipe) +delegate to the [`predict`](/api/spancategorizer#predict) and +[`set_annotations`](/api/spancategorizer#set_annotations) methods. 
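To make the interplay of `predict` and `set_annotations` concrete, here is a compact end-to-end
sketch. Everything specific in it (the `"labeled_spans"` key, the `PLACE` label, the toy sentence
and the single training example) is an illustrative assumption rather than part of the API.

```python
import spacy
from spacy.tokens import Span
from spacy.training import Example

# Sketch: wire up a span categorizer on a blank pipeline. The spans key,
# label and sentence are made up for illustration.
nlp = spacy.blank("en")
config = {
    "spans_key": "labeled_spans",
    "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
}
spancat = nlp.add_pipe("spancat", config=config)

# A single toy example so initialize() can infer shapes and the label set.
predicted = nlp.make_doc("Berlin is a city in Germany.")
reference = nlp.make_doc("Berlin is a city in Germany.")
reference.spans["labeled_spans"] = [Span(reference, 0, 1, label="PLACE")]
example = Example(predicted, reference)

nlp.initialize(get_examples=lambda: [example])

# Calling the pipeline runs predict() and set_annotations(); the (still
# untrained) predictions land in doc.spans under the configured key.
doc = nlp("Berlin is a city in Germany.")
print(doc.spans["labeled_spans"])
```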
+ +> #### Example +> +> ```python +> doc = nlp("This is a sentence.") +> spancat = nlp.add_pipe("spancat") +> # This usually happens under the hood +> processed = spancat(doc) +> ``` + +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | + +## SpanCategorizer.pipe {#pipe tag="method"} + +Apply the pipe to a stream of documents. This usually happens under the hood +when the `nlp` object is called on a text and all pipeline components are +applied to the `Doc` in order. Both [`__call__`](/api/spancategorizer#call) and +[`pipe`](/api/spancategorizer#pipe) delegate to the +[`predict`](/api/spancategorizer#predict) and +[`set_annotations`](/api/spancategorizer#set_annotations) methods. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> for doc in spancat.pipe(docs, batch_size=50): +> pass +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | + +## SpanCategorizer.initialize {#initialize tag="method"} + +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, +[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> spancat.initialize(lambda: [], nlp=nlp) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.spancat] +> +> [initialize.components.spancat.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/spancat.json +> ``` + +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. 
~~Optional[Iterable[str]]~~ | + +## SpanCategorizer.predict {#predict tag="method"} + +Apply the component's model to a batch of [`Doc`](/api/doc) objects without +modifying them. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> scores = spancat.predict([doc1, doc2]) +> ``` + +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | + +## SpanCategorizer.set_annotations {#set_annotations tag="method"} + +Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> scores = spancat.predict(docs) +> spancat.set_annotations(docs, scores) +> ``` + +| Name | Description | +| -------- | --------------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `SpanCategorizer.predict`. | + +## SpanCategorizer.update {#update tag="method"} + +Learn from a batch of [`Example`](/api/example) objects containing the +predictions and gold-standard annotations, and update the component's model. +Delegates to [`predict`](/api/spancategorizer#predict) and +[`get_loss`](/api/spancategorizer#get_loss). + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> optimizer = nlp.initialize() +> losses = spancat.update(examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + +## SpanCategorizer.get_loss {#get_loss tag="method"} + +Find the loss and gradient of loss for the batch of documents and their +predicted scores. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> scores = spancat.predict([eg.predicted for eg in examples]) +> loss, d_loss = spancat.get_loss(examples, scores) +> ``` + +| Name | Description | +| ----------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `scores` | Scores representing the model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + +## SpanCategorizer.score {#score tag="method"} + +Score a batch of examples. + +> #### Example +> +> ```python +> scores = spancat.score(examples) +> ``` + +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ | + +## SpanCategorizer.create_optimizer {#create_optimizer tag="method"} + +Create an optimizer for the pipeline component. 
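In a manual training loop, this optimizer is typically passed to
[`update`](/api/spancategorizer#update) via `sgd`. A rough sketch, assuming `nlp` already contains
an initialized `spancat` component and `train_examples` is a list of `Example` objects (for
instance built as in the earlier sketch):

```python
import random

# Rough sketch of a few manual update steps. Assumes `nlp` holds an
# initialized "spancat" component and `train_examples` is a list of
# spacy.training.Example objects.
spancat = nlp.get_pipe("spancat")
optimizer = spancat.create_optimizer()
for epoch in range(5):
    random.shuffle(train_examples)
    losses = {}
    spancat.update(train_examples, drop=0.2, sgd=optimizer, losses=losses)
    print(epoch, losses["spancat"])
```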
+ +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> optimizer = spancat.create_optimizer() +> ``` + +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | + +## SpanCategorizer.use_params {#use_params tag="method, contextmanager"} + +Modify the pipe's model to use the given parameter values. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> with spancat.use_params(optimizer.averages): +> spancat.to_disk("/best_model") +> ``` + +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | + +## SpanCategorizer.add_label {#add_label tag="method"} + +Add a new label to the pipe. Raises an error if the output dimension is already +set, or if the model has already been fully [initialized](#initialize). Note +that you don't have to call this method if you provide a **representative data +sample** to the [`initialize`](#initialize) method. In this case, all labels +found in the sample will be automatically added to the model, and the output +dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> spancat.add_label("MY_LABEL") +> ``` + +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| `label` | The label to add. ~~str~~ | +| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | + +## SpanCategorizer.to_disk {#to_disk tag="method"} + +Serialize the pipe to disk. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> spancat.to_disk("/path/to/spancat") +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | + +## SpanCategorizer.from_disk {#from_disk tag="method"} + +Load the pipe from disk. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> spancat.from_disk("/path/to/spancat") +> ``` + +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `SpanCategorizer` object. ~~SpanCategorizer~~ | + +## SpanCategorizer.to_bytes {#to_bytes tag="method"} + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> spancat_bytes = spancat.to_bytes() +> ``` + +Serialize the pipe to a bytestring. + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `SpanCategorizer` object. 
~~bytes~~ | + +## SpanCategorizer.from_bytes {#from_bytes tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> spancat_bytes = spancat.to_bytes() +> spancat = nlp.add_pipe("spancat") +> spancat.from_bytes(spancat_bytes) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `SpanCategorizer` object. ~~SpanCategorizer~~ | + +## SpanCategorizer.labels {#labels tag="property"} + +The labels currently added to the component. + +> #### Example +> +> ```python +> spancat.add_label("MY_LABEL") +> assert "MY_LABEL" in spancat.labels +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | + +## SpanCategorizer.label_data {#label_data tag="property"} + +The labels currently added to the component and their internal meta information. +This is the data generated by [`init labels`](/api/cli#init-labels) and used by +[`SpanCategorizer.initialize`](/api/spancategorizer#initialize) to initialize +the model with a pre-defined label set. + +> #### Example +> +> ```python +> labels = spancat.label_data +> spancat.initialize(lambda: [], nlp=nlp, labels=labels) +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------------- | +| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = spancat.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | + +## Suggesters {#suggesters tag="registered functions" source="spacy/pipeline/spancat.py"} + +### spacy.ngram_suggester.v1 {#ngram_suggester} + +> #### Example Config +> +> ```ini +> [components.spancat.suggester] +> @misc = "spacy.ngram_suggester.v1" +> sizes = [1, 2, 3] +> ``` + +Suggest all spans of the given lengths. Spans are returned as a ragged array of +integers. The array has two columns, indicating the start and end position. + +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------- | +| `sizes` | The phrase lengths to suggest. For example, `[1, 2]` will suggest phrases consisting of 1 or 2 tokens. ~~List[int]~~ | +| **CREATES** | The suggester function. ~~Callable[[List[Doc]], Ragged]~~ | + +### spacy.ngram_range_suggester.v1 {#ngram_range_suggester} + +> #### Example Config +> +> ```ini +> [components.spancat.suggester] +> @misc = "spacy.ngram_range_suggester.v1" +> min_size = 2 +> max_size = 4 +> ``` + +Suggest all spans of at least length `min_size` and at most length `max_size` +(both inclusive). 
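As a rough sketch, the registered function can also be resolved and called outside of a config.
The sentence and sizes below are arbitrary, and the `data`/`lengths` attributes belong to Thinc's
`Ragged` type (an assumption about that type, not something defined on this page).

```python
import spacy

# Sketch: resolve the registered suggester and call it on a list of docs.
nlp = spacy.blank("en")
make_suggester = spacy.registry.misc.get("spacy.ngram_range_suggester.v1")
suggester = make_suggester(min_size=2, max_size=4)
docs = [nlp.make_doc("The quick brown fox jumps")]
ragged = suggester(docs)
print(ragged.lengths)  # number of candidate spans per doc
print(ragged.data)     # one (start, end) row per candidate span
```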
Spans are returned as a ragged array of integers. The array +has two columns, indicating the start and end position. + +| Name | Description | +| ----------- | ------------------------------------------------------------ | +| `min_size` | The minimal phrase lengths to suggest (inclusive). ~~[int]~~ | +| `max_size` | The maximal phrase lengths to suggest (exclusive). ~~[int]~~ | +| **CREATES** | The suggester function. ~~Callable[[List[Doc]], Ragged]~~ | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 1a4c70522..3002aff7b 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -25,9 +25,9 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/tagger.pyx @@ -54,11 +54,11 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that predicts the tag probabilities. 
The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | ## Tagger.\_\_call\_\_ {#call tag="method"} @@ -198,14 +198,14 @@ Delegates to [`predict`](/api/tagger#predict) and > losses = tagger.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Tagger.rehearse {#rehearse tag="method,experimental" new="3"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index fdd235b85..923da0048 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -10,11 +10,12 @@ api_trainable: true --- The text categorizer predicts **categories over a whole document**. and comes in -two flavours: `textcat` and `textcat_multilabel`. When you need to predict +two flavors: `textcat` and `textcat_multilabel`. When you need to predict exactly one true label per document, use the `textcat` which has mutually exclusive labels. If you want to perform multi-label classification and predict -zero, one or more labels per document, use the `textcat_multilabel` component -instead. +zero, one or more true labels per document, use the `textcat_multilabel` +component instead. For a binary classification task, you can use `textcat` with +**two** labels or `textcat_multilabel` with **one** label. Both components are documented on this page. @@ -68,6 +69,10 @@ architectures and their arguments and hyperparameters. %%GITHUB_SPACY/spacy/pipeline/textcat.py ``` +```python +%%GITHUB_SPACY/spacy/pipeline/textcat_multilabel.py +``` + ## TextCategorizer.\_\_init\_\_ {#init tag="method"} > #### Example @@ -189,7 +194,7 @@ This method was previously called `begin_training`. | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. 
If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | -| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ | +| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ | ## TextCategorizer.predict {#predict tag="method"} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 90278e8cc..70c352b4d 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -196,14 +196,14 @@ Delegates to [`predict`](/api/tok2vec#predict). > losses = tok2vec.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Tok2Vec.create_optimizer {#create_optimizer tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index ecf7bcc8e..44c92d1ee 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -362,8 +362,8 @@ unknown. Defaults to `True` for the first token in the `Doc`. > assert not doc[5].is_sent_start > ``` -| Name | Description | -| ----------- | --------------------------------------------- | +| Name | Description | +| ----------- | ------------------------------------------------------- | | **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ | ## Token.has_vector {#has_vector tag="property" model="vectors"} @@ -420,73 +420,73 @@ The L2 norm of the token's vector representation. ## Attributes {#attributes} -| Name | Description | -| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | The parent document. ~~Doc~~ | -| `lex` 3 | The underlying lexeme. 
~~Lexeme~~ | -| `sent` 2.0.12 | The sentence span that this token is a part of. ~~Span~~ | -| `text` | Verbatim text content. ~~str~~ | -| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ | -| `whitespace_` | Trailing space character if present. ~~str~~ | -| `orth` | ID of the verbatim text content. ~~int~~ | -| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ | -| `vocab` | The vocab object of the parent `Doc`. ~~vocab~~ | -| `tensor` 2.1.7 | The tokens's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | -| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ | -| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ | -| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ | -| `i` | The index of the token within the parent document. ~~int~~ | -| `ent_type` | Named entity type. ~~int~~ | -| `ent_type_` | Named entity type. ~~str~~ | -| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ | -| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ | -| `ent_kb_id` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ | -| `ent_kb_id_` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ | -| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ | -| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ | -| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ | -| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ | -| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ | -| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~ | -| `lower` | Lowercase form of the token. ~~int~~ | -| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ | -| `shape` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | -| `shape_` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | -| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ | -| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ | -| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ | -| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. 
~~str~~ | -| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ | -| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ | -| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ | -| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ | -| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ | -| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ | -| `is_punct` | Is the token punctuation? ~~bool~~ | -| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ | -| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | -| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | -| `is_bracket` | Is the token a bracket? ~~bool~~ | -| `is_quote` | Is the token a quotation mark? ~~bool~~ | -| `is_currency` 2.0.8 | Is the token a currency symbol? ~~bool~~ | -| `like_url` | Does the token resemble a URL? ~~bool~~ | -| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ | -| `like_email` | Does the token resemble an email address? ~~bool~~ | -| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ | -| `is_stop` | Is the token part of a "stop list"? ~~bool~~ | -| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ | -| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | -| `tag` | Fine-grained part-of-speech. ~~int~~ | -| `tag_` | Fine-grained part-of-speech. ~~str~~ | -| `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ | -| `dep` | Syntactic dependency relation. ~~int~~ | -| `dep_` | Syntactic dependency relation. ~~str~~ | -| `lang` | Language of the parent document's vocabulary. ~~int~~ | -| `lang_` | Language of the parent document's vocabulary. ~~str~~ | -| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ | -| `idx` | The character offset of the token within the parent document. ~~int~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ | -| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | -| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | -| `cluster` | Brown cluster ID. ~~int~~ | -| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | +| Name | Description | +| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | The parent document. ~~Doc~~ | +| `lex` 3 | The underlying lexeme. ~~Lexeme~~ | +| `sent` 2.0.12 | The sentence span that this token is a part of. ~~Span~~ | +| `text` | Verbatim text content. ~~str~~ | +| `text_with_ws` | Text content, with trailing space character if present. 
~~str~~ | +| `whitespace_` | Trailing space character if present. ~~str~~ | +| `orth` | ID of the verbatim text content. ~~int~~ | +| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ | +| `vocab` | The vocab object of the parent `Doc`. ~~vocab~~ | +| `tensor` 2.1.7 | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | +| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ | +| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ | +| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ | +| `i` | The index of the token within the parent document. ~~int~~ | +| `ent_type` | Named entity type. ~~int~~ | +| `ent_type_` | Named entity type. ~~str~~ | +| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ | +| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ | +| `ent_kb_id` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ | +| `ent_kb_id_` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ | +| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ | +| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ | +| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ | +| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ | +| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ | +| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~ | +| `lower` | Lowercase form of the token. ~~int~~ | +| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ | +| `shape` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | +| `shape_` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | +| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ | +| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ | +| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ | +| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ | +| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ | +| `is_ascii` | Does the token consist of ASCII characters? 
Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ | +| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ | +| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ | +| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ | +| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ | +| `is_punct` | Is the token punctuation? ~~bool~~ | +| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ | +| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | +| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | +| `is_bracket` | Is the token a bracket? ~~bool~~ | +| `is_quote` | Is the token a quotation mark? ~~bool~~ | +| `is_currency` 2.0.8 | Is the token a currency symbol? ~~bool~~ | +| `like_url` | Does the token resemble a URL? ~~bool~~ | +| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ | +| `like_email` | Does the token resemble an email address? ~~bool~~ | +| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ | +| `is_stop` | Is the token part of a "stop list"? ~~bool~~ | +| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ | +| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | +| `tag` | Fine-grained part-of-speech. ~~int~~ | +| `tag_` | Fine-grained part-of-speech. ~~str~~ | +| `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ | +| `dep` | Syntactic dependency relation. ~~int~~ | +| `dep_` | Syntactic dependency relation. ~~str~~ | +| `lang` | Language of the parent document's vocabulary. ~~int~~ | +| `lang_` | Language of the parent document's vocabulary. ~~str~~ | +| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ | +| `idx` | The character offset of the token within the parent document. ~~int~~ | +| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ | +| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `cluster` | Brown cluster ID. ~~int~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 5958f2e57..8809c10bc 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -239,6 +239,7 @@ it. | `infix_finditer` | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) sequence of `re.MatchObject` objects. ~~Optional[Callable[[str], Iterator[Match]]]~~ | | `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ | | `rules` | A dictionary of tokenizer exceptions and special cases. 
~~Optional[Dict[str, List[Dict[int, str]]]]~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index cfaa75bff..8190d9f78 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -879,7 +879,7 @@ This method was previously available as `spacy.gold.offsets_from_biluo_tags`. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `doc` | The document that the BILUO tags refer to. ~~Doc~~ | -| `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | +| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ | ### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"} @@ -908,7 +908,7 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `doc` | The document that the BILUO tags refer to. ~~Doc~~ | -| `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | +| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ | ## Utility functions {#util source="spacy/util.py"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 4698529a1..569fcfbd4 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -175,7 +175,7 @@ by [`Language.initialize`](/api/language#initialize). > > ```python > trf = nlp.add_pipe("transformer") -> trf.initialize(lambda: [], nlp=nlp) +> trf.initialize(lambda: iter([]), nlp=nlp) > ``` | Name | Description | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index ba2d5ab42..598abe681 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -290,8 +290,8 @@ If a table is full, it can be resized using ## Vectors.n_keys {#n_keys tag="property"} Get the number of keys in the table. 
Note that this is the number of _all_ keys, -not just unique vectors. If several keys are mapped to the same -vectors, they will be counted individually. +not just unique vectors. If several keys are mapped to the same vectors, they +will be counted individually. > #### Example > @@ -321,7 +321,7 @@ performed in chunks to avoid consuming too much memory. You can set the > ``` | Name | Description | -| -------------- | --------------------------------------------------------------------------- | +| -------------- | --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | | `queries` | An array with one or more vectors. ~~numpy.ndarray~~ | | _keyword-only_ | | | `batch_size` | The batch size to use. Default to `1024`. ~~int~~ | diff --git a/website/docs/images/huggingface_hub.jpg b/website/docs/images/huggingface_hub.jpg new file mode 100644 index 000000000..5618df020 Binary files /dev/null and b/website/docs/images/huggingface_hub.jpg differ diff --git a/website/docs/images/prodigy_spans-manual.jpg b/website/docs/images/prodigy_spans-manual.jpg new file mode 100644 index 000000000..d67f347e0 Binary files /dev/null and b/website/docs/images/prodigy_spans-manual.jpg differ diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 65f444cd8..92d1b0172 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -27,28 +27,29 @@ of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name into three components: 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with - vocabulary, syntax, entities and word vectors, or `dep` for only vocab and - syntax). + tagging, parsing, lemmatization and named entity recognition, or `dep` for + only tagging, parsing and lemmatization). 2. **Genre:** Type of text the pipeline is trained on, e.g. `web` or `news`. -3. **Size:** Package size indicator, `sm`, `md` or `lg`. +3. **Size:** Package size indicator, `sm`, `md`, `lg` or `trf` (`sm`: no word + vectors, `md`: reduced word vector table with 20k unique vectors for ~500k + words, `lg`: large word vector table with ~500k entries, `trf`: transformer + pipeline without static word vectors) For example, [`en_core_web_sm`](/models/en#en_core_web_sm) is a small English pipeline trained on written web text (blogs, news, comments), that includes -vocabulary, vectors, syntax and entities. +vocabulary, syntax and entities. ### Package versioning {#model-versioning} Additionally, the pipeline package versioning reflects both the compatibility -with spaCy, as well as the major and minor version. A package version `a.b.c` -translates to: +with spaCy, as well as the model version. A package version `a.b.c` translates +to: - `a`: **spaCy major version**. For example, `2` for spaCy v2.x. -- `b`: **Package major version**. Pipelines with a different major version can't - be loaded by the same code. For example, changing the width of the model, - adding hidden layers or changing the activation changes the major version. -- `c`: **Package minor version**. Same pipeline structure, but different - parameter values, e.g. from being trained on different data, for different - numbers of iterations, etc. +- `b`: **spaCy minor version**. For example, `3` for spaCy v2.3.x. +- `c`: **Model version**. Different model config: e.g. 
from being trained on + different data, with different parameters, for different numbers of + iterations, with different vectors, etc. For a detailed compatibility overview, see the [`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json). @@ -96,9 +97,10 @@ In the `sm`/`md`/`lg` models: tagger. For English, the attribute ruler can improve its mapping from `token.tag` to `token.pos` if dependency parses from a `parser` are present, but the parser is not required. -- The rule-based `lemmatizer` (Dutch, English, French, Greek, Macedonian, - Norwegian and Spanish) requires `token.pos` annotation from either - `tagger`+`attribute_ruler` or `morphologizer`. +- The `lemmatizer` component for many languages (Catalan, Dutch, English, + French, Greek, Italian Macedonian, Norwegian, Polish and Spanish) requires + `token.pos` annotation from either `tagger`+`attribute_ruler` or + `morphologizer`. - The `ner` component is independent with its own internal tok2vec layer. ### Transformer pipeline design {#design-trf} @@ -107,8 +109,6 @@ In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present) all listen to the `transformer` component. The `attribute_ruler` and `lemmatizer` have the same configuration as in the CNN models. - - ### Modifying the default pipeline {#design-modify} For faster processing, you may only want to run a subset of the components in a @@ -130,12 +130,13 @@ nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmat nlp = spacy.load("en_core_web_trf", disable=["tagger", "attribute_ruler", "lemmatizer"]) ``` - + The lemmatizer depends on `tagger`+`attribute_ruler` or `morphologizer` for -Dutch, English, French, Greek, Macedonian, Norwegian and Spanish. If you disable -any of these components, you'll see lemmatizer warnings unless the lemmatizer is -also disabled. +Catalan, Dutch, English, French, Greek, Italian, Macedonian, Norwegian, Polish +and Spanish. If you disable any of these components, you'll see lemmatizer +warnings unless the lemmatizer is also disabled. @@ -184,6 +185,12 @@ nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_rule #### Move NER to the end of the pipeline + + +As of v3.1, the NER component is at the end of the pipeline by default. + + + For access to `POS` and `LEMMA` features in an `entity_ruler`, move `ner` to the end of the pipeline after `attribute_ruler` and `lemmatizer`: diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 4113e9394..9e3f140e4 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -678,7 +678,7 @@ The following defaults are used for the `[pretraining]` block and merged into your existing config when you run [`init config`](/api/cli#init-config) or [`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed, you can [configure](#pretraining-configure) the settings and hyperparameters or -change the [objective](#pretraining-details). +change the [objective](#pretraining-objectives). 
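+
+Once the defaults below have been merged into your config, you can also inspect
+the resulting `[pretraining]` block programmatically. A minimal sketch, assuming
+the filled config was saved as `config_pretrain.cfg` (the filename and the keys
+printed here are just an illustration of the default block):
+
+```python
+from spacy.util import load_config
+
+# Load the config that was filled in via `init fill-config --pretraining`
+config = load_config("config_pretrain.cfg")
+pretraining = config["pretraining"]
+# Which component and layer the pretrained weights will be computed for
+print(pretraining["component"], pretraining["layer"])
+# The objective block, which you can swap out or configure further
+print(pretraining["objective"])
+```
+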
```ini %%GITHUB_SPACY/spacy/default_config_pretraining.cfg @@ -732,7 +732,7 @@ component = "textcat" layer = "tok2vec" ``` -#### Pretraining objectives {#pretraining-details} +#### Pretraining objectives {#pretraining-objectives} > ```ini > ### Characters objective diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 8fe2cf489..17043d599 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -151,7 +151,7 @@ maxout_pieces = 3 depth = 2 [components.textcat.model.linear_model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false @@ -169,7 +169,7 @@ factory = "textcat" labels = [] [components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 5a1293c2e..f8f47ab53 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -512,7 +512,7 @@ nlp = spacy.load("en_core_web_sm", disable=["parser"]) spaCy features an extremely fast statistical entity recognition system, that assigns labels to contiguous spans of tokens. The default -[trained pipelines](/models) can indentify a variety of named and numeric +[trained pipelines](/models) can identify a variety of named and numeric entities, including companies, locations, organizations and products. You can add arbitrary classes to the entity recognition system, and update the model with new examples. @@ -550,7 +550,7 @@ on a token, it will return an empty string. > - `I` – Token is **inside** a multi-token entity. > - `L` – Token is the **last** token of a multi-token entity. > - `U` – Token is a single-token **unit** entity. -> - `O` – Toke is **outside** an entity. +> - `O` – Token is **outside** an entity. ```python ### {executable="true"} @@ -1169,7 +1169,20 @@ class WhitespaceTokenizer: def __call__(self, text): words = text.split(" ") - return Doc(self.vocab, words=words) + spaces = [True] * len(words) + # Avoid zero-length tokens + for i, word in enumerate(words): + if word == "": + words[i] = " " + spaces[i] = False + # Remove the final trailing space + if words[-1] == " ": + words = words[0:-1] + spaces = spaces[0:-1] + else: + spaces[-1] = False + + return Doc(self.vocab, words=words, spaces=spaces) nlp = spacy.blank("en") nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) @@ -1248,7 +1261,7 @@ hyperparameters, pipeline and tokenizer used for constructing and training the pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that takes the `nlp` object and returns a tokenizer. Here, we're registering a function called `whitespace_tokenizer` in the -[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to +[`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how to construct your tokenizer during training, you can pass in your Python file by setting `--code functions.py` when you run [`spacy train`](/api/cli#train). @@ -1485,7 +1498,7 @@ that time, the `Doc` will already be tokenized. This process of splitting a token requires more settings, because you need to specify the text of the individual tokens, optional per-token attributes and how -the should be attached to the existing syntax tree. 
This can be done by +the tokens should be attached to the existing syntax tree. This can be done by supplying a list of `heads` – either the token to attach the newly split token to, or a `(token, subtoken)` tuple if the newly split token should be attached to another subtoken. In this case, "New" should be attached to "York" (the diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index c661c8f15..d1c9a0a81 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -299,6 +299,17 @@ nlp = spacy.load("en_core_web_sm") doc = nlp("This is a sentence.") ``` +If you're in a **Jupyter notebook** or similar environment, you can use the `!` +prefix to +[execute commands](https://ipython.org/ipython-doc/3/interactive/tutorial.html#system-shell-commands). +Make sure to **restart your kernel** or runtime after installation (just like +you would when installing other Python packages) to make sure that the installed +pipeline package can be found. + +```cli +!python -m spacy download en_core_web_sm +``` + ### Installation via pip {#download-pip} To download a trained pipeline directly using @@ -354,6 +365,27 @@ pipeline data. You can place the **pipeline package directory** anywhere on your local file system. +### Installation from Python {#download-python} + +Since the [`spacy download`](/api/cli#download) command installs the pipeline as +a **Python package**, we always recommend running it from the command line, just +like you install other Python packages with `pip install`. However, if you need +to, or if you want to integrate the download process into another CLI command, +you can also import and call the `download` function used by the CLI via Python. + + + +Keep in mind that the `download` command installs a Python package into your +environment. In order for it to be found after installation, you will need to +**restart or reload** your Python process so that new packages are recognized. + + + +```python +import spacy +spacy.cli.download("en_core_web_sm") +``` + ### Using trained pipelines with spaCy {#usage} To load a pipeline package, use [`spacy.load`](/api/top-level#spacy.load) with @@ -367,7 +399,7 @@ the package name or a path to the data directory: > > ```diff > - python -m spacy download en -> + python -m spacy dowmload en_core_web_sm +> + python -m spacy download en_core_web_sm > ``` ```python @@ -382,7 +414,7 @@ doc = nlp("This is a sentence.") You can use the [`info`](/api/cli#info) command or [`spacy.info()`](/api/top-level#spacy.info) method to print a pipeline -packages's meta data before loading it. Each `Language` object with a loaded +package's meta data before loading it. Each `Language` object with a loaded pipeline also exposes the pipeline's meta data as the attribute `meta`. For example, `nlp.meta['version']` will return the package version. @@ -476,6 +508,5 @@ logic around spaCy's loader, you can use [pytest](http://pytest.readthedocs.io/en/latest/)'s [`importorskip()`](https://docs.pytest.org/en/latest/builtin.html#_pytest.outcomes.importorskip) method to only run a test if a specific pipeline package or version is -installed. Each pipeline package exposes a `__version__` attribute which -you can also use to perform your own version compatibility checks before loading -it. +installed. Each pipeline package exposes a `__version__` attribute which you can +also use to perform your own version compatibility checks before loading it. 
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index bde3ab84f..0109f24c9 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1105,10 +1105,10 @@ While you could use a registered function or a file loader like [`srsly.read_json.v1`](/api/top-level#file_readers) as an argument of the component factory, this approach is problematic: the component factory runs **every time the component is created**. This means it will run when creating -the `nlp` object before training, but also every a user loads your pipeline. So -your runtime pipeline would either depend on a local path on your file system, -or it's loaded twice: once when the component is created, and then again when -the data is by `from_disk`. +the `nlp` object before training, but also every time a user loads your +pipeline. So your runtime pipeline would either depend on a local path on your +file system, or it's loaded twice: once when the component is created, and then +again when the data is loaded back by `from_disk`. > ```ini > ### config.cfg @@ -1324,7 +1324,7 @@ labels = [] # This function is created and then passed to the "textcat" component as # the argument "model" [components.textcat.model] -@architectures = "spacy.TextCatBOW.v1" +@architectures = "spacy.TextCatBOW.v2" exclusive_classes = true ngram_size = 1 no_output_layer = false diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index fc191824a..cb71f361b 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -49,6 +49,7 @@ production. Serve your models and host APIs Distributed and parallel training Track your experiments and results +Upload your pipelines to the Hugging Face Hub ### 1. Clone a project template {#clone} @@ -800,7 +801,7 @@ vars: commands: - name: annotate - script: - - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}' + - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ${vars.prodigy.model} ./assets/raw_data.jsonl --labels ${vars.prodigy.labels}' - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}' - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy' - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy' @@ -1013,3 +1014,68 @@ creating variants of the config for a simple hyperparameter grid search and logging the results. + +--- + +### Hugging Face Hub {#huggingface_hub} + +The [Hugging Face Hub](https://huggingface.co/) lets you upload models and share +them with others. It hosts models as Git-based repositories which are storage +spaces that can contain all your files. It supports versioning, branches and +custom metadata out-of-the-box, and provides browser-based visualizers for +exploring your models interactively, as well as an API for production use. The +[`spacy-huggingface-hub`](https://github.com/explosion/spacy-huggingface-hub) +package automatically adds the `huggingface-hub` command to your `spacy` CLI if +it's installed. + +> #### Installation +> +> ```cli +> $ pip install spacy-huggingface-hub +> # Check that the CLI is registered +> $ python -m spacy huggingface-hub --help +> ``` + +You can then upload any pipeline packaged with +[`spacy package`](/api/cli#package). Make sure to set `--build wheel` to output +a binary `.whl` file.
The uploader will read all metadata from the pipeline +package, including the auto-generated pretty `README.md` and the model details +available in the `meta.json`. For examples, check out the +[spaCy pipelines](https://huggingface.co/spacy) we've uploaded. + +```cli +$ huggingface-cli login +$ python -m spacy package ./en_ner_fashion ./output --build wheel +$ cd ./output/en_ner_fashion-0.0.0/dist +$ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl +``` + +After uploading, you will see the live URL of your pipeline packages, as well as +the direct URL to the model wheel you can install via `pip install`. You'll also +be able to test your pipeline interactively from your browser: + +![Screenshot: interactive NER visualizer](../images/huggingface_hub.jpg) + +In your `project.yml`, you can add a command that uploads your trained and +packaged pipeline to the hub. You can either run this as a manual step, or +automatically as part of a workflow. Make sure to set `--build wheel` when +running `spacy package` to build a wheel file for your pipeline package. + + +```yaml +### project.yml +- name: "push_to_hub" + help: "Upload the trained model to the Hugging Face Hub" + script: + - "python -m spacy huggingface-hub push packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}-py3-none-any.whl" + deps: + - "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}-py3-none-any.whl" +``` + + + +Get started with uploading your models to the Hugging Face hub using our project +template. It trains a simple pipeline, packages it and uploads it if the +packaged model has changed. This makes it easy to deploy your models end-to-end. + + diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 22bf4f470..037850154 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -63,7 +63,7 @@ another token that's at least 10 characters long. spaCy features a rule-matching engine, the [`Matcher`](/api/matcher), that operates over tokens, similar to regular expressions. The rules can refer to -token annotations (e.g. the token `text` or `tag_`, and flags (e.g. `IS_PUNCT`). +token annotations (e.g. the token `text` or `tag_`, and flags like `IS_PUNCT`). The rule matcher also lets you pass in a custom callback to act on matches – for example, to merge entities and apply custom labels. You can also associate patterns with entity IDs, to allow some basic entity linking or disambiguation. @@ -1552,7 +1552,7 @@ doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` -An alternative approach would be to an +An alternative approach would be to use an [extension attribute](/usage/processing-pipelines/#custom-components-attributes) like `._.person_title` and add it to `Span` objects (which includes entity spans in `doc.ents`). The advantage here is that the entity text stays intact and can diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 1b345050c..17fac05e5 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -45,6 +45,14 @@ you generate a starter config with the **recommended settings** for your specific use case. It's also available in spaCy as the [`init config`](/api/cli#init-config) command. + + +Upgrade to the [latest version of spaCy](/usage) to use the quickstart widget. 
+For earlier releases, follow the CLI instructions to generate a compatible +config. + + + > #### Instructions: widget > > 1. Select your requirements and settings. @@ -635,7 +643,7 @@ The default score weights are defined by each pipeline component via the components are weighted equally. If a score weight is set to `null`, it will be excluded from the logs and the score won't be weighted. - + | Name | Description | | -------------------------- | ----------------------------------------------------------------------------------------------------------------------- | diff --git a/website/docs/usage/v3-1.md b/website/docs/usage/v3-1.md new file mode 100644 index 000000000..1bac8fd81 --- /dev/null +++ b/website/docs/usage/v3-1.md @@ -0,0 +1,320 @@ +--- +title: What's New in v3.1 +teaser: New features and how to upgrade +menu: + - ['New Features', 'features'] + - ['Upgrading Notes', 'upgrading'] +--- + +## New Features {#features hidden="true"} + +It's been great to see the adoption of the new spaCy v3, which introduced +[transformer-based](/usage/embeddings-transformers) pipelines, a new +[config and training system](/usage/training) for reproducible experiments, +[projects](/usage/projects) for end-to-end workflows, and many +[other features](/usage/v3). Version 3.1 adds more on top of it, including the +ability to use predicted annotations during training, a new `SpanCategorizer` +component for predicting arbitrary and potentially overlapping spans, support +for partial incorrect annotations in the entity recognizer, new trained +pipelines for Catalan and Danish, as well as many bug fixes and improvements. + +### Using predicted annotations during training {#predicted-annotations-training} + +By default, components are updated in isolation during training, which means +that they don't see the predictions of any earlier components in the pipeline. +The new +[`[training.annotating_components]`](/usage/training#annotating-components) +config setting lets you specify pipeline components that should set annotations +on the predicted docs during training. This makes it easy to use the predictions +of a previous component in the pipeline as features for a subsequent component, +e.g. the dependency labels in the tagger: + +```ini +### config.cfg (excerpt) {highlight="7,12"} +[nlp] +pipeline = ["parser", "tagger"] + +[components.tagger.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tagger.model.tok2vec.encode.width} +attrs = ["NORM","DEP"] +rows = [5000,2500] +include_static_vectors = false + +[training] +annotating_components = ["parser"] +``` + + + +This project shows how to use the `token.dep` attribute predicted by the parser +as a feature for a subsequent tagger component in the pipeline. + + + +### SpanCategorizer for predicting arbitrary and overlapping spans {#spancategorizer tag="experimental"} + +A common task in applied NLP is extracting spans of texts from documents, +including longer phrases or nested expressions. Named entity recognition isn't +the right tool for this problem, since an entity recognizer typically predicts +single token-based tags that are very sensitive to boundaries. This is effective +for proper nouns and self-contained expressions, but less useful for other types +of phrases or overlapping spans. The new +[`SpanCategorizer`](/api/spancategorizer) component and +[SpanCategorizer](/api/architectures#spancategorizer) architecture let you label +arbitrary and potentially overlapping spans of texts. 
A span categorizer +consists of two parts: a [suggester function](/api/spancategorizer#suggesters) +that proposes candidate spans, which may or may not overlap, and a labeler model +that predicts zero or more labels for each candidate. The predicted spans are +available via the [`Doc.spans`](/api/doc#spans) container. + + + +This project trains a span categorizer for Indonesian NER. + + + + + +[![Prodigy: example of the new manual spans UI](../images/prodigy_spans-manual.jpg)](https://support.prodi.gy/t/3861) + +The upcoming version of our annotation tool [Prodigy](https://prodi.gy) +(currently available as a [pre-release](https://support.prodi.gy/t/3861) for all +users) features a [new workflow and UI](https://support.prodi.gy/t/3861) for +annotating overlapping and nested spans. You can use it to create training data +for spaCy's `SpanCategorizer` component. + + + +### Update the entity recognizer with partial incorrect annotations {#negative-samples} + +> #### config.cfg (excerpt) +> +> ```ini +> [components.ner] +> factory = "ner" +> incorrect_spans_key = "incorrect_spans" +> moves = null +> update_with_oracle_cut_size = 100 +> ``` + +The [`EntityRecognizer`](/api/entityrecognizer) can now be updated with known +incorrect annotations, which lets you take advantage of partial and sparse data. +For example, you'll be able to use the information that certain spans of text +are definitely **not** `PERSON` entities, without having to provide the complete +gold-standard annotations for the given example. The incorrect span annotations +can be added via the [`Doc.spans`](/api/doc#spans) in the training data under +the key defined as [`incorrect_spans_key`](/api/entityrecognizer#init) in the +component config. + +```python +train_doc = nlp.make_doc("Barack Obama was born in Hawaii.") +# The doc.spans key can be defined in the config +train_doc.spans["incorrect_spans"] = [ + Span(doc, 0, 2, label="ORG"), + Span(doc, 5, 6, label="PRODUCT") +] +``` + + + +### New pipeline packages for Catalan and Danish {#pipeline-packages} + +spaCy v3.1 adds 5 new pipeline packages, including a new core family for Catalan +and a new transformer-based pipeline for Danish using the +[`danish-bert-botxo`](http://huggingface.co/Maltehb/danish-bert-botxo) weights. +See the [models directory](/models) for an overview of all available trained +pipelines and the [training guide](/usage/training) for details on how to train +your own. + +> Thanks to Carlos Rodríguez Penagos and the +> [Barcelona Supercomputing Center](https://temu.bsc.es/) for their +> contributions for Catalan and to Kenneth Enevoldsen for Danish. For additional +> Danish pipelines, check out [DaCy](https://github.com/KennethEnevoldsen/DaCy). + +| Package | Language | UPOS | Parser LAS |  NER F | +| ------------------------------------------------- | -------- | ---: | ---------: | -----: | +| [`ca_core_news_sm`](/models/ca#ca_core_news_sm) | Catalan | 98.2 | 87.4 | 79.8 | +| [`ca_core_news_md`](/models/ca#ca_core_news_md) | Catalan | 98.3 | 88.2 | 84.0 | +| [`ca_core_news_lg`](/models/ca#ca_core_news_lg) | Catalan | 98.5 | 88.4 | 84.2 | +| [`ca_core_news_trf`](/models/ca#ca_core_news_trf) | Catalan | 98.9 | 93.0 | 91.2 | +| [`da_core_news_trf`](/models/da#da_core_news_trf) | Danish | 98.0 | 85.0 | 82.9 | + +### Resizable text classification architectures {#resizable-textcat} + +Previously, the [`TextCategorizer`](/api/textcategorizer) architectures could +not be resized, meaning that you couldn't add new labels to an already trained +model. 
In spaCy v3.1, the [TextCatCNN](/api/architectures#TextCatCNN) and +[TextCatBOW](/api/architectures#TextCatBOW) architectures are now resizable, +while ensuring that the predictions for the old labels remain the same. + +### CLI command to assemble pipeline from config {#assemble} + +The [`spacy assemble`](/api/cli#assemble) command lets you assemble a pipeline +from a config file without additional training. It can be especially useful for +creating a blank pipeline with a custom tokenizer, rule-based components or word +vectors. + +```cli +$ python -m spacy assemble config.cfg ./output +``` + +### Pretty pipeline package READMEs {#package-readme} + +The [`spacy package`](/api/cli#package) command now auto-generates a pretty +`README.md` based on the pipeline information defined in the `meta.json`. This +includes a table with a general overview, as well as the label scheme and +accuracy figures, if available. For an example, see the +[model releases](https://github.com/explosion/spacy-models/releases). + +### Support for streaming large or infinite corpora {#streaming-corpora} + +> #### config.cfg (excerpt) +> +> ```ini +> [training] +> max_epochs = -1 +> ``` + +The training process now supports streaming large or infinite corpora +out-of-the-box, which can be controlled via the +[`[training.max_epochs]`](/api/data-formats#training) config setting. Setting it +to `-1` means that the train corpus should be streamed rather than loaded into +memory with no shuffling within the training loop. For details on how to +implement a custom corpus loader, e.g. to stream in data from a remote storage, +see the usage guide on +[custom data reading](/usage/training#custom-code-readers-batchers). + +When streaming a corpus, only the first 100 examples will be used for +[initialization](/usage/training#config-lifecycle). This is no problem if you're +training a component like the text classifier with data that specifies all +available labels in every example. If necessary, you can use the +[`init labels`](/api/cli#init-labels) command to pre-generate the labels for +your components using a representative sample so the model can be initialized +correctly before training. + +### New lemmatizers for Catalan and Italian {#pos-lemmatizers} + +The trained pipelines for [Catalan](/models/ca) and [Italian](/models/it) now +include lemmatizers that use the predicted part-of-speech tags as part of the +lookup lemmatization for higher lemmatization accuracy. If you're training your +own pipelines for these languages and you want to include a lemmatizer, make +sure you have the +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package +installed, which provides the relevant tables. + +### Upload your pipelines to the Hugging Face Hub {#huggingface-hub} + +The [Hugging Face Hub](https://huggingface.co/) lets you upload models and share +them with others, and it now supports spaCy pipelines out-of-the-box. The new +[`spacy-huggingface-hub`](https://github.com/explosion/spacy-huggingface-hub) +package automatically adds the `huggingface-hub` command to your `spacy` CLI. It +lets you upload any pipelines packaged with [`spacy package`](/api/cli#package) +and `--build wheel` and takes care of auto-generating all required meta +information. + +After uploading, you'll get a live URL for your model page that includes all +details, files and interactive visualizers, as well as a direct URL to the wheel +file that you can install via `pip install`. 
For examples, check out the +[spaCy pipelines](https://huggingface.co/spacy) we've uploaded. + +```cli +$ pip install spacy-huggingface-hub +$ huggingface-cli login +$ python -m spacy package ./en_ner_fashion ./output --build wheel +$ cd ./output/en_ner_fashion-0.0.0/dist +$ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl +``` + +You can also integrate the upload command into your +[project template](/usage/projects#huggingface_hub) to automatically upload your +packaged pipelines after training. + + + +Get started with uploading your models to the Hugging Face hub using our project +template. It trains a simple pipeline, packages it and uploads it if the +packaged model has changed. This makes it easy to deploy your models end-to-end. + + + +## Notes about upgrading from v3.0 {#upgrading} + +### Pipeline package version compatibility {#version-compat} + +> #### Using legacy implementations +> +> In spaCy v3, you'll still be able to load and reference legacy implementations +> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the +> components or architectures change and newer versions are available in the +> core library. + +When you're loading a pipeline package trained with spaCy v3.0, you will see a +warning telling you that the pipeline may be incompatible. This doesn't +necessarily have to be true, but we recommend running your pipelines against +your test suite or evaluation data to make sure there are no unexpected results. +If you're using one of the [trained pipelines](/models) we provide, you should +run [`spacy download`](/api/cli#download) to update to the latest version. To +see an overview of all installed packages and their compatibility, you can run +[`spacy validate`](/api/cli#validate). + +If you've trained your own custom pipeline and you've confirmed that it's still +working as expected, you can update the spaCy version requirements in the +[`meta.json`](/api/data-formats#meta): + +```diff +- "spacy_version": ">=3.0.0,<3.1.0", ++ "spacy_version": ">=3.0.0,<3.2.0", +``` + +### Updating v3.0 configs + +To update a config from spaCy v3.0 with the new v3.1 settings, run +[`init fill-config`](/api/cli#init-fill-config): + +```bash +python -m spacy init fill-config config-v3.0.cfg config-v3.1.cfg +``` + +In many cases (`spacy train`, `spacy.load()`), the new defaults will be filled +in automatically, but you'll need to fill in the new settings to run +[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data). + +### Sourcing pipeline components with vectors {#source-vectors} + +If you're sourcing a pipeline component that requires static vectors (for +example, a tagger or parser from an `md` or `lg` pretrained pipeline), be sure +to include the source model's vectors in the setting `[initialize.vectors]`. In +spaCy v3.0, a bug allowed vectors to be loaded implicitly through `source`, +however in v3.1 this setting must be provided explicitly as +`[initialize.vectors]`: + +```ini +### config.cfg (excerpt) +[components.ner] +source = "en_core_web_md" + +[initialize] +vectors = "en_core_web_md" +``` + + + +Each pipeline can only store one set of static vectors, so it's not possible to +assemble a pipeline with components that were trained on different static +vectors. + + + +[`spacy train`](/api/cli#train) and [`spacy assemble`](/api/cli#assemble) will +provide warnings if the source and target pipelines don't contain the same +vectors. 
If you are sourcing a rule-based component like an entity ruler or +lemmatizer that does not use the vectors as a model feature, then this warning +can be safely ignored. + +### Warnings {#warnings} + +Logger warnings have been converted to Python warnings. Use +[`warnings.filterwarnings`](https://docs.python.org/3/library/warnings.html#warnings.filterwarnings) +or the new helper method `spacy.errors.filter_warning(action, error_msg='')` to +manage warnings. diff --git a/website/meta/languages.json b/website/meta/languages.json index b605210c3..2ba117d53 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -269,7 +269,12 @@ }, { "code": "mk", - "name": "Macedonian" + "name": "Macedonian", + "models": [ + "mk_core_news_sm", + "mk_core_news_md", + "mk_core_news_lg" + ] }, { "code": "ml", diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index a7e87ff72..6fe09f052 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -9,7 +9,8 @@ { "text": "Models & Languages", "url": "/usage/models" }, { "text": "Facts & Figures", "url": "/usage/facts-figures" }, { "text": "spaCy 101", "url": "/usage/spacy-101" }, - { "text": "New in v3.0", "url": "/usage/v3" } + { "text": "New in v3.0", "url": "/usage/v3" }, + { "text": "New in v3.1", "url": "/usage/v3-1" } ] }, { @@ -94,6 +95,7 @@ { "text": "Morphologizer", "url": "/api/morphologizer" }, { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" }, { "text": "Sentencizer", "url": "/api/sentencizer" }, + { "text": "SpanCategorizer", "url": "/api/spancategorizer" }, { "text": "Tagger", "url": "/api/tagger" }, { "text": "TextCategorizer", "url": "/api/textcategorizer" }, { "text": "Tok2Vec", "url": "/api/tok2vec" }, @@ -135,9 +137,7 @@ }, { "label": "Legacy", - "items": [ - { "text": "Legacy functions", "url": "/api/legacy" } - ] + "items": [{ "text": "Legacy functions", "url": "/api/legacy" }] } ] } diff --git a/website/meta/universe.json b/website/meta/universe.json index a0183c15d..1f469c33b 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,12 +1,38 @@ { "resources": [ + { + "id": "nlpcloud", + "title": "NLPCloud.io", + "slogan": "Production-ready API for spaCy models in production", + "description": "A highly-available hosted API to easily deploy and use spaCy models in production. 
Supports NER, POS tagging, dependency parsing, and tokenization.", + "github": "nlpcloud", + "pip": "nlpcloud", + "code_example": [ + "import nlpcloud", + "", + "client = nlpcloud.Client('en_core_web_lg', '4eC39HqLyjWDarjtT1zdp7dc')", + "client.entities('John Doe is a Go Developer at Google')", + "# [{'end': 8, 'start': 0, 'text': 'John Doe', 'type': 'PERSON'}, {'end': 25, 'start': 13, 'text': 'Go Developer', 'type': 'POSITION'}, {'end': 35,'start': 30, 'text': 'Google', 'type': 'ORG'}]" + ], + "thumb": "https://avatars.githubusercontent.com/u/77671902", + "image": "https://nlpcloud.io/assets/images/logo.svg", + "code_language": "python", + "author": "NLPCloud.io", + "author_links": { + "github": "nlpcloud", + "twitter": "cloud_nlp", + "website": "https://nlpcloud.io" + }, + "category": ["apis", "nonpython", "standalone"], + "tags": ["api", "deploy", "production"] + }, { "id": "denomme", "title": "denomme : Multilingual Name Detector", "slogan": "Multilingual Name Detection", "description": "A SpaCy extension for Spans to extract multilingual names out of documents trained on XLM-roberta backbone", "github": "meghanabhange/denomme", - "pip": "denomme", + "pip": "denomme https://denomme.s3.us-east-2.amazonaws.com/xx_denomme-0.3.1/dist/xx_denomme-0.3.1.tar.gz", "code_example": [ "from spacy.lang.xx import MultiLanguage", "from denomme.name import person_name_component", @@ -16,20 +42,48 @@ "print(doc._.person_name)", "# ['Meghana S.R Bhange', 'Asha']" ], + "thumb": "https://i.ibb.co/jwGVWPZ/rainbow-bohemian-logo-removebg-preview.png", + "code_language": "python", "author": "Meghana Bhange", "author_links": { - "github": "meghanabhange", - "twitter": "_aspiringcat" + "github": "meghanabhange", + "twitter": "_aspiringcat" }, "category": ["standalone"], "tags": ["person-name-detection"] }, - { + { + "id": "eMFDscore", + "title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python", + "slogan": "Extended Moral Foundation Dictionary Scoring for Python", + "description": "eMFDscore is a library for the fast and flexible extraction of various moral information metrics from textual input data. eMFDscore is built on spaCy for faster execution and performs minimal preprocessing consisting of tokenization, syntactic dependency parsing, lower-casing, and stopword/punctuation/whitespace removal. eMFDscore lets users score documents with multiple Moral Foundations Dictionaries, provides various metrics for analyzing moral information, and extracts moral patient, agent, and attribute words related to entities.", + "github": "medianeuroscience/emfdscore", + "code_example": [ + "from emfdscore.scoring import score_docs", + "import pandas as pd", + "template_input = pd.read_csv('emfdscore/template_input.csv', header=None)", + "DICT_TYPE = 'emfd'", + "PROB_MAP = 'single'", + "SCORE_METHOD = 'bow'", + "OUT_METRICS = 'vice-virtue'", + "OUT_CSV_PATH = 'single-vv.csv'", + "df = score_docs(template_input,DICT_TYPE,PROB_MAP,SCORE_METHOD,OUT_METRICS,num_docs)" + ], + "code_language": "python", + "author": "Media Neuroscience Lab", + "author_links": { + "github": "medianeuroscience", + "twitter": "medianeuro" + }, + "category": ["research", "teaching"], + "tags": ["morality", "dictionary", "sentiment"] + }, + { "id": "skweak", "title": "skweak", "slogan": "Weak supervision for NLP", "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. 
Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.", - "github": "https://github.com/NorskRegnesentral/skweak", + "github": "NorskRegnesentral/skweak", "pip": "skweak", "code_example": [ "import spacy, re", @@ -240,7 +294,7 @@ "", "models = [\"en_core_web_sm\", \"en_core_web_md\"]", "default_text = \"Sundar Pichai is the CEO of Google.\"", - "spacy_streamlit.visualize(models, default_text))" + "spacy_streamlit.visualize(models, default_text)" ], "author": "Ines Montani", "author_links": { @@ -383,10 +437,10 @@ "thumb": "https://i.imgur.com/myhLjMJ.png", "code_example": [ "import stanza", - "from spacy_stanza import StanzaLanguage", + "import spacy_stanza", "", - "snlp = stanza.Pipeline(lang=\"en\")", - "nlp = StanzaLanguage(snlp)", + "stanza.download(\"en\")", + "nlp = spacy_stanza.load_pipeline(\"en\")", "", "doc = nlp(\"Barack Obama was born in Hawaii. He was elected president in 2008.\")", "for token in doc:", @@ -401,6 +455,32 @@ "website": "https://explosion.ai" } }, + { + "id": "spacy-udpipe", + "title": "spacy-udpipe", + "slogan": "Use the latest UDPipe models directly in spaCy", + "description": "This package wraps the fast and efficient UDPipe language-agnostic NLP pipeline (via its Python bindings), so you can use UDPipe pre-trained models as a spaCy pipeline for 50+ languages out-of-the-box. Inspired by spacy-stanza, this package offers slightly less accurate models that are in turn much faster.", + "github": "TakeLab/spacy-udpipe", + "pip": "spacy-udpipe", + "code_example": [ + "import spacy_udpipe", + "", + "spacy_udpipe.download(\"en\") # download English model", + "", + "text = \"Wikipedia is a free online encyclopedia, created and edited by volunteers around the world.\"", + "nlp = spacy_udpipe.load(\"en\")", + "", + "doc = nlp(text)", + "for token in doc:", + " print(token.text, token.lemma_, token.pos_, token.dep_)" + ], + "category": ["pipeline", "standalone", "models", "research"], + "author": "TakeLab", + "author_links": { + "github": "TakeLab", + "website": "https://takelab.fer.hr/" + } + }, { "id": "spacy-server", "title": "spaCy Server", @@ -483,7 +563,7 @@ "trainer = ListTrainer(chatbot)", "trainer.train([", "'Hi, can I help you?',", - "'Sure, I would like to book a flight to Iceland.", + "'Sure, I would like to book a flight to Iceland.',", "'Your flight has been booked.'", "])", "", @@ -533,7 +613,7 @@ "id": "spacymoji", "slogan": "Emoji handling and meta data as a spaCy pipeline component", "github": "ines/spacymoji", - "description": "spaCy v2.0 extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.", + "description": "spaCy extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. 
Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.", "pip": "spacymoji", "category": ["pipeline"], "tags": ["emoji", "unicode"], @@ -834,6 +914,31 @@ "category": ["pipeline"], "tags": ["lemmatizer", "danish"] }, + { + "id": "dacy", + "title": "DaCy", + "slogan": "An efficient Pipeline for Danish NLP", + "description": "DaCy is a Danish preprocessing pipeline trained in SpaCy. It has achieved State-of-the-Art performance on Named entity recognition, part-of-speech tagging and dependency parsing for Danish. This repository contains material for using the DaCy, reproducing the results and guides on usage of the package. Furthermore, it also contains a series of behavioural test for biases and robustness of Danish NLP pipelines.", + "github": "centre-for-humanities-computing/DaCy", + "pip": "dacy", + "code_example": [ + "import dacy", + "print(dacy.models()) # get a list of dacy models", + "nlp = dacy.load('medium') # load your spacy pipeline", + "", + "# DaCy also includes functionality for adding other Danish models to the pipeline", + "# For instance you can add the BertTone model for classification of sentiment polarity to the pipeline:", + "nlp = add_berttone_polarity(nlp)" + ], + "thumb": "https://github.com/centre-for-humanities-computing/DaCy/blob/main/img/icon_no_title.png?raw=true", + "author": "Centre for Humanities Computing Aarhus", + "author_links": { + "github": "centre-for-humanities-computing", + "website": "https://chcaa.io/#/" + }, + "category": ["pipeline"], + "tags": ["pipeline", "danish"] + }, { "id": "wmd-relax", "slogan": "Calculates word mover's distance insanely fast", @@ -1275,6 +1380,35 @@ }, "category": ["nonpython"] }, + { + "id": "ruby-spacy", + "title": "ruby-spacy", + "slogan": "Wrapper module for using spaCy from Ruby via PyCall", + "description": "ruby-spacy is a wrapper module for using spaCy from the Ruby programming language via PyCall. This module aims to make it easy and natural for Ruby programmers to use spaCy.", + "github": "yohasebe/ruby-spacy", + "code_example": [ + "require \"ruby-spacy\"", + "require \"terminal-table\"", + "nlp = Spacy::Language.new(\"en_core_web_sm\")", + "doc = nlp.read(\"Apple is looking at buying U.K. 
startup for $1 billion\")", + "headings = [\"text\", \"lemma\", \"pos\", \"tag\", \"dep\"]", + "rows = []", + "doc.each do |token|", + " rows << [token.text, token.lemma, token.pos, token.tag, token.dep]", + "end", + "table = Terminal::Table.new rows: rows, headings: headings", + "puts table" + ], + "code_language": "ruby", + "url": "https://rubygems.org/gems/ruby-spacy", + "author": "Yoichiro Hasebe", + "author_links": { + "github": "yohasebe", + "twitter": "yohasebe" + }, + "category": ["nonpython"], + "tags": ["ruby"] + }, { "id": "spacy_api", "slogan": "Server/client to load models in a separate, dedicated process", @@ -2026,14 +2160,17 @@ "description": "`spacy-wordnet` creates annotations that easily allow the use of WordNet and [WordNet Domains](http://wndomains.fbk.eu/) by using the [NLTK WordNet interface](http://www.nltk.org/howto/wordnet.html)", "github": "recognai/spacy-wordnet", "tags": ["wordnet", "synsets"], - "thumb": "https://i.imgur.com/3y2uPUv.jpg", + "thumb": "https://i.imgur.com/ud4C7cj.png", "code_example": [ "import spacy", "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ", "", "# Load an spacy model (supported models are \"es\" and \"en\") ", "nlp = spacy.load('en')", - "nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')", + "# Spacy 3.x", + "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})", + "# Spacy 2.x", + "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')", "token = nlp('prices')[0]", "", "# wordnet object link spacy token with nltk wordnet interface by giving acces to", @@ -2373,8 +2510,7 @@ "from negspacy.negation import Negex", "", "nlp = spacy.load(\"en_core_web_sm\")", - "negex = Negex(nlp, ent_types=[\"PERSON','ORG\"])", - "nlp.add_pipe(negex, last=True)", + "nlp.add_pipe(\"negex\", config={\"ent_types\":[\"PERSON\",\"ORG\"]})", "", "doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")", "for e in doc.ents:", @@ -2716,10 +2852,10 @@ "pip": "pyate", "code_example": [ "import spacy", - "from pyate.term_extraction_pipeline import TermExtractionPipeline", + "import pyate", "", "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe(TermExtractionPipeline())", + "nlp.add_pipe(\"combo_basic\") # or any of `basic`, `weirdness`, `term_extractor` or `cvalue`", "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. 
This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", "", @@ -3030,6 +3166,85 @@ }, "category": ["research", "standalone", "scientific"], "tags": ["Text Analytics", "Coherence", "Cohesion"] + }, + { + "id": "hmrb", + "title": "Hammurabi", + "slogan": "Python Rule Processing Engine 🏺", + "description": "Hammurabi works as a rule engine to parse input using a defined set of rules. It uses a simple and readable syntax to define complex rules to handle phrase matching. The syntax supports nested logical statements, regular expressions, reusable or side-loaded variables and match triggered callback functions to modularize your rules. The latest version works with both spaCy 2.X and 3.X. For more information check the documentation on [ReadTheDocs](https://hmrb.readthedocs.io/en/latest/).", + "github": "babylonhealth/hmrb", + "pip": "hmrb", + "code_example": [ + "import spacy # __version__ 3.0+", + "from hmrb.core import SpacyCore", + "", + "grammar = \"\"\"", + "Var is_hurting:", + "(", + " optional (lemma: \"be\")", + " (lemma: \"hurt\")", + ")", + "Law:", + " - package: \"headache\"", + " - callback: \"mark_headache\"", + "(", + " (lemma: \"head\", pos: \"NOUN\")", + " $is_hurting", + ")\"\"\"", + "", + "conf = {", + " \"rules\": grammar", + " \"callbacks\": {", + " \"mark_headache\": \"callbacks.headache_handler\",", + " },", + " \"map_doc\": \"augmenters.jsonify_span\",", + " \"sort_length\": True,", + "}", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"hammurabi\", config=conf)", + "nlp(sentences)" + ], + "code_language": "python", + "thumb": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", + "image": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", + "author": "Kristian Boda", + "author_links": { + "github": "bodak", + "twitter": "bodak", + "website": "https://github.com/babylonhealth/" + }, + "category": ["pipeline", "standalone", "scientific", "biomedical"], + "tags": ["babylonhealth", "rule-engine", "matcher"] + }, + { + "id": "forte", + "title": "Forte", + "slogan": "Forte is a toolkit for building Natural Language Processing pipelines, featuring cross-task interaction, adaptable data-model interfaces and composable pipelines.", + "description": "Forte provides a platform to assemble state-of-the-art NLP and ML technologies in a highly-composable fashion, including a wide spectrum of tasks ranging from Information Retrieval, Natural Language Understanding to Natural Language Generation.", + "github": "asyml/forte", + "pip": "forte.spacy torch", + "code_example": [ + "from forte.spacy import SpacyProcessor", + "from forte import Pipeline", + "from forte.data.readers import StringReader", + "", + "pipeline = Pipeline()", + "pipeline.set_reader(StringReader())", + "pipeline.add(SpacyProcessor())", + "pipeline.run('Running SpaCy with Forte!')" + ], + "code_language": "python", + "url": "https://medium.com/casl-project/forte-building-modular-and-re-purposable-nlp-pipelines-cf5b5c5abbe9", + "thumb": "https://raw.githubusercontent.com/asyml/forte/master/docs/_static/img/forte_graphic.png", + "image": "https://raw.githubusercontent.com/asyml/forte/master/docs/_static/img/logo_h.png", + 
"author": "Petuum", + "author_links": { + "twitter": "PetuumInc", + "github": "asyml", + "website": "https://petuum.com" + }, + "category": ["pipeline", "standalone"], + "tags": ["pipeline"] } ], diff --git a/website/src/components/code.js b/website/src/components/code.js index 4dd7a8eb8..6e9f0c22e 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -14,7 +14,7 @@ import GitHubCode from './github' import classes from '../styles/code.module.sass' const WRAP_THRESHOLD = 30 -const CLI_GROUPS = ['init', 'debug', 'project', 'ray'] +const CLI_GROUPS = ['init', 'debug', 'project', 'ray', 'huggingface-hub'] export default props => (
diff --git a/website/src/components/embed.js b/website/src/components/embed.js
index dc25ae079..8d82bfaae 100644
--- a/website/src/components/embed.js
+++ b/website/src/components/embed.js
@@ -13,7 +13,7 @@ const YouTube = ({ id, ratio = '16x9', className }) => {
         [classes.ratio16x9]: ratio === '16x9',
         [classes.ratio4x3]: ratio === '4x3',
     })
-    const url = `https://www.youtube.com/embed/${id}`
+    const url = `https://www.youtube-nocookie.com/embed/${id}`
     return (